# metrics_exploitation.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Tool to generate reference cloud masks for validation of operational cloud masks.
The elaboration is performed using an active learning procedure.

The code was written by Louis Baetens during a training period at CESBIO, funded by CNES, under the direction of O.Hagolle

==================== Copyright
Software (metrics_exploitation.py)

Copyright© 2019 Centre National d’Etudes Spatiales

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 3
as published by the Free Software Foundation.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this program.  If not, see
https://www.gnu.org/licenses/gpl-3.0.fr.html
"""
import json
import csv
import os.path as op
from pprint import pprint
import numpy as np
import os
import shutil
import matplotlib.pyplot as plt
import glob
import random
from matplotlib.lines import Line2D


def matrix_loading(confusion_matrix_path):
    ''' Loads the confusion matrix given its path, and returns the classes
    names along with the confusion matrix, made square.

    The file must start with two header lines of the form
    "<text>:<c1>,<c2>,..." giving first the reference classes (rows) then
    the produced classes (columns), followed by one comma-separated row of
    integer counts per reference class. Blank lines are ignored.

    As some classes can be missing in the referenced or produced data, the
    returned matrix is indexed, on both axes, by the sorted union of the
    two classes lists. Cells without any data are left to 0.

    Returns (classes_all, conf_mat): the list of classes (int) and a
    float numpy array of shape (len(classes_all), len(classes_all)), with
    the reference classes as rows and the produced classes as columns
    '''

    # loads the data; the context manager closes the file even on error.
    # Blank lines (e.g. a trailing newline) are dropped, as int('') fails
    with open(confusion_matrix_path, 'r') as datafile:
        lines = [line.strip() for line in datafile]
    lines = [line for line in lines if line]

    headers = lines[0:2]
    data = lines[2:]

    # Get the references classes
    classes_ref = [int(c) for c in headers[0].split(':')[1].split(',')]

    # Get the produced classes
    classes_produced = [int(c) for c in headers[1].split(':')[1].split(',')]

    print(classes_ref)
    print(classes_produced)

    # Get the data and shapes the array, while converting to int
    confusion_matrix_original = [[int(r) for r in row.split(',')] for row in data]

    # To make the confusion matrix square, i.e. if some classes are
    # missing in the referenced or produced data.
    # Sorted, so that the order does not depend on the set iteration order
    classes_all = sorted(set(classes_ref + classes_produced))
    nb_class = len(classes_all)

    print('Classes: {}'.format(classes_all))

    conf_mat = np.zeros((nb_class, nb_class), float)

    # Row/column of each class in the original (possibly rectangular) matrix
    ref_rows = {c: classes_ref.index(c) for c in set(classes_ref)}
    produced_cols = {c: classes_produced.index(c) for c in set(classes_produced)}

    for i, ref_class in enumerate(classes_all):
        for j, produced_class in enumerate(classes_all):
            # only the cells having both a reference row and a produced
            # column are filled, the others stay at 0
            if ref_class in ref_rows and produced_class in produced_cols:
                conf_mat[i][j] = confusion_matrix_original[
                    ref_rows[ref_class]][produced_cols[produced_class]]

    print('Confusion matrix:')
    pprint(conf_mat)

    return classes_all, conf_mat


def multiclass_stats(confusion_matrix_path):
    ''' Return the fraction of well and mis classified samples computed
    from the confusion matrix stored at the given path.

    The well classified samples are the ones on the diagonal of the
    matrix, the misclassified ones are all the others. Both fractions
    (between 0 and 1) are also printed as percentages.
    '''
    labels, matrix = matrix_loading(confusion_matrix_path)

    on_diagonal = 0
    off_diagonal = 0
    for idx, _ in enumerate(labels):
        row = matrix[idx]
        hits = row[idx]
        on_diagonal += hits
        # everything else on the row was assigned to another class
        off_diagonal += sum(row) - hits

    sample_count = float(on_diagonal + off_diagonal)
    well_classified = float(on_diagonal) / sample_count
    ill_classified = float(off_diagonal) / sample_count

    report = [('Mean of well classified samples on all classes: {:.2f} %', well_classified),
              ('Mean of misclassified samples on all classes: {:.2f} %', ill_classified)]
    for template, fraction in report:
        print(template.format(fraction*100))

    return well_classified, ill_classified


def get_binary_classes(global_parameters):
    ''' Split the masks classes declared in the global parameters into
    two groups, and return them as (clouds_classes, not_clouds_classes).

    The low clouds, high clouds and clouds shadows make the first group.
    The background, land, water and snow make the second one.
    Each class is converted to int.
    '''
    masks = global_parameters["masks"]

    cloudy_masks = ("low_clouds", "high_clouds", "clouds_shadows")
    clear_masks = ("background", "land", "water", "snow")

    clouds_classes = [int(masks[name]["class"]) for name in cloudy_masks]
    not_clouds_classes = [int(masks[name]["class"]) for name in clear_masks]

    return clouds_classes, not_clouds_classes


def multi_to_binary_confusion_matrix(global_parameters, confusion_matrix_path):
    ''' Collapse a multiclass confusion matrix into a binary
    (clouds / not clouds) one, from which metrics such as the accuracy
    or the F1 score can be computed.

    The rows of the multiclass matrix are the reference classes and the
    columns the produced ones. The clouds are the positive class. A cell
    whose row or column class is in neither group is ignored.

    Returns [[TP, FN], [FP, TN]], as floats
    '''
    cloudy, clear = get_binary_classes(global_parameters)
    labels, matrix = matrix_loading(confusion_matrix_path)

    true_pos, false_neg, false_pos, true_neg = [], [], [], []

    for row, ref_label in enumerate(labels):
        ref_is_cloud = ref_label in cloudy
        ref_is_clear = ref_label in clear
        for col, produced_label in enumerate(labels):
            produced_is_cloud = produced_label in cloudy
            produced_is_clear = produced_label in clear
            count = matrix[row][col]

            if ref_is_cloud and produced_is_cloud:
                true_pos.append(count)
            if ref_is_clear and produced_is_clear:
                true_neg.append(count)
            if ref_is_cloud and produced_is_clear:
                false_neg.append(count)
            if ref_is_clear and produced_is_cloud:
                false_pos.append(count)

    TN = float(sum(true_neg))
    TP = float(sum(true_pos))
    FP = float(sum(false_pos))
    FN = float(sum(false_neg))

    # NOTE: the printed layout is TP | FP over FN | TN, i.e. the
    # transpose of the returned matrix
    print('Binary confusion matrix:')
    print('{:10} | {:10}\n --------------------\n{:10} | {:10}'.format(TP, FP, FN, TN))

    return [[TP, FN], [FP, TN]]


def binary_stats(bin_conf_mat, all_stats=False):
    ''' Compute the metrics from a binary confusion matrix

    bin_conf_mat is [[TP, FN], [FP, TN]]. The counts are converted to
    float, so that integer counts do not trigger an integer division.

    A ratio whose denominator is null is set to 1 for the precision, the
    recall and the specificity. The F1 score is set to 0 when both the
    precision and the recall are null, instead of dividing by zero.

    Returns (accuracy, f1_score), or (accuracy, f1_score, precision,
    recall, specificity) if all_stats is set. All are between 0 and 1.

    Raises ZeroDivisionError if the matrix holds no sample at all, the
    accuracy being undefined
    '''

    TP = float(bin_conf_mat[0][0])  # true positive
    FN = float(bin_conf_mat[0][1])  # false negative
    FP = float(bin_conf_mat[1][0])  # false positive
    TN = float(bin_conf_mat[1][1])  # true negative

    accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = TP/(TP+FP) if TP+FP != 0 else 1.0
    recall = TP/(TP+FN) if TP+FN != 0 else 1.0
    specificity = TN/(TN+FP) if TN+FP != 0 else 1.0

    # no true positive while there are both false positives and false
    # negatives: precision and recall are null, and so is the F1 score
    if precision+recall == 0:
        f1_score = 0.0
    else:
        f1_score = 2*(precision*recall)/(precision+recall)

    print('Accuracy: {:.2f} %'.format(accuracy*100))
    print('F1 Score: {:.2f} %'.format(f1_score*100))

    print('(Precision: {:.2f} %)'.format(precision*100))
    print('(Recall: {:.2f} %)'.format(recall*100))
    print('(Specificity: {:.2f} %)'.format(specificity*100))

    if all_stats:
        return accuracy, f1_score, precision, recall, specificity

    return accuracy, f1_score


def get_model_metrics(global_parameters, confusion_matrix_path='', write2file=True):
    ''' Pack all the function to display the metrics of a model
    If a specific confusion matrix path is not given, it takes the
    current one, i.e. the one of the Statistics directory of the scene.

    If write2file is set, the binary confusion matrix and the model
    metrics (one "name,value" row each, in the order accuracy, f1_score,
    precision, recall, specificity) are written as csv files in the
    Statistics directory of the scene. This also works when a specific
    confusion matrix path is given.

    Returns (accuracy, f1_score)
    '''

    if confusion_matrix_path == '':
        confusion_matrix_path = op.join(
            global_parameters["user_choices"]["main_dir"], 'Statistics',
            global_parameters["postprocessing"]["confusion_matrix"])

    multiclass_stats(confusion_matrix_path)
    bin_conf_mat = multi_to_binary_confusion_matrix(global_parameters, confusion_matrix_path)
    accuracy, f1_score, precision, recall, specificity = binary_stats(bin_conf_mat, all_stats=True)

    if write2file:
        # The scene directory is read here and not only in the branch
        # above, as it would otherwise be undefined when a specific
        # confusion matrix path is given
        statistics_dir = op.join(global_parameters["user_choices"]["main_dir"], 'Statistics')
        bin_conf_csv = op.join(
            statistics_dir, global_parameters["postprocessing"]["binary_confusion_matrix"])
        model_metrics_csv = op.join(
            statistics_dir, global_parameters["postprocessing"]["model_metrics"])

        # write the binary confusion matrix
        with open(bin_conf_csv, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerows(bin_conf_mat)

        # write the model metrics
        with open(model_metrics_csv, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow(['accuracy', accuracy])
            writer.writerow(['f1_score', f1_score])
            writer.writerow(['precision', precision])
            writer.writerow(['recall', recall])
            writer.writerow(['specificity', specificity])

    return accuracy, f1_score


def save_model_metrics(global_parameters):
    '''
    Archive the model metrics of the current training and validation
    sets in a new Statistics/K_fold_<k> directory, k being the first
    index not used yet.

    Used for the K-fold cross validation for example
    '''
    main_dir = global_parameters["user_choices"]["main_dir"]
    statistics_dir = op.join(main_dir, 'Statistics')

    # look for the first free fold index, and create its directory
    fold_index = 0
    fold_dir = op.join(statistics_dir, 'K_fold_{}'.format(fold_index))
    while op.exists(fold_dir):
        fold_index += 1
        fold_dir = op.join(statistics_dir, 'K_fold_{}'.format(fold_index))
    os.makedirs(fold_dir)
    print('K_fold_{} directory created'.format(fold_index))

    # statistics files: copied only if they exist
    stat_keys = ("confusion_matrix", "binary_confusion_matrix", "model_metrics")
    stat_files = [global_parameters["postprocessing"][key] for key in stat_keys]
    stat_files.append(global_parameters["general"]["class_stats"])

    for stat_file in stat_files:
        origin = op.join(statistics_dir, stat_file)
        if op.exists(origin):
            shutil.copyfile(origin, op.join(fold_dir, stat_file))

    # samples files: every file sharing the name of the validation or
    # training file, whatever its extension, except the '_ext' ones
    samples_dir = op.join(main_dir, 'Intermediate')
    sample_names = [global_parameters["general"]["validation_shp"],
                    global_parameters["general"]["training_shp"]]
    sample_paths = [op.join(samples_dir, name) for name in sample_names]

    for sample_path in sample_paths:
        stem = op.splitext(sample_path)[0]
        for candidate in glob.glob(stem + '*'):
            candidate_name = op.basename(candidate)
            if '_ext' in candidate_name:
                continue
            shutil.copy(candidate, op.join(fold_dir, candidate_name))


def retrieve_Kfold_data(global_parameters, metrics_plotting=False, location='', date=''):
    '''
    After having run the model K times, this function is used to do some
    stats on all the runs

    The scene directory is the one of global_parameters, unless both
    location and date are given: it is then the first directory named
    <location>_*_<date> in the ALCD data directory declared in
    ../paths_configuration.json.

    Each Statistics/K_fold_<k> directory (k = 0, 1, ... without gap) must
    hold the model metrics csv, made of "name,value" rows. The mean and
    the standard deviation of each metric over the folds are saved in
    Statistics/k_fold_summary.json. If metrics_plotting is set, the
    accuracy, F1-score, recall and precision are also plotted on the
    current matplotlib figure, saved as Statistics/kfold_metrics.png
    (the figure is not closed).

    Returns (means, stds, all_metrics): two arrays with one value per
    metric, and the list of the metrics values of each fold.

    Raises IndexError if no scene directory matches location and date,
    and ValueError if there is no K_fold_0 directory or if a metric to
    plot is missing from the csv
    '''
    if location != '' and date != '':
        with open(op.join('..', 'paths_configuration.json')) as config_file:
            paths_configuration = json.load(config_file)
        Data_ALCD_dir = paths_configuration["data_paths"]["data_alcd"]
        scene_pattern = op.join(Data_ALCD_dir, '{}_*_{}'.format(location, date))
        scene_dirs = glob.glob(scene_pattern)
        if not scene_dirs:
            raise IndexError('No scene directory matching {}'.format(scene_pattern))
        main_dir = scene_dirs[0]

    else:
        main_dir = global_parameters["user_choices"]["main_dir"]
        location = global_parameters["user_choices"]["location"]
        date = global_parameters["user_choices"]["current_date"]
    statistics_dir = op.join(main_dir, 'Statistics')
    model_metrics_csv_basename = global_parameters["postprocessing"]["model_metrics"]

    model_metrics_iter_files = []
    k = 0
    while op.exists(op.join(statistics_dir, 'K_fold_{}'.format(k))):
        model_k_file = op.join(statistics_dir, 'K_fold_{}'.format(k), model_metrics_csv_basename)
        model_metrics_iter_files.append(model_k_file)
        k += 1

    if not model_metrics_iter_files:
        raise ValueError('No K_fold directory found in {}'.format(statistics_dir))

    all_metrics = []
    metrics_names = []
    for k_file in model_metrics_iter_files:
        current_metrics = []
        metrics_names = []
        with open(k_file, 'r') as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                # blank lines are read as empty rows
                if not row:
                    continue
                metrics_names.append(row[0])
                current_metrics.append(float(row[1]))
        all_metrics.append(current_metrics)

    # Compute the mean and standard deviation for each metric
    means = np.mean(all_metrics, axis=0)
    stds = np.std(all_metrics, axis=0)

    # save the stats
    out_json = op.join(statistics_dir, 'k_fold_summary.json')
    data = {}
    data["means"] = means.tolist()
    data["stds"] = stds.tolist()
    data["metrics_names"] = metrics_names
    data["all_metrics"] = all_metrics
    data["K"] = len(all_metrics)

    with open(out_json, "w+") as json_file:
        json_file.write(json.dumps(data, indent=3, sort_keys=True))

    if metrics_plotting:
        # The columns are picked by name: in the csv, the precision comes
        # before the recall, so picking the recall at position 2 and the
        # precision at position 3 would swap them on the plot
        plotted = [('accuracy', 'Accuracy'), ('f1_score', 'F1-score'),
                   ('recall', 'Recall'), ('precision', 'Precision')]
        columns = [metrics_names.index(name) for name, _ in plotted]
        indices = list(range(len(plotted)))
        nb_folds = len(all_metrics)

        plt.errorbar(indices, [means[c] for c in columns], [stds[c] for c in columns],
                     linestyle='', marker='o', color='b')

        for position, column in zip(indices, columns):
            # spread the points of the folds around the metric position
            spread = [(position - 0.1 + 0.2*(float(j)/nb_folds))
                      for j in range(nb_folds)]
            plt.scatter(spread, [m[column] for m in all_metrics],
                        color='k', marker='.', alpha=0.2)

        plt.ylim(0.5, 1)
        ticks_labels = ['{}\n{:.1f}%'.format(label, means[column]*100)
                        for (_, label), column in zip(plotted, columns)]
        plt.xticks(indices, ticks_labels)

        plt.title(
            'Metrics of a {}-fold random cross-validation\n{}, {}'.format(nb_folds, location, date))
        plt.xlabel('Score type')
        plt.ylabel('Scores')

        # Custom legend
        custom_lines = []
        custom_lines.append(Line2D([0], [0], color='w', markerfacecolor='k', marker='.', alpha=0.2))
        custom_lines.append(Line2D([0], [0], color='w', markerfacecolor='b', marker='o', alpha=1))
        legend_labels = ['Single fold point', 'Mean and std of all folds']
        plt.legend(custom_lines, legend_labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

        out_fig = op.join(statistics_dir, 'kfold_metrics.png')
        plt.savefig(out_fig, bbox_inches='tight')

    return means, stds, all_metrics


def load_previous_global_parameters(location, date):
    ''' Load the global parameters which were used for a given scene.

    The scene directory is the first one named <location>_*_<date> in the
    ALCD data directory declared in ../paths_configuration.json. The
    parameters are read from its In_data/used_global_parameters.json file.

    Returns the parsed content of this json file.
    Raises IndexError if no scene directory matches location and date
    '''
    with open(op.join('..', 'paths_configuration.json')) as config_file:
        paths_configuration = json.load(config_file)
    Data_ALCD_dir = paths_configuration["data_paths"]["data_alcd"]

    scene_pattern = op.join(Data_ALCD_dir, '{}_*_{}'.format(location, date))
    scene_dirs = glob.glob(scene_pattern)
    if not scene_dirs:
        raise IndexError('No scene directory matching {}'.format(scene_pattern))
    main_dir_path = scene_dirs[0]

    glo_param_path = op.join(main_dir_path, 'In_data', 'used_global_parameters.json')
    with open(glo_param_path) as param_file:
        global_parameters = json.load(param_file)

    return global_parameters


def get_all_locations_dates(csv_file):
    ''' Read the scenes listed in a csv file.

    The first row is a header and is skipped. For each following row,
    the location is read from the 2nd column, the clear date from the
    3rd one and the cloudy date from the 4th one. Blank lines are ignored.

    Returns (locations, clear_dates, cloudy_dates), three lists of
    strings of the same length
    '''
    locations = []
    clear_dates = []
    cloudy_dates = []
    # Text mode, as for the other csv files read in this module: the csv
    # reader of Python 3 rejects the files opened in binary mode
    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the headers
        for row in reader:
            if not row:
                continue
            locations.append(row[1])
            clear_dates.append(row[2])
            cloudy_dates.append(row[3])
    return locations, clear_dates, cloudy_dates


def plot_statistics_all_sites(csv_file=None):
    ''' Plot the metrics of the K-fold cross-validations of all the
    scenes listed in a csv file, along with their mean and standard
    deviation, and save the figure in tmp_report/kfold_synthese.png.

    csv_file is the path of the scenes list (see get_all_locations_dates,
    the cloudy dates being used). If not given, the historical hard-coded
    path is used.

    The scenes whose metrics cannot be retrieved are reported and
    skipped. The scenes with at least one fold accuracy under 0.9 are
    listed at the end.
    '''
    with open(op.join('parameters_files', 'global_parameters.json')) as params_file:
        global_parameters = json.load(params_file)
    if csv_file is None:
        csv_file = op.join('/mnt/data/home/baetensl/clouds_detection_git/Various_data',
                           'all_sites_dates.csv')
    locations, _, dates = get_all_locations_dates(csv_file)

    all_metrics = []
    low_accuracies_scenes = []

    for location, date in zip(locations, dates):
        try:
            _, _, temp_metrics = retrieve_Kfold_data(
                global_parameters, metrics_plotting=False, location=location, date=date)
            all_metrics.extend(temp_metrics)
            accuracies_tmp = [t[0] for t in temp_metrics]

            if any(a < 0.9 for a in accuracies_tmp):
                low_accuracies_scenes.append((location + date))
            print('{}_{}, min : {}'.format(location, date, np.min(accuracies_tmp)))

        except Exception as err:
            # best effort: a failing scene must not stop the others, but
            # the cause is reported instead of being silently dropped
            print('Error on {}, {}: {}'.format(location, date, err))

    print(len(all_metrics))

    if not all_metrics:
        print('No metrics could be retrieved, nothing to plot')
        return

    means = np.mean(all_metrics, axis=0)
    stds = np.std(all_metrics, axis=0)
    print('Means')
    print(means)
    print('Standard deviations')
    print(stds)

    plt.figure()
    indices = [0, 1, 2, 3]
    # Columns of the metrics shown as Accuracy, F1-score, Recall and
    # Precision. get_model_metrics writes the precision (column 2) before
    # the recall (column 3), hence the 3, 2 order
    columns = [0, 1, 3, 2]
    labels = ['Accuracy', 'F1-score', 'Recall', 'Precision']
    nb_points = len(all_metrics)

    for position, column in zip(indices, columns):
        # spread the points around the metric position
        spread = [(position - 0.1 + 0.2*(float(k)/nb_points))
                  for k in range(nb_points)]
        plt.scatter(spread, [m[column] for m in all_metrics],
                    color='k', marker='.', alpha=0.2)
    plt.errorbar(indices, [means[c] for c in columns], [stds[c] for c in columns],
                 linestyle='', marker='o', lw=2, elinewidth=2, capsize=8, capthick=1, color='b')
    plt.ylim(0.5, 1)

    metrics_names = ['{}\n{:.1f}%'.format(label, means[column]*100)
                     for label, column in zip(labels, columns)]
    plt.xticks(indices, metrics_names)

    plt.title('Metrics of a 10-fold cross-validation \n on {:.0f} scenes'.format(len(locations)))
    plt.xlabel('Score type')
    plt.ylabel('Scores')

    # Custom legend
    custom_lines = []
    custom_lines.append(Line2D([0], [0], color='w', markerfacecolor='k', marker='.', alpha=0.2))
    custom_lines.append(Line2D([0], [0], color='w', markerfacecolor='b', marker='o', alpha=1))
    legend_labels = ['Single validation point', 'Mean and std of all sites']
    plt.legend(custom_lines, legend_labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    plt.show(block=False)

    out_fig = op.join('tmp_report', 'kfold_synthese.png')
    print('Figure saved in {}'.format(out_fig))
    plt.savefig(out_fig, bbox_inches='tight')
    plt.close()
    print('Scenes with low accuracies:')
    print(low_accuracies_scenes)


def plot_mean_statistics_all_sites(plot_both=True, csv_file=None):
    ''' Plot, for each scene listed in a csv file, the mean and standard
    deviation over the folds of the accuracy of its K-fold
    cross-validation, and save the figure in
    tmp_report/kfold_synthese_mean.png.

    If plot_both is set, the F1-score is plotted next to the accuracy.

    csv_file is the path of the scenes list (see get_all_locations_dates,
    the cloudy dates being used). If not given, the historical hard-coded
    path is used.

    The scenes whose metrics cannot be retrieved are reported and skipped
    '''
    with open(op.join('parameters_files', 'global_parameters.json')) as params_file:
        global_parameters = json.load(params_file)
    if csv_file is None:
        csv_file = op.join('/mnt/data/home/baetensl/clouds_detection_git/Tools', 'all_scenes.csv')
    locations, _, dates = get_all_locations_dates(csv_file)

    accuracies = []
    f1scores = []
    scenes_names = []

    for location, date in zip(locations, dates):
        try:
            _, _, temp_metrics = retrieve_Kfold_data(
                global_parameters, metrics_plotting=False, location=location, date=date)

            # column 0 is the accuracy and column 1 the F1-score
            scene_accuracies = [m[0] for m in temp_metrics]
            scene_f1scores = [m[1] for m in temp_metrics]

        except Exception as err:
            # best effort: a failing scene must not stop the others, but
            # the cause is reported instead of being silently dropped
            print('Error on {}, {}: {}'.format(location, date, err))
            continue

        accuracies.append(scene_accuracies)
        f1scores.append(scene_f1scores)
        location_txt = location.replace('Alta_Floresta_Brazil', 'AltaFloresta')
        scenes_names.append('{}_{}'.format(location_txt, date))

    if not accuracies:
        print('No metrics could be retrieved, nothing to plot')
        return

    acc_means = [np.mean(a) for a in accuracies]
    acc_stds = [np.std(a) for a in accuracies]
    f1_means = [np.mean(f) for f in f1scores]
    f1_stds = [np.std(f) for f in f1scores]

    # the accuracy and the F1-score are drawn on each side of the tick
    x_positions = list(range(len(acc_means)))
    acc_positions = [x-0.1 for x in x_positions]
    f1_positions = [x+0.1 for x in x_positions]

    plt.figure()
    if plot_both:
        plt.errorbar(acc_positions, acc_means, acc_stds, linestyle='',
                     marker='o', lw=1, elinewidth=1, capsize=3, capthick=1, color='b')
        plt.errorbar(f1_positions, f1_means, f1_stds, linestyle='',
                     marker='o', lw=1, elinewidth=1, capsize=3, capthick=1, color='g')

        # Custom legend
        custom_lines = []
        custom_lines.append(Line2D([0], [0], color='w', markerfacecolor='b', marker='o', alpha=1))
        custom_lines.append(Line2D([0], [0], color='w', markerfacecolor='g', marker='o', alpha=1))
        legend_labels = ['Accuracy', 'F1-score']
        plt.legend(custom_lines, legend_labels, loc=0)

    else:
        plt.errorbar(x_positions, acc_means, acc_stds, linestyle='',
                     marker='o', lw=1, elinewidth=1, capsize=3, capthick=1, color='b')

        plt.title(
            'Mean accuracies of a 10-fold cross-validation \n on {:.0f} scenes'.format(len(acc_means)))

    plt.xticks(x_positions, scenes_names, rotation='vertical')
    plt.ylabel('Scores')

    plt.ylim(0.7, 1.0)
    plt.xlim(min(x_positions)-0.4, max(f1_positions)+0.4)

    plt.show(block=False)

    out_fig = op.join('tmp_report', 'kfold_synthese_mean.png')
    plt.savefig(out_fig, bbox_inches='tight')
    plt.close()
    print('Figure saved in {}'.format(out_fig))

    # the printed std is the mean, over the scenes, of the per-scene std
    print('Accuracy: mean = {:.5f}, std = {:.5f}'.format(np.mean(acc_means), np.mean(acc_stds)))
    print('F1-score: mean = {:.5f}, std = {:.5f}'.format(np.mean(f1_means), np.mean(f1_stds)))

    return


def main():
    ''' Plot the mean accuracy and F1-score of the K-fold
    cross-validation of all the scenes.
    '''
    # The global parameters are not loaded here: they were not used, and
    # plot_mean_statistics_all_sites loads the same file by itself.
    # To get the stats of the current scene only, load them from
    # parameters_files/global_parameters.json and call
    # retrieve_Kfold_data(global_parameters, metrics_plotting=True)
    plot_mean_statistics_all_sites(plot_both=True)


# Run main() only when the file is executed as a script, not when it is
# imported as a module
if __name__ == '__main__':
    main()