From 773a28577e4b0f081924e15c4cc866ac28591ab2 Mon Sep 17 00:00:00 2001 From: Simon <simon.leglaive@gmail.com> Date: Thu, 22 Feb 2024 10:19:01 +0100 Subject: [PATCH] metrics --- metrics.py | 731 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 731 insertions(+) create mode 100644 metrics.py diff --git a/metrics.py b/metrics.py new file mode 100644 index 0000000..37d1ec9 --- /dev/null +++ b/metrics.py @@ -0,0 +1,731 @@ +#MIT License +# +#Copyright (c) 2019 Sounds of New York City (SONYC) +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal in +#the Software without restriction, including without limitation the rights to +#use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +#of the Software, and to permit persons to whom the Software is furnished to do +#so, subject to the following conditions: +# +#The above copyright notice and this permission notice shall be included in all +#copies or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +#SOFTWARE. 
+ +import numpy as np +import oyaml as yaml +import pandas as pd +from sklearn.metrics import auc, confusion_matrix +import warnings + +""" +from https://github.com/sonyc-project/urban-sound-tagging-baseline/blob/master/urban-sound-tagging-baseline/metrics.py +""" + +def confusion_matrix_fine( + Y_true, Y_pred, is_true_incomplete, is_pred_incomplete): + """ + Counts overall numbers of true positives (TP), false positives (FP), + and false negatives (FN) in the predictions of a system, for a number K + of fine-level classes within a given coarse category, in a dataset of N + different samples. In addition to the K so-called "complete" tags (i.e. + with a determinate fine-level category as well as a determinate + coarse-level category), we consider the potential presence of an "incomplete" + tag, i.e. denoting the presence of a class with a determinate coarse-level + category yet no determinate fine-level category. This incomplete tag + may be present in either the prediction or the ground truth. + + Our method for evaluating a multilabel classifier on potentially incomplete + knowledge of the ground truth consists of two parts, which are ultimately + aggregated into a single count. + + For the samples with complete knowledge of the ground truth (Part I in the + code below), we simply apply classwise Boolean logic to compute TP, FP, and + FN independently for every fine-level tag, and finally aggregate across + all tags. + + However, for the samples with incomplete knowledge of the ground truth + (Part II in the code below), we perform a "coarsening" of the prediction by + applying a disjunction on the fine-level complete tags as well as the + coarse incomplete tag. If that coarsened prediction is positive, the sample + produces a true positive; otherwise, it produces a false negative. + + Samples which contain the incomplete tag in the prediction but not the + ground truth overlap Parts I and II. 
In this case, we sum the zero, one, + or multiple false alarm(s) from Part I with the one false alarm from Part II + to produce a final number of false positives FP. + + Parameters + ---------- + Y_true: array of bool, shape = [n_samples, n_classes] + One-hot encoding of true presence for complete fine tags. + Y_true[n, k] is equal to 1 if the class k is truly present in sample n, + and equal to 0 otherwise. + + Y_pred: array of bool, shape = [n_samples, n_classes] + One-hot encoding of predicted class presence for complete fine tags. + Y_pred[n, k] is equal to 1 if the class k is predicted as present in sample n, + and equal to 0 otherwise. + + is_true_incomplete: array of bool, shape = [n_samples] + One-hot encoding of true presence for the incomplete fine tag. + is_true_incomplete[n] is equal to 1 if an item truly belongs to the + coarse category at hand, but the fine-level tag of that item is + truly uncertain, or truly unlike any of the K available fine tags. + + is_pred_incomplete: array of bool, shape = [n_samples] + One-hot encoding of predicted presence for the incomplete fine tag. + is_pred_incomplete[n] is equal to 1 if the system predicts the existence of an + item that does belong to the coarse category at hand, yet the + fine-level tag of that item is uncertain or unlike any of the + K available fine tags. + + Returns + ------- + TP: int + Number of true positives. + + FP: int + Number of false positives. + + FN: int + Number of false negatives. + """ + + ## PART I. SAMPLES WITH COMPLETE GROUND TRUTH AND COMPLETE PREDICTION + # Negate the true_incomplete Boolean and replicate it K times, where + # K is the number of fine tags. + # For each sample and fine tag, this mask is equal to 0 if the + # ground truth contains the incomplete fine tag and 1 if the ground + # truth does not contain the incomplete fine tag. + # The result is a (N, K) matrix. 
+ Y_true_complete = np.tile(np.logical_not( + is_true_incomplete)[:, np.newaxis], (1, Y_pred.shape[1])) + + # Compute true positives for samples with complete ground truth. + # For each sample n and each complete tag k, is_TP_complete is equal to 1 + # if and only if the following two conditions are met: + # (i) the ground truth of sample n contains complete fine tag k + # (ii) the prediction of sample n contains complete fine tag k + # The result is a (N, K) matrix. + is_TP_complete = np.logical_and.reduce((Y_true, Y_pred)) + + # Compute false positives for samples with complete ground truth. + # For each sample n and each complete tag k, is_FP_complete is equal to 1 + # if and only if the following three conditions are met: + # (i) the ground truth of sample n is complete + # (ii) the ground truth of sample n does not contain complete fine tag k + # (iii) the prediction of sample n contains complete fine tag k + # The result is a (N, K) matrix. + is_FP_complete = np.logical_and.reduce( + (np.logical_not(Y_true), Y_pred, Y_true_complete)) + + # Compute false negatives for samples with complete ground truth. + # For each sample n and each complete tag k, is_FN_complete is equal to 1 + # if and only if the following two conditions are met: + # (i) the ground truth of sample n contains complete fine tag k + # (ii) the prediction of sample n does not contain complete fine tag k + # The result is a (N, K) matrix. + is_FN_complete = np.logical_and(Y_true, np.logical_not(Y_pred)) + + + ## PART II. SAMPLES WITH INCOMPLETE GROUND TRUTH OR INCOMPLETE PREDICTION. + # Compute a vector of "coarsened prediction". + # For each sample, the coarsened prediction is equal to 1 if any of the + # complete fine tags is predicted as present, or if the incomplete fine + # tag is predicted as present. Conversely, it is set equal to 0 if all + # of the complete fine tags are predicted as absent, and if the incomplete + # fine tags are predicted as absent. + # The result is a (N,) vector. 
+ y_pred_coarsened_without_incomplete = np.logical_or.reduce(Y_pred, axis=1) + y_pred_coarsened = np.logical_or( + y_pred_coarsened_without_incomplete, is_pred_incomplete) + + # Compute a vector of "coarsened ground truth". + # For each sample, the coarsened ground truth is equal to 1 if none of the + # complete fine tags are truly present, and if the incomplete fine tag is + # truly present. Conversely, it is set equal to 0 if any of the complete + # fine tags is truly present, or if the incomplete fine tag is truly absent. + # The result is a (N,) vector. + y_true_coarsened_without_incomplete =\ + np.logical_and.reduce(np.logical_not(Y_true), axis=1) + y_true_coarsened = np.logical_and( + y_true_coarsened_without_incomplete, is_true_incomplete) + + # Compute true positives for samples with incomplete ground truth. + # For each sample n, is_TP_incomplete is equal to 1 + # if and only if the following three conditions are met: + # (i) the ground truth contains the incomplete fine tag + # (ii) the coarsened prediction of sample n contains at least one tag + # (iii) none of the predicted complete tags match a true complete tag + # The result is a (N,) vector. + is_TP_incomplete = np.logical_and.reduce(( + is_true_incomplete, + y_pred_coarsened, + np.logical_and.reduce(np.logical_not(is_TP_complete), axis=1))) + + # Compute false positives for samples with incomplete ground truth. + # For each sample n, is_FP_incomplete is equal to 1 + # if and only if the following four conditions are met: + # (i) the ground truth does not contain the incomplete fine tag + # (ii) no complete fine tags are in the ground truth + # (iii) the prediction contains the incomplete fine tag + # (iv) not all complete fine tags are in the prediction + # The result is a (N,) vector. 
+ is_FP_incomplete = np.logical_and.reduce(( + np.logical_not(is_true_incomplete), + np.logical_not(np.logical_or.reduce(Y_true, axis=1)), + is_pred_incomplete, + np.logical_not(np.logical_and.reduce(Y_pred, axis=1)))) + + # Compute false negatives for samples with incomplete ground truth. + # For each sample n, is_FN_incomplete is equal to 1 + # if and only if the following two conditions are met: + # (i) the incomplete fine tag is present in the ground truth + # (ii) the coarsened prediction of sample n does not contain any tag + # The result is a (N,) vector. + is_FN_incomplete = np.logical_and( + y_true_coarsened, np.logical_not(y_pred_coarsened)) + + + ## PART III. AGGREGATE EVALUATION OF ALL SAMPLES + # The following three sums are performed over NxK Booleans, + # implicitly converted as integers 0 (False) and 1 (True). + TP_complete = np.sum(is_TP_complete) + FP_complete = np.sum(is_FP_complete) + FN_complete = np.sum(is_FN_complete) + + # The following three sums are performed over N Booleans, + # implicitly converted as integers 0 (False) and 1 (True). + TP_incomplete = np.sum(is_TP_incomplete) + FP_incomplete = np.sum(is_FP_incomplete) + FN_incomplete = np.sum(is_FN_incomplete) + + # Sum FP, TP, and FN for samples that have complete ground truth + # with FP, TP, and FN for samples that have incomplete ground truth. + TP = TP_complete + TP_incomplete + FP = FP_complete + FP_incomplete + FN = FN_complete + FN_incomplete + return TP, FP, FN + + +def confusion_matrix_coarse(y_true, y_pred): + """ + Counts overall numbers of true positives (TP), false positives (FP), + and false negatives (FN) in the predictions of a system, for a single + Boolean attribute, in a dataset of N different samples. + + + Parameters + ---------- + y_true: array of bool, shape = [n_samples,] + One-hot encoding of true presence for a given coarse tag. + y_true[n] is equal to 1 if the tag is present in the sample. 
+ + y_pred: array of bool, shape = [n_samples,] + One-hot encoding of predicted presence for a given coarse tag. + y_pred[n] is equal to 1 if the tag is present in the sample. + + + Returns + ------- + TP: int + Number of true positives. + + FP: int + Number of false positives. + + FN: int + Number of false negatives. + """ + cm = confusion_matrix(y_true, y_pred) + FP = cm[0, 1] + FN = cm[1, 0] + TP = cm[1, 1] + return TP, FP, FN + + +def evaluate(prediction_path, annotation_path, yaml_path, mode, split="validate"): + # Set minimum threshold. + min_threshold = 0.01 + + # Create dictionary to parse tags + with open(yaml_path, 'r') as stream: + yaml_dict = yaml.load(stream, Loader=yaml.Loader) + + # Parse ground truth. + gt_df = parse_ground_truth(annotation_path, yaml_path, split) + + # Parse predictions. + if mode == "fine": + pred_df = parse_fine_prediction(prediction_path, yaml_path) + elif mode == "coarse": + pred_df = parse_coarse_prediction(prediction_path, yaml_path) + + # Check consistency between ground truth and predictions. + # Make sure the files evaluated in both tables match. + pred_audio_set = set(pred_df['audio_filename'].tolist()) + true_audio_set = set(gt_df['audio_filename'].tolist()) + if not (pred_audio_set == true_audio_set): + extra_files = pred_audio_set - true_audio_set + missing_files = true_audio_set - pred_audio_set + err_msg =\ + "File mismatch between ground truth and prediction table.\n\n" \ + "Missing files: {}\n\n Extra files: {}" + raise ValueError(err_msg.format(list(missing_files), list(extra_files))) + + # Make sure the size of the tables match + if not (len(gt_df) == len(pred_df)): + err_msg =\ + "Size mismatch between ground truth ({} files) " \ + "and prediction table ({} files)." + raise ValueError(err_msg.format(len(gt_df), len(pred_df))) + + # Initialize dictionary of DataFrames. + df_dict = {} + + # Loop over coarse categories. 
+ for coarse_id in yaml_dict["coarse"]: + # List columns corresponding to that category + if mode == "coarse": + columns = [str(coarse_id)] + else: + columns = [column for column in pred_df.columns + if (str(column).startswith(str(coarse_id))) and + ("-" in str(column)) and + (not str(column).endswith("X"))] + + # Sort columns in alphanumeric order. + columns.sort() + + # Restrict prediction to columns of interest. + restricted_pred_df = pred_df[columns] + + # Restrict ground truth to columns of interest. + restricted_gt_df = gt_df[columns] + + # Aggregate all prediction values into a "raveled" vector. + # We make an explicit numpy copy, so that the original DataFrame + # is left unchanged. + thresholds = np.ravel(np.copy(restricted_pred_df.values)) + + # Sort in place. + thresholds.sort() + + # Skip very low values. + # This is to speed up the computation of the precision-recall curve + # in the low-precision regime. + thresholds = thresholds[np.searchsorted(thresholds, min_threshold):] + + # Append a 1 to the list of thresholds. + # This will cause TP and FP to fall down to zero, but FN will be nonzero. + # This is useful for estimating the low-recall regime, and it + # facilitates micro-averaged AUPRC because it provides an upper bound + # on valid thresholds across coarse categories. + thresholds = np.append(thresholds, 1.0) + + # List thresholds by restricting observed confidences to unique elements. + thresholds = np.unique(thresholds)[::-1] + + # Count number of thresholds. + n_thresholds = len(thresholds) + TPs = np.zeros((n_thresholds,)).astype('int') + FPs = np.zeros((n_thresholds,)).astype('int') + FNs = np.zeros((n_thresholds,)).astype('int') + + # FINE MODE. + if mode == "fine": + incomplete_tag = str(coarse_id) + "-X" + + # Load ground truth as numpy array. + Y_true = restricted_gt_df.values + is_true_incomplete = gt_df[incomplete_tag].values + + # Loop over thresholds in a decreasing order. 
+ for i, threshold in enumerate(thresholds): + # Threshold prediction for complete tag. + Y_pred = restricted_pred_df.values >= threshold + + # Threshold prediction for incomplete tag. + is_pred_incomplete =\ + pred_df[incomplete_tag].values >= threshold + + # Evaluate. + TPs[i], FPs[i], FNs[i] = confusion_matrix_fine( + Y_true, Y_pred, is_true_incomplete, is_pred_incomplete) + + # COARSE MODE. + elif mode == "coarse": + # Load ground truth as numpy array. + Y_true = restricted_gt_df.values + + # Loop over thresholds in a decreasing order. + for i, threshold in enumerate(thresholds): + # Threshold prediction. + Y_pred = restricted_pred_df.values >= threshold + + # Evaluate. + TPs[i], FPs[i], FNs[i] = confusion_matrix_coarse(Y_true, Y_pred) + + # Build DataFrame from columns. + eval_df = pd.DataFrame({ + "threshold": thresholds, "TP": TPs, "FP": FPs, "FN": FNs}) + + # Add columns for precision, recall, and F1-score. + # NB: we take the maximum between TPs+FPs and mu=0.5 in the + # denominator in order to avoid division by zero. + # This only ever happens if TP+FP < 1, which + # implies TP = 0 (because TP and FP are nonnegative integers), + # and therefore a numerator of exactly zero. Therefore, any additive + # offset mu would do as long as 0 < mu < 1. Choosing mu = 0.5 is + # purely arbitrary and has no effect on the outcome (i.e. zero). + mu = 0.5 + eval_df["P"] = TPs / np.maximum(TPs + FPs, mu) + + # Likewise for recalls, although this numerical safeguard is probably + # less necessary given that TP+FN=0 implies that there are zero + # positives in the ground truth, which is unlikely but not unheard of. + eval_df["R"] = TPs / np.maximum(TPs + FNs, mu) + + # Compute F1-scores. + # NB: we use the harmonic mean formula (2/F = 1/P + 1/R) rather than + # the more common F = (2*P*R)/(P+R) in order to circumvent the edge case + # where both P and R are equal to 0 (i.e. TP = 0). + eval_df["F"] = 2 / (1/eval_df["P"] + 1/eval_df["R"]) + + # Store DataFrame in the dictionary. 
+ df_dict[coarse_id] = eval_df + + # Return dictionary. + return df_dict + + +def micro_averaged_auprc(df_dict, return_df=False): + """ + Compute micro-averaged area under the precision-recall curve (AUPRC) + from a dictionary of class-wise DataFrames obtained via `evaluate`. + """ + # List all unique values of thresholds across coarse categories. + thresholds = np.unique( + np.hstack([x["threshold"] for x in df_dict.values()])) + + # Count number of unique thresholds. + n_thresholds = len(thresholds) + + # Initialize arrays for TP, FP, and FN + TPs = np.zeros((n_thresholds,)).astype('int') + FPs = np.zeros((n_thresholds,)).astype('int') + FNs = np.zeros((n_thresholds,)).astype('int') + + # Loop over thresholds. + for i, threshold in enumerate(thresholds): + + # Initialize counters of TP, FP, and FN across all categories. + global_TP, global_FP, global_FN = 0, 0, 0 + + # Loop over coarse categories. + for coarse_id in df_dict.keys(): + + # Find last row above threshold. + coarse_df = df_dict[coarse_id] + coarse_thresholds = coarse_df["threshold"] + row = coarse_df[coarse_thresholds>=threshold].iloc[-1] + + # Increment TP, FP, and FN. + global_TP += row["TP"] + global_FP += row["FP"] + global_FN += row["FN"] + + # Store micro-averaged values of TP, FP, and FN for the given threshold. + TPs[i] = global_TP + FPs[i] = global_FP + FNs[i] = global_FN + + # Build DataFrame from columns. + eval_df = pd.DataFrame({ + "threshold": thresholds, "TP": TPs, "FP": FPs, "FN": FNs}) + + # Add columns for precision, recall, and F1-score. + # NB: we take the maximum between TPs+FPs and mu = 0.5 in the + # denominator in order to avoid division by zero. + # This only ever happens if TP+FP < 1, which + # implies TP = 0 (because TP and FP are nonnegative integers), + # and therefore a numerator of exactly zero. Therefore, any additive + # offset mu would do as long as 0 < mu < 1. Choosing mu = 0.5 is + # purely arbitrary and has no effect on the outcome (i.e. zero). 
+ mu = 0.5 + eval_df["P"] = TPs / np.maximum(TPs + FPs, mu) + + # Likewise for recalls, although this numerical safeguard is probably + # less necessary given that TP+FN=0 implies that there are zero + # positives in the ground truth, which is unlikely but not unheard of. + eval_df["R"] = TPs / np.maximum(TPs + FNs, mu) + + # Sort PR curve by ascending recall. + sorting_indices = np.argsort(list(eval_df["R"])) + recalls = np.array([0.0] + list(eval_df["R"][sorting_indices]) + [1.0]) + precisions = np.array([1.0] + list(eval_df["P"][sorting_indices]) + [0.0]) + auprc = auc(recalls, precisions) + + # If the DataFrame containing the full P-R curve is requested. + if return_df: + # Compute F1-scores. + # NB: we use the harmonic mean formula (2/F = 1/P + 1/R) rather than + # the more common F = (2*P*R)/(P+R) in order to circumvent the edge case + # where both P and R are equal to 0 (i.e. TP = 0). + eval_df["F"] = 2 / (1/eval_df["P"] + 1/eval_df["R"]) + + # Return + return auprc, eval_df + else: + # Otherwise, return only the AUPRC as a scalar. + return auprc + + + +def macro_averaged_auprc(df_dict, return_classwise=False): + """ + Compute macro-averaged area under the precision-recall curve (AUPRC) + from a dictionary of class-wise DataFrames obtained via `evaluate`. + """ + # Initialize list of category-wise AUPRCs. + auprcs = [] + coarse_id_list = df_dict.keys() + + # Loop over coarse categories. + for coarse_id in coarse_id_list: + # Load precisions and recalls. + # NB: we prepend a (1,0) and append a (0,1) to the curve so that the + # curve reaches the top-left and bottom-right quadrants of the + # precision-recall square. + sorting_indices = df_dict[coarse_id]["R"].argsort() + recalls = np.array( + [0.0] + list(df_dict[coarse_id]["R"][sorting_indices]) + [1.0]) + precisions = np.array( + [1.0] + list(df_dict[coarse_id]["P"][sorting_indices]) + [0.0]) + auprcs.append(auc(recalls, precisions)) + + # Average AUPRCs across coarse categories with uniform weighting. 
+ mean_auprc = np.mean(auprcs) + + if return_classwise: + class_auprc = {coarse_id: auprc + for coarse_id, auprc in zip(coarse_id_list, auprcs)} + return mean_auprc, class_auprc + else: + return mean_auprc + + +def parse_coarse_prediction(pred_csv_path, yaml_path): + """ + Parse coarse-level predictions from a CSV file containing both fine-level + and coarse-level predictions (and possibly additional metadata). + Returns a Pandas DataFrame in which the column names are coarse + IDs of the form 1, 2, 3 etc. + + + Parameters + ---------- + pred_csv_path: string + Path to the CSV file containing predictions. + + yaml_path: string + Path to the YAML file containing coarse taxonomy. + + + Returns + ------- + pred_coarse_df: DataFrame + Coarse-level complete predictions. + """ + + # Create dictionary to parse tags + with open(yaml_path, 'r') as stream: + yaml_dict = yaml.load(stream, Loader=yaml.Loader) + + # Collect tag names as strings and map them to coarse ID pairs. + rev_coarse_dict = {"_".join([str(k), yaml_dict["coarse"][k]]): k + for k in yaml_dict["coarse"]} + + # Read comma-separated values with the Pandas library + pred_df = pd.read_csv(pred_csv_path) + + # Assign a predicted column to each coarse key, by using the tag as an + # intermediate hashing step. + pred_coarse_dict = {} + for c in rev_coarse_dict: + if c in pred_df: + pred_coarse_dict[str(rev_coarse_dict[c])] = pred_df[c] + else: + pred_coarse_dict[str(rev_coarse_dict[c])] = np.zeros((len(pred_df),)) + warnings.warn("Column not found: " + c) + + # Copy over the audio filename strings corresponding to each sample. + pred_coarse_dict["audio_filename"] = pred_df["audio_filename"] + + # Build a new Pandas DataFrame with coarse keys as column names. + pred_coarse_df = pd.DataFrame.from_dict(pred_coarse_dict) + + # Return output in DataFrame format. + # The column names are of the form 1, 2, 3, etc. 
+ return pred_coarse_df.sort_values('audio_filename') + + +def parse_fine_prediction(pred_csv_path, yaml_path): + """ + Parse fine-level predictions from a CSV file containing both fine-level + and coarse-level predictions (and possibly additional metadata). + Returns a Pandas DataFrame in which the column names are mixed (coarse-fine) + IDs of the form 1-1, 1-2, 1-3, ..., 1-X, 2-1, 2-2, 2-3, ... 2-X, 3-1, etc. + + + Parameters + ---------- + pred_csv_path: string + Path to the CSV file containing predictions. + + yaml_path: string + Path to the YAML file containing fine taxonomy. + + + Returns + ------- + pred_fine_df: DataFrame + Fine-level complete predictions. + """ + + # Create dictionary to parse tags + with open(yaml_path, 'r') as stream: + yaml_dict = yaml.load(stream, Loader=yaml.Loader) + + # Collect tag names as strings and map them to mixed (coarse-fine) ID pairs. + # The "mixed key" is a hyphenation of the coarse ID and fine ID. + fine_dict = {} + for coarse_id in yaml_dict["fine"]: + for fine_id in yaml_dict["fine"][coarse_id]: + mixed_key = "-".join([str(coarse_id), str(fine_id)]) + fine_dict[mixed_key] = "_".join([ + mixed_key, yaml_dict["fine"][coarse_id][fine_id]]) + + # Invert the key-value relationship between mixed key and tag. + # Now, tags are the keys, and mixed keys (coarse-fine IDs) are the values. + # This is possible because tags are unique. + rev_fine_dict = {fine_dict[k]: k for k in fine_dict} + + # Read comma-separated values with the Pandas library + pred_df = pd.read_csv(pred_csv_path) + + # Assign a predicted column to each mixed key, by using the tag as an + # intermediate hashing step. + pred_fine_dict = {} + for f in sorted(rev_fine_dict.keys()): + if f in pred_df: + pred_fine_dict[rev_fine_dict[f]] = pred_df[f] + else: + pred_fine_dict[rev_fine_dict[f]] = np.zeros((len(pred_df),)) + warnings.warn("Column not found: " + f) + + # Loop over coarse tags. 
+ n_samples = len(pred_df) + coarse_dict = yaml_dict["coarse"] + for coarse_id in yaml_dict["coarse"]: + # Construct incomplete fine tag by appending -X to the coarse tag. + incomplete_tag = str(coarse_id) + "-X" + + # If the incomplete tag is not in the prediction, append a column of zeros. + # This is the case e.g. for coarse ID 7 ("dogs") which has a single + # fine-level tag ("7-1_dog-barking-whining") and thus no incomplete + # tag 7-X. + if incomplete_tag not in fine_dict.keys(): + pred_fine_dict[incomplete_tag] =\ + np.zeros((n_samples,)).astype('int') + + + # Copy over the audio filename strings corresponding to each sample. + pred_fine_dict["audio_filename"] = pred_df["audio_filename"] + + # Build a new Pandas DataFrame with mixed keys as column names. + pred_fine_df = pd.DataFrame.from_dict(pred_fine_dict) + + # Return output in DataFrame format. + # Column names are 1-1, 1-2, 1-3 ... 1-X, 2-1, 2-2, 2-3 ... 2-X, 3-1, etc. + return pred_fine_df.sort_values('audio_filename') + + +def parse_ground_truth(annotation_path, yaml_path, split="validate"): + """ + Parse ground truth annotations from a CSV file containing both fine-level + and coarse-level predictions (and possibly additional metadata). + Returns a Pandas DataFrame in which the column names are coarse + IDs of the form 1, 2, 3 etc. + + + Parameters + ---------- + annotation_path: string + Path to the CSV file containing predictions. + + yaml_path: string + Path to the YAML file containing coarse taxonomy. + + + Returns + ------- + gt_df: DataFrame + Ground truth. + """ + # Create dictionary to parse tags + with open(yaml_path, 'r') as stream: + yaml_dict = yaml.load(stream, Loader=yaml.Loader) + + # Load CSV file into a Pandas DataFrame. + ann_df = pd.read_csv(annotation_path) + + # Restrict to ground truth ("annotator zero"). + gt_df = ann_df[ + (ann_df["annotator_id"]==0) & (ann_df["split"]==split)] + + # Rename coarse columns. 
+ coarse_dict = yaml_dict["coarse"] + coarse_renaming = { + "_".join([str(c), coarse_dict[c], "presence"]): str(c) + for c in coarse_dict} + gt_df = gt_df.rename(columns=coarse_renaming) + + # Collect tag names as strings and map them to mixed (coarse-fine) ID pairs. + # The "mixed key" is a hyphenation of the coarse ID and fine ID. + fine_dict = {} + for coarse_id in yaml_dict["fine"]: + for fine_id in yaml_dict["fine"][coarse_id]: + mixed_key = "-".join([str(coarse_id), str(fine_id)]) + fine_dict[mixed_key] = yaml_dict["fine"][coarse_id][fine_id] + + # Rename fine columns. + fine_renaming = {"_".join([k, fine_dict[k], "presence"]): k + for k in fine_dict} + gt_df = gt_df.rename(columns=fine_renaming) + + # Loop over coarse tags. + n_samples = len(gt_df) + coarse_dict = yaml_dict["coarse"] + for coarse_id in yaml_dict["coarse"]: + # Construct incomplete fine tag by appending -X to the coarse tag. + incomplete_tag = str(coarse_id) + "-X" + + # If the incomplete tag is not in the prediction, append a column of zeros. + # This is the case e.g. for coarse ID 7 ("dogs") which has a single + # fine-level tag ("7-1_dog-barking-whining") and thus no incomplete + # tag 7-X. + if incomplete_tag not in gt_df.columns: + gt_df[incomplete_tag] = np.zeros((n_samples,)).astype('int') + + # Return output in DataFrame format. + return gt_df.sort_values('audio_filename') -- GitLab