From 773a28577e4b0f081924e15c4cc866ac28591ab2 Mon Sep 17 00:00:00 2001
From: Simon <simon.leglaive@gmail.com>
Date: Thu, 22 Feb 2024 10:19:01 +0100
Subject: [PATCH] metrics

---
 metrics.py | 731 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 731 insertions(+)
 create mode 100644 metrics.py

diff --git a/metrics.py b/metrics.py
new file mode 100644
index 0000000..37d1ec9
--- /dev/null
+++ b/metrics.py
@@ -0,0 +1,731 @@
+#MIT License
+#
+#Copyright (c) 2019 Sounds of New York City (SONYC)
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy of
+#this software and associated documentation files (the "Software"), to deal in
+#the Software without restriction, including without limitation the rights to
+#use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+#of the Software, and to permit persons to whom the Software is furnished to do
+#so, subject to the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all
+#copies or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+#SOFTWARE.
+
+import numpy as np
+import oyaml as yaml
+import pandas as pd
+from sklearn.metrics import auc, confusion_matrix
+import warnings
+
+"""
+from https://github.com/sonyc-project/urban-sound-tagging-baseline/blob/master/urban-sound-tagging-baseline/metrics.py
+"""
+
+def confusion_matrix_fine(
+        Y_true, Y_pred, is_true_incomplete, is_pred_incomplete):
+    """
+    Counts overall numbers of true positives (TP), false positives (FP),
+    and false negatives (FN) in the predictions of a system, for a number K
+    of fine-level classes within a given coarse category, in a dataset of N
+    different samples. In addition to the K so-called "complete" tags (i.e.
+    with a determinate fine-level category as well as a determinate
+    coarse-level category), we consider the potential presence of an "incomplete"
+    tag, i.e. denoting the presence of a class with a determinate coarse-level
+    category yet no determinate fine-level category. This incomplete tag
+    may be present in either the prediction or the ground truth.
+
+    Our method for evaluating a multilabel classifier on potentially incomplete
+    knowledge of the ground truth consists of two parts, which are ultimately
+    aggregated into a single count.
+
+    For the samples with complete knowledge of the ground truth (Part I in the
+    code below), we simply apply classwise Boolean logic to compute TP, FP, and
+    FN independently for every fine-level tag, and finally aggregate across
+    all tags.
+
+    However, for the samples with incomplete knowledge of the ground truth
+    (Part II in the code below), we perform a "coarsening" of the prediction by
+    applying a disjunction on the fine-level complete tags as well as the
+    coarse incomplete tag. If that coarsened prediction is positive, the sample
+    produces a true positive; otherwise, it produces a false negative.
+
+    Samples which contain the incomplete tag in the prediction but not the
+    ground truth overlap Parts I and II. In this case, we sum the zero, one,
+    or multiple false alarm(s) from Part I with the one false alarm from Part II
+    to produce a final number of false positives FP.
+
+    Parameters
+    ----------
+    Y_true: array of bool, shape = [n_samples, n_classes]
+        One-hot encoding of true presence for complete fine tags.
+        Y_true[n, k] is equal to 1 if the class k is truly present in sample n,
+        and equal to 0 otherwise.
+
+    Y_pred: array of bool, shape = [n_samples, n_classes]
+        One-hot encoding of predicted class presence for complete fine tags.
+        Y_pred[n, k] is equal to 1 if the class k is predicted as present in
+        sample n, and equal to 0 otherwise.
+
+    is_true_incomplete: array of bool, shape = [n_samples]
+        One-hot encoding of true presence for the incomplete fine tag.
+        is_true_incomplete[n] is equal to 1 if sample n contains an item that
+        truly belongs to the coarse category at hand, but whose fine-level tag
+        is truly uncertain, or truly unlike any of the K available fine tags.
+
+    is_pred_incomplete: array of bool, shape = [n_samples]
+        One-hot encoding of predicted presence for the incomplete fine tag.
+        is_pred_incomplete[n] is equal to 1 if the system predicts the
+        existence of an item that belongs to the coarse category at hand, yet
+        the fine-level tag of that item is uncertain or unlike any of the
+        K available fine tags.
+
+    Returns
+    -------
+    TP: int
+        Number of true positives.
+
+    FP: int
+        Number of false positives.
+
+    FN: int
+        Number of false negatives.
+    """
+
+    ## PART I. SAMPLES WITH COMPLETE GROUND TRUTH AND COMPLETE PREDICTION
+    # Negate the true_incomplete Boolean and replicate it K times, where
+    # K is the number of fine tags.
+    # For each sample and fine tag, this mask is equal to 0 if the
+    # ground truth contains the incomplete fine tag and 1 if the ground
+    # truth does not contain the incomplete fine tag.
+    # The result is a (N, K) matrix.
+    Y_true_complete = np.tile(np.logical_not(
+        is_true_incomplete)[:, np.newaxis], (1, Y_pred.shape[1]))
+
+    # Compute true positives for samples with complete ground truth.
+    # For each sample n and each complete tag k, is_TP_complete is equal to 1
+    # if and only if the following two conditions are met:
+    # (i)  the ground truth of sample n contains complete fine tag k
+    # (ii) the prediction of sample n contains complete fine tag k
+    # The result is a (N, K) matrix.
+    is_TP_complete = np.logical_and.reduce((Y_true, Y_pred))
+
+    # Compute false positives for samples with complete ground truth.
+    # For each sample n and each complete tag k, is_FP_complete is equal to 1
+    # if and only if the following three conditions are met:
+    # (i)   the ground truth of sample n is complete
+    # (ii)  the ground truth of sample n does not contain complete fine tag k
+    # (iii) the prediction of sample n contains complete fine tag k
+    # The result is a (N, K) matrix.
+    is_FP_complete = np.logical_and.reduce(
+        (np.logical_not(Y_true), Y_pred, Y_true_complete))
+
+    # Compute false negatives for samples with complete ground truth.
+    # For each sample n and each complete tag k, is_FN_complete is equal to 1
+    # if and only if the following two conditions are met:
+    # (i)  the ground truth of sample n contains complete fine tag k
+    # (ii) the prediction of sample n does not contain complete fine tag k
+    # The result is a (N, K) matrix.
+    is_FN_complete = np.logical_and(Y_true, np.logical_not(Y_pred))
+
+
+    ## PART II. SAMPLES WITH INCOMPLETE GROUND TRUTH OR INCOMPLETE PREDICTION.
+    # Compute a vector of "coarsened prediction".
+    # For each sample, the coarsened prediction is equal to 1 if any of the
+    # complete fine tags is predicted as present, or if the incomplete fine
+    # tag is predicted as present. Conversely, it is set equal to 0 if all
+    # of the complete fine tags are predicted as absent, and if the incomplete
+    # fine tag is predicted as absent.
+    # The result is a (N,) vector.
+    y_pred_coarsened_without_incomplete = np.logical_or.reduce(Y_pred, axis=1)
+    y_pred_coarsened = np.logical_or(
+        y_pred_coarsened_without_incomplete, is_pred_incomplete)
+
+    # Compute a vector of "coarsened ground truth".
+    # For each sample, the coarsened ground truth is equal to 1 if none of the
+    # complete fine tags are truly present, and if the incomplete fine tag is
+    # truly present. Conversely, it is set equal to 0 if any of the complete
+    # fine tags is truly present, or if the incomplete fine tag is truly absent.
+    # The result is a (N,) vector.
+    y_true_coarsened_without_incomplete =\
+        np.logical_and.reduce(np.logical_not(Y_true), axis=1)
+    y_true_coarsened = np.logical_and(
+        y_true_coarsened_without_incomplete, is_true_incomplete)
+
+    # Compute true positives for samples with incomplete ground truth.
+    # For each sample n, is_TP_incomplete is equal to 1
+    # if and only if the following three conditions are met:
+    # (i)   the ground truth contains the incomplete fine tag
+    # (ii)  the coarsened prediction of sample n contains at least one tag
+    # (iii) none of the predicted complete tags match a true complete tag
+    # The result is a (N,) vector.
+    is_TP_incomplete = np.logical_and.reduce((
+        is_true_incomplete,
+        y_pred_coarsened,
+        np.logical_and.reduce(np.logical_not(is_TP_complete), axis=1)))
+
+    # Compute false positives for samples with incomplete ground truth.
+    # For each sample n, is_FP_incomplete is equal to 1
+    # if and only if the following four conditions are met:
+    # (i)   the ground truth does not contain the incomplete fine tag
+    # (ii)  no complete fine tags are in the ground truth
+    # (iii) the prediction contains the incomplete fine tag
+    # (iv)  not all complete fine tags are in the prediction
+    # The result is a (N,) vector.
+    is_FP_incomplete = np.logical_and.reduce((
+        np.logical_not(is_true_incomplete),
+        np.logical_not(np.logical_or.reduce(Y_true, axis=1)),
+        is_pred_incomplete,
+        np.logical_not(np.logical_and.reduce(Y_pred, axis=1))))
+
+    # Compute false negatives for samples with incomplete ground truth.
+    # For each sample n, is_FN_incomplete is equal to 1
+    # if and only if the following two conditions are met:
+    # (i)   the coarsened ground truth of sample n is positive
+    # (ii)  the coarsened prediction of sample n does not contain any tag
+    # The result is a (N,) vector.
+    is_FN_incomplete = np.logical_and(
+        y_true_coarsened, np.logical_not(y_pred_coarsened))
+
+
+    ## PART III. AGGREGATE EVALUATION OF ALL SAMPLES
+    # The following three sums are performed over NxK Booleans,
+    # implicitly converted as integers 0 (False) and 1 (True).
+    TP_complete = np.sum(is_TP_complete)
+    FP_complete = np.sum(is_FP_complete)
+    FN_complete = np.sum(is_FN_complete)
+
+    # The following three sums are performed over N Booleans,
+    # implicitly converted as integers 0 (False) and 1 (True).
+    TP_incomplete = np.sum(is_TP_incomplete)
+    FP_incomplete = np.sum(is_FP_incomplete)
+    FN_incomplete = np.sum(is_FN_incomplete)
+
+    # Sum FP, TP, and FN for samples that have complete ground truth
+    # with FP, TP, and FN for samples that have incomplete ground truth.
+    TP = TP_complete + TP_incomplete
+    FP = FP_complete + FP_incomplete
+    FN = FN_complete + FN_incomplete
+    return TP, FP, FN
+
+
+def confusion_matrix_coarse(y_true, y_pred):
+    """
+    Count true positives (TP), false positives (FP), and false negatives
+    (FN) of a system for a single Boolean attribute over N samples.
+
+    The counts come from elementwise Boolean logic, so they stay defined
+    when only one class occurs (e.g. no positive at all in y_true or y_pred).
+
+    Parameters
+    ----------
+    y_true: array of bool, shape = [n_samples,] or [n_samples, 1]
+        True presence of the coarse tag; nonzero means present.
+
+    y_pred: array of bool, shape = [n_samples,] or [n_samples, 1]
+        Predicted presence of the coarse tag; nonzero means present.
+
+    Returns
+    -------
+    TP: int
+        Number of true positives.
+
+    FP: int
+        Number of false positives.
+
+    FN: int
+        Number of false negatives.
+    """
+    # Flatten to 1-D Booleans: `evaluate` passes (N, 1) column arrays.
+    y_true = np.asarray(y_true).astype(bool).ravel()
+    y_pred = np.asarray(y_pred).astype(bool).ravel()
+    TP = np.sum(np.logical_and(y_true, y_pred))
+    FP = np.sum(np.logical_and(np.logical_not(y_true), y_pred))
+    FN = np.sum(np.logical_and(y_true, np.logical_not(y_pred)))
+    return TP, FP, FN
+
+
+def evaluate(prediction_path, annotation_path, yaml_path, mode, split="validate"):
+    """
+    Compute TP, FP, FN, precision (P), recall (R), and F1-score (F) at every
+    observed confidence threshold, separately for each coarse category.
+
+    Parameters
+    ----------
+    prediction_path: string
+        Path to the CSV file containing predictions.
+    annotation_path: string
+        Path to the CSV file containing annotations.
+    yaml_path: string
+        Path to the YAML file containing the taxonomy.
+    mode: string
+        Evaluation level, either "fine" or "coarse".
+    split: string
+        Value of the "split" annotation column to evaluate on.
+
+    Returns
+    -------
+    df_dict: dict
+        Maps each coarse ID to a DataFrame with columns threshold, TP, FP,
+        FN, P, R, and F, with thresholds in decreasing order.
+
+    Raises
+    ------
+    ValueError
+        If mode is unknown, or if the files in the prediction table do not
+        match the files in the ground truth.
+    """
+    # Fail early on an unknown mode instead of hitting an unbound pred_df.
+    if mode not in ("fine", "coarse"):
+        raise ValueError('mode must be "fine" or "coarse", got {!r}'.format(mode))
+
+    # Set minimum threshold.
+    min_threshold = 0.01
+
+    # Create dictionary to parse tags.
+    # NOTE(review): yaml.Loader is not a safe loader; pass trusted files only.
+    with open(yaml_path, 'r') as stream:
+        yaml_dict = yaml.load(stream, Loader=yaml.Loader)
+
+    # Parse ground truth and predictions.
+    gt_df = parse_ground_truth(annotation_path, yaml_path, split)
+    if mode == "fine":
+        pred_df = parse_fine_prediction(prediction_path, yaml_path)
+    else:
+        pred_df = parse_coarse_prediction(prediction_path, yaml_path)
+
+    # Make sure the files evaluated in both tables match.
+    pred_audio_set = set(pred_df['audio_filename'].tolist())
+    true_audio_set = set(gt_df['audio_filename'].tolist())
+    if not (pred_audio_set == true_audio_set):
+        extra_files = pred_audio_set - true_audio_set
+        missing_files = true_audio_set - pred_audio_set
+        err_msg =\
+            "File mismatch between ground truth and prediction table.\n\n" \
+            "Missing files: {}\n\n Extra files: {}"
+        raise ValueError(err_msg.format(list(missing_files), list(extra_files)))
+
+    # Make sure the size of the tables match.
+    if not (len(gt_df) == len(pred_df)):
+        err_msg =\
+            "Size mismatch between ground truth ({} files) " \
+            "and prediction table ({} files)."
+        raise ValueError(err_msg.format(len(gt_df), len(pred_df)))
+
+    # Initialize dictionary of DataFrames.
+    df_dict = {}
+
+    # Loop over coarse categories.
+    for coarse_id in yaml_dict["coarse"]:
+        # List columns corresponding to that category.
+        if mode == "coarse":
+            columns = [str(coarse_id)]
+        else:
+            # Match the full "<coarse_id>-" prefix so that e.g. coarse ID 1
+            # does not also capture the columns of coarse ID 10.
+            fine_prefix = str(coarse_id) + "-"
+            columns = [column for column in pred_df.columns
+                if str(column).startswith(fine_prefix) and
+                   (not str(column).endswith("X"))]
+
+        # Sort columns in alphanumeric order.
+        columns.sort()
+
+        # Restrict prediction and ground truth to columns of interest.
+        pred_values = pred_df[columns].values
+        Y_true = gt_df[columns].values
+
+        # Sort a copy of all prediction values (the DataFrame is left
+        # unchanged) and skip very low values, to speed up the computation
+        # of the precision-recall curve in the low-precision regime.
+        thresholds = np.ravel(np.copy(pred_values))
+        thresholds.sort()
+        thresholds = thresholds[np.searchsorted(thresholds, min_threshold):]
+
+        # Append a 1 and keep unique thresholds in decreasing order. Unless a
+        # score reaches 1, TP and FP fall to zero there while FN may be nonzero,
+        # which helps estimate the low-recall regime and gives micro-averaged
+        # AUPRC an upper bound on valid thresholds across coarse categories.
+        thresholds = np.unique(np.append(thresholds, 1.0))[::-1]
+
+        # Count number of thresholds.
+        n_thresholds = len(thresholds)
+        TPs = np.zeros((n_thresholds,)).astype('int')
+        FPs = np.zeros((n_thresholds,)).astype('int')
+        FNs = np.zeros((n_thresholds,)).astype('int')
+
+        # FINE MODE.
+        if mode == "fine":
+            # Ground truth and prediction scores for the incomplete tag.
+            incomplete_tag = str(coarse_id) + "-X"
+            is_true_incomplete = gt_df[incomplete_tag].values
+            incomplete_scores = pred_df[incomplete_tag].values
+
+            # Loop over thresholds in a decreasing order.
+            for i, threshold in enumerate(thresholds):
+                # Threshold prediction for complete and incomplete tags.
+                Y_pred = pred_values >= threshold
+                is_pred_incomplete = incomplete_scores >= threshold
+
+                # Evaluate.
+                TPs[i], FPs[i], FNs[i] = confusion_matrix_fine(
+                    Y_true, Y_pred, is_true_incomplete, is_pred_incomplete)
+
+        # COARSE MODE.
+        else:
+            # Loop over thresholds in a decreasing order.
+            for i, threshold in enumerate(thresholds):
+                # Threshold prediction, then evaluate.
+                Y_pred = pred_values >= threshold
+                TPs[i], FPs[i], FNs[i] = confusion_matrix_coarse(Y_true, Y_pred)
+
+        # Build DataFrame from columns.
+        eval_df = pd.DataFrame({
+            "threshold": thresholds, "TP": TPs, "FP": FPs, "FN": FNs})
+
+        # Precision and recall. The mu floor on each denominator only avoids
+        # a division by zero: a denominator below 1 implies TP = 0, hence a
+        # ratio of exactly zero for any offset 0 < mu < 1.
+        mu = 0.5
+        eval_df["P"] = TPs / np.maximum(TPs + FPs, mu)
+        eval_df["R"] = TPs / np.maximum(TPs + FNs, mu)
+
+        # F1-score via the harmonic mean formula (2/F = 1/P + 1/R) rather
+        # than F = (2*P*R)/(P+R), to circumvent the edge case P = R = 0.
+        eval_df["F"] = 2 / (1/eval_df["P"] + 1/eval_df["R"])
+
+        # Store DataFrame in the dictionary.
+        df_dict[coarse_id] = eval_df
+
+    return df_dict
+
+
+def micro_averaged_auprc(df_dict, return_df=False):
+    """
+    Compute micro-averaged area under the precision-recall curve (AUPRC)
+    from a dictionary of class-wise DataFrames obtained via `evaluate`.
+
+    TP, FP, and FN are summed across coarse categories at each threshold,
+    and precision and recall are derived from these pooled counts.
+
+    Parameters
+    ----------
+    df_dict: dict
+        Class-wise DataFrames with columns threshold, TP, FP, and FN.
+
+    return_df: bool
+        Whether to also return the DataFrame of the pooled P-R curve.
+
+    Returns
+    -------
+    auprc: float
+        Micro-averaged AUPRC.
+
+    eval_df: DataFrame
+        Pooled threshold, TP, FP, FN, P, R, and F. Only if return_df is True.
+    """
+    # List the class-wise DataFrames once; they are scanned at each threshold.
+    class_dfs = list(df_dict.values())
+
+    # Pool the thresholds of all coarse categories, without duplicates.
+    thresholds = np.unique(
+        np.hstack([class_df["threshold"] for class_df in class_dfs]))
+    n_thresholds = len(thresholds)
+
+    # Pooled counts of TP, FP, and FN, one entry per threshold.
+    TPs = np.zeros((n_thresholds,)).astype('int')
+    FPs = np.zeros((n_thresholds,)).astype('int')
+    FNs = np.zeros((n_thresholds,)).astype('int')
+
+    for i, threshold in enumerate(thresholds):
+        # In every category, pick the last row whose threshold is at least
+        # the pooled threshold.
+        rows = [class_df[class_df["threshold"] >= threshold].iloc[-1]
+                for class_df in class_dfs]
+
+        # Sum the counts of the selected rows across categories.
+        TPs[i] = sum(row["TP"] for row in rows)
+        FPs[i] = sum(row["FP"] for row in rows)
+        FNs[i] = sum(row["FN"] for row in rows)
+
+    # Build DataFrame from columns.
+    eval_df = pd.DataFrame({
+        "threshold": thresholds, "TP": TPs, "FP": FPs, "FN": FNs})
+
+    # Precision and recall. The mu floor on each denominator only avoids a
+    # division by zero: a denominator below 1 implies TP = 0, hence a ratio
+    # of exactly zero for any offset 0 < mu < 1.
+    mu = 0.5
+    eval_df["P"] = TPs / np.maximum(TPs + FPs, mu)
+    eval_df["R"] = TPs / np.maximum(TPs + FNs, mu)
+
+    # Sort PR curve by ascending recall, then anchor it at (R=0, P=1) and
+    # (R=1, P=0) before integrating.
+    sorting_indices = np.argsort(list(eval_df["R"]))
+    sorted_recalls = list(eval_df["R"][sorting_indices])
+    sorted_precisions = list(eval_df["P"][sorting_indices])
+    auprc = auc(
+        np.array([0.0] + sorted_recalls + [1.0]),
+        np.array([1.0] + sorted_precisions + [0.0]))
+
+    # Without the full P-R curve, return only the AUPRC as a scalar.
+    if not return_df:
+        return auprc
+
+    # Compute F1-scores.
+    # NB: the harmonic mean formula (2/F = 1/P + 1/R) is used rather than
+    # the more common F = (2*P*R)/(P+R) in order to circumvent the edge case
+    # where both P and R are equal to 0 (i.e. TP = 0).
+    eval_df["F"] = 2 / (1/eval_df["P"] + 1/eval_df["R"])
+
+    # Return the AUPRC together with the full P-R curve.
+    return auprc, eval_df
+
+
+
+def macro_averaged_auprc(df_dict, return_classwise=False):
+    """
+    Compute macro-averaged area under the precision-recall curve (AUPRC)
+    from a dictionary of class-wise DataFrames obtained via `evaluate`.
+
+    Returns the mean AUPRC and, if return_classwise is True, a dictionary
+    mapping each coarse ID to its own AUPRC as a second output.
+    """
+    # Category-wise AUPRCs, keyed by coarse ID.
+    class_auprc = {}
+
+    for coarse_id, class_df in df_dict.items():
+        # Order the points of the curve by ascending recall.
+        sorting_indices = class_df["R"].argsort()
+        sorted_recalls = list(class_df["R"][sorting_indices])
+        sorted_precisions = list(class_df["P"][sorting_indices])
+
+        # Anchor the curve at (R=0, P=1) and (R=1, P=0) so that it spans
+        # the whole recall axis of the precision-recall square.
+        class_auprc[coarse_id] = auc(
+            np.array([0.0] + sorted_recalls + [1.0]),
+            np.array([1.0] + sorted_precisions + [0.0]))
+
+    # Average AUPRCs across coarse categories with uniform weighting.
+    mean_auprc = np.mean(list(class_auprc.values()))
+
+    # Return the mean alone unless class-wise values are requested.
+    if not return_classwise:
+        return mean_auprc
+
+    return mean_auprc, class_auprc
+
+
+def parse_coarse_prediction(pred_csv_path, yaml_path):
+    """
+    Parse coarse-level predictions from a CSV file containing both fine-level
+    and coarse-level predictions (and possibly additional metadata).
+
+    Coarse columns are looked up in the CSV file by their tag, of the form
+    "<coarse ID>_<coarse name>". A tag without a matching column yields a
+    column of zeros and a warning.
+
+    Parameters
+    ----------
+    pred_csv_path: string
+        Path to the CSV file containing predictions.
+
+    yaml_path: string
+        Path to the YAML file containing coarse taxonomy.
+
+    Returns
+    -------
+    pred_coarse_df: DataFrame
+        Coarse-level predictions, sorted by audio filename. Besides
+        "audio_filename", the column names are coarse IDs: "1", "2", etc.
+    """
+    # Load the taxonomy and keep its coarse section.
+    with open(yaml_path, 'r') as stream:
+        taxonomy = yaml.load(stream, Loader=yaml.Loader)
+    coarse_names = taxonomy["coarse"]
+
+    # Map every coarse tag, as spelled in the CSV header, to its coarse ID.
+    tag_to_id = {}
+    for coarse_id in coarse_names:
+        tag = "_".join([str(coarse_id), coarse_names[coarse_id]])
+        tag_to_id[tag] = coarse_id
+
+    # Read comma-separated values with the Pandas library.
+    pred_df = pd.read_csv(pred_csv_path)
+
+    # Gather one column per coarse ID, falling back to zeros (with a
+    # warning) when the CSV file has no column for the tag.
+    pred_coarse_dict = {}
+    for tag, coarse_id in tag_to_id.items():
+        if tag not in pred_df:
+            pred_coarse_dict[str(coarse_id)] = np.zeros((len(pred_df),))
+            warnings.warn("Column not found: " + tag)
+        else:
+            pred_coarse_dict[str(coarse_id)] = pred_df[tag]
+
+    # Copy over the audio filename strings corresponding to each sample.
+    pred_coarse_dict["audio_filename"] = pred_df["audio_filename"]
+
+    # Build the output DataFrame and sort its rows by audio filename.
+    pred_coarse_df = pd.DataFrame.from_dict(pred_coarse_dict)
+    return pred_coarse_df.sort_values('audio_filename')
+
+
+def parse_fine_prediction(pred_csv_path, yaml_path):
+    """
+    Parse fine-level predictions from a CSV file containing both fine-level
+    and coarse-level predictions (and possibly additional metadata).
+
+    Fine columns are looked up in the CSV file by their tag, of the form
+    "<coarse ID>-<fine ID>_<fine name>". A tag without a matching column
+    yields a column of zeros and a warning. A coarse category whose taxonomy
+    has no incomplete tag "<coarse ID>-X" gets a column of zeros for it.
+
+    Parameters
+    ----------
+    pred_csv_path: string
+        Path to the CSV file containing predictions.
+
+    yaml_path: string
+        Path to the YAML file containing fine taxonomy.
+
+    Returns
+    -------
+    pred_fine_df: DataFrame
+        Fine-level predictions, sorted by audio filename. Besides
+        "audio_filename", the column names are mixed (coarse-fine) IDs such
+        as 1-1, 1-2, 1-X, 2-1, etc.
+    """
+    # Create dictionary to parse tags.
+    with open(yaml_path, 'r') as stream:
+        yaml_dict = yaml.load(stream, Loader=yaml.Loader)
+
+    # Build the tag of every fine class, keyed by its "mixed key", i.e. the
+    # hyphenation of the coarse ID and the fine ID.
+    fine_dict = {}
+    for coarse_id in yaml_dict["fine"]:
+        fine_names = yaml_dict["fine"][coarse_id]
+        for fine_id in fine_names:
+            mixed_key = "-".join([str(coarse_id), str(fine_id)])
+            fine_dict[mixed_key] = "_".join([mixed_key, fine_names[fine_id]])
+
+    # Invert the mapping: tags become the keys and mixed keys the values.
+    # This is possible because tags are unique.
+    rev_fine_dict = {tag: mixed_key for mixed_key, tag in fine_dict.items()}
+
+    # Read comma-separated values with the Pandas library.
+    pred_df = pd.read_csv(pred_csv_path)
+
+    # Number of samples, i.e. the length of every column of zeros below.
+    n_samples = len(pred_df)
+
+    # Gather one column per mixed key, visiting tags in sorted order and
+    # falling back to zeros (with a warning) when a tag has no column.
+    pred_fine_dict = {}
+    for tag in sorted(rev_fine_dict):
+        mixed_key = rev_fine_dict[tag]
+        if tag not in pred_df:
+            pred_fine_dict[mixed_key] = np.zeros((n_samples,))
+            warnings.warn("Column not found: " + tag)
+        else:
+            pred_fine_dict[mixed_key] = pred_df[tag]
+
+    # Loop over coarse tags.
+    for coarse_id in yaml_dict["coarse"]:
+        # Construct incomplete fine tag by appending -X to the coarse tag.
+        incomplete_tag = str(coarse_id) + "-X"
+
+        # If the taxonomy has no such incomplete tag, add a column of zeros.
+        # This is the case e.g. for coarse ID 7 ("dogs") which has a single
+        # fine-level tag ("7-1_dog-barking-whining") and thus no tag 7-X.
+        if incomplete_tag not in fine_dict:
+            pred_fine_dict[incomplete_tag] =\
+                np.zeros((n_samples,)).astype('int')
+
+    # Copy over the audio filename strings corresponding to each sample.
+    pred_fine_dict["audio_filename"] = pred_df["audio_filename"]
+
+    # Build a new Pandas DataFrame with mixed keys as column names.
+    pred_fine_df = pd.DataFrame.from_dict(pred_fine_dict)
+
+    # Sort rows by audio filename before returning.
+    return pred_fine_df.sort_values('audio_filename')
+
+
+def parse_ground_truth(annotation_path, yaml_path, split="validate"):
+    """
+    Parse ground truth annotations from a CSV file containing both fine-level
+    and coarse-level annotations (and possibly additional metadata).
+
+    Only the rows of "annotator zero" in the requested split are kept.
+    Presence columns are renamed from "<ID>_<name>_presence" to their ID:
+    coarse IDs of the form 1, 2, 3 etc. and mixed (coarse-fine) IDs of the
+    form 1-1, 1-2, ..., 1-X, 2-1, etc.
+
+    Parameters
+    ----------
+    annotation_path: string
+        Path to the CSV file containing annotations.
+
+    yaml_path: string
+        Path to the YAML file containing the taxonomy.
+
+    split: string
+        Value of the "split" column to restrict the annotations to.
+
+    Returns
+    -------
+    gt_df: DataFrame
+        Ground truth, sorted by audio filename.
+    """
+    # Create dictionary to parse tags.
+    with open(yaml_path, 'r') as stream:
+        yaml_dict = yaml.load(stream, Loader=yaml.Loader)
+
+    # Load CSV file into a Pandas DataFrame.
+    ann_df = pd.read_csv(annotation_path)
+
+    # Restrict to ground truth ("annotator zero") in the requested split.
+    is_ground_truth = (ann_df["annotator_id"]==0) & (ann_df["split"]==split)
+    gt_df = ann_df[is_ground_truth]
+
+    # Rename coarse columns to their coarse ID.
+    coarse_dict = yaml_dict["coarse"]
+    coarse_renaming = {}
+    for coarse_id in coarse_dict:
+        column = "_".join([str(coarse_id), coarse_dict[coarse_id], "presence"])
+        coarse_renaming[column] = str(coarse_id)
+    gt_df = gt_df.rename(columns=coarse_renaming)
+
+    # Rename fine columns to their "mixed key", i.e. the hyphenation of the
+    # coarse ID and the fine ID.
+    fine_renaming = {}
+    for coarse_id in yaml_dict["fine"]:
+        fine_names = yaml_dict["fine"][coarse_id]
+        for fine_id in fine_names:
+            mixed_key = "-".join([str(coarse_id), str(fine_id)])
+            column = "_".join([mixed_key, fine_names[fine_id], "presence"])
+            fine_renaming[column] = mixed_key
+    gt_df = gt_df.rename(columns=fine_renaming)
+
+    # Loop over coarse tags.
+    n_samples = len(gt_df)
+    for coarse_id in coarse_dict:
+        # Construct incomplete fine tag by appending -X to the coarse tag.
+        incomplete_tag = str(coarse_id) + "-X"
+
+        # If the incomplete tag is not in the ground truth, append a column
+        # of zeros.
+        if incomplete_tag not in gt_df.columns:
+            gt_df[incomplete_tag] = np.zeros((n_samples,)).astype('int')
+
+    # Return output in DataFrame format.
+    return gt_df.sort_values('audio_filename')
-- 
GitLab