From 97df12305ac56161c5ee65dc5e93b631a70feed9 Mon Sep 17 00:00:00 2001
From: Mahmoud Bentriou <mahmoud.bentriou@centralesupelec.fr>
Date: Thu, 13 May 2021 13:21:52 +0200
Subject: [PATCH] Implementation of ABC model choice with random forest

---
 algorithms/abc_model_choice.jl        | 106 +++++++++++++++++++++++++++
 core/MarkovProcesses.jl               |  19 ++++-
 tests/abc_model_choice/toy_example.jl |  84 ++++++++++++++++++++
 3 files changed, 207 insertions(+), 2 deletions(-)
 create mode 100644 algorithms/abc_model_choice.jl
 create mode 100644 tests/abc_model_choice/toy_example.jl

diff --git a/algorithms/abc_model_choice.jl b/algorithms/abc_model_choice.jl
new file mode 100644
index 0000000..4ccdc99
--- /dev/null
+++ b/algorithms/abc_model_choice.jl
@@ -0,0 +1,106 @@
+
+# Reference table for ABC model choice: model indexes (the labels y) paired
+# with the summary statistics of the corresponding simulations (the features X).
+struct AbcModelChoiceDataset
+    models_indexes::Vector{Int}
+    summary_stats_vector::Vector
+    epsilon::Float64
+end
+
+struct RandomForestABC
+    reference_table::AbcModelChoiceDataset
+    clf
+    summary_stats_observations
+    estim_model
+end
+
+# Extend Base.getproperty so that dataset.X returns the features and
+# dataset.y the labels, following the usual machine learning notation.
+function Base.getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
+    if sym == :X
+        return dataset.summary_stats_vector
+    elseif sym == :y
+        return dataset.models_indexes
+    else
+        return getfield(dataset, sym)
+    end
+end
+
+function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
+                                  summary_stats_observations,
+                                  summary_stats_func::Function, distance_func::Function,
+                                  k::Int, N::Int)
+    nbr_models = length(models)
+    models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
+    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N)
+end
+
+function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
+                                  summary_stats_observations,
+                                  summary_stats_func::Function, distance_func::Function,
+                                  k::Int, N::Int)
+    @assert length(models) >= 2 "`models` must contain at least two models"
+    @assert ncategories(models_prior) == length(models) "The number of categories of the models' prior must equal the number of models"
+
+    models_indexes = zeros(Int, N)
+    summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N)
+    distances = zeros(N)
+    bool_parametric = typeof(models) <: Vector{ParametricModel}
+    for i = 1:N
+        # Draw a model index from the prior and simulate the corresponding model.
+        current_idx_model = rand(models_prior)
+        models_indexes[i] = current_idx_model
+        if bool_parametric
+            param_model = models[current_idx_model]
+            vec_p = rand(param_model.distribution)
+            sim = simulate(param_model, vec_p)
+        else
+            sim = simulate(models[current_idx_model])
+        end
+        summary_stats_vector[i] = summary_stats_func(sim)
+        distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations)
+    end
+    # Keep the k simulations nearest to the observed summary statistics;
+    # epsilon is the largest accepted distance.
+    k_nn = sortperm(distances, alg = QuickSort)[1:k]
+
+    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_vector[k_nn], distances[k_nn[end]])
+end
+
+function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
+                             summary_stats_observations,
+                             summary_stats_func::Function, N_ref::Int;
+                             k::Int = N_ref, distance_func::Function = (x,y) -> 1,
+                             hyperparameters_range::Dict = Dict(:n_estimators => [200], :min_samples_leaf => [1, 2],
+                                                                :min_samples_split => [2,5]))
+    @assert k <= N_ref
+    # With the defaults k = N_ref and a constant distance, no rejection occurs:
+    # the whole reference table is used to train the classifier.
+    trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
+    gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
+    fit!(gridsearch, trainset.X, trainset.y)
+    best_rf = gridsearch.best_estimator_
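+    # Bundle the reference table, the fitted forest, the observed summary
+    # statistics and the model index predicted for them (estim_model).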
+    return RandomForestABC(trainset, best_rf, summary_stats_observations,
+                           predict(best_rf, [summary_stats_observations]))
+end
+
+# Estimates P(m = m^(ss_obs) | ss_obs): the posterior probability of the model
+# selected by the random forest, obtained by regressing the classifier's
+# out-of-bag errors on the summary statistics (Pudlo et al., 2016).
+function posterior_proba_model(rf_abc::RandomForestABC)
+    oob_votes = rf_abc.clf.oob_decision_function_
+    y_pred_oob = argmax.([oob_votes[i,:] for i = 1:size(oob_votes, 1)])
+    # 1 if the i-th simulation is misclassified out-of-bag, 0 otherwise
+    y_oob_regression = y_pred_oob .!= rf_abc.reference_table.y
+    # Train the regressor with the same hyperparameters as the classifier.
+    dict_params = Dict()
+    for param in ["n_estimators", "min_samples_leaf", "min_samples_split", "oob_score"]
+        dict_params[Symbol(param)] = get_params(rf_abc.clf)[param]
+    end
+    rf_regressor = RandomForestRegressor(;dict_params...)
+    fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression)
+    return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1]
+end
+

diff --git a/core/MarkovProcesses.jl b/core/MarkovProcesses.jl
index 0165793..8b63974 100644
--- a/core/MarkovProcesses.jl
+++ b/core/MarkovProcesses.jl
@@ -8,12 +8,25 @@ import Base: setindex!, setproperty!, fill!, copyto!
 import Dates
 import Distributed: @everywhere, @distributed
 import Distributions: Product, Uniform, Normal
-import Distributions: Distribution, Univariate, Continuous, UnivariateDistribution,
+import Distributions: Categorical, Distribution, Univariate, Continuous,
+                      UnivariateDistribution, DiscreteUnivariateDistribution,
                       MultivariateDistribution, product_distribution
-import Distributions: insupport, isbounded, pdf
+import Distributions: insupport, isbounded, ncategories, pdf
 import FunctionWrappers: FunctionWrapper
 import Random: rand, rand!
+import ScikitLearn
+import ScikitLearn: fit!, predict, get_params
+import ScikitLearn.GridSearch: GridSearchCV
 import StaticArrays: SVector, @SVector
+# Python object imports: the sklearn random forests are loaded at module
+# initialization time, as required by PyCall.
+import PyCall: PyNULL
+const RandomForestClassifier = PyNULL()
+const RandomForestRegressor = PyNULL()
+function __init__()
+    (ScikitLearn.Skcore).import_sklearn()
+    copy!(RandomForestClassifier, (ScikitLearn.Skcore.pyimport("sklearn.ensemble")).RandomForestClassifier)
+    copy!(RandomForestRegressor, (ScikitLearn.Skcore.pyimport("sklearn.ensemble")).RandomForestRegressor)
+end
 
 ## Exports
 export Distribution, Product, Uniform, Normal
@@ -52,6 +65,7 @@ export load_model, load_automaton, load_plots
 
 # Algorithms
 export automaton_abc, abc_smc
+export abc_model_choice_dataset, rf_abc_model_choice, posterior_proba_model
 
 # About biochemical networks
 export @network_model
@@ -63,6 +77,7 @@ include("model.jl")
 include("utils.jl")
 include("network_model.jl")
 include("../algorithms/abc_smc.jl")
+include("../algorithms/abc_model_choice.jl")
 
 end
 
diff --git a/tests/abc_model_choice/toy_example.jl b/tests/abc_model_choice/toy_example.jl
new file mode 100644
index 0000000..6346711
--- /dev/null
+++ b/tests/abc_model_choice/toy_example.jl
@@ -0,0 +1,84 @@
+
+using SpecialFunctions
+using LinearAlgebra
+using Random
+using Distributions
+using MarkovProcesses
+using ScikitLearn
+@sk_import metrics: (accuracy_score, classification_report)
+
+global n = 20
+
+struct Model1 <: Model end
+struct Model2 <: Model end
+struct Model3 <: Model end
+import MarkovProcesses: simulate
+
+# Model 1: y_i ~ Exp(rate θ), θ ~ Exp(1). Julia's Exponential is parameterized
+# by its scale, hence the 1/param below so that lh_m1 matches the simulator.
+function simulate(m::Model1)
+    param = rand(Exponential(1))
+    return rand(Exponential(1/param), n)
+end
+# Model 2: y_i ~ LogNormal(μ, 1), μ ~ Normal(0, 1).
+function simulate(m::Model2)
+    param = rand(Normal())
+    return rand(LogNormal(param,1), n)
+end
+# Model 3: y_i ~ Gamma(2, rate θ), θ ~ Exp(1).
+function simulate(m::Model3)
+    param = rand(Exponential(1))
+    return rand(Gamma(2,1/param), n)
+end
+
+m1, m2, m3 = Model1(), Model2(), Model3()
+lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
+lh_m2(s) = exp(s[2]^2/(2*(n+1)) - s[3]/2 - s[2]) * (2pi)^(-n/2) * (n+1)^(-1/2)
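+# The marginal likelihoods lh_m1, lh_m2, lh_m3 are closed-form: each one is
+# the model's likelihood integrated against its parameter prior, written in
+# the summary statistics s = (sum(y), sum(log.(y)), sum(log.(y).^2)).
+# E.g. for Model 3: exp(s[2])/gamma(2)^n * ∫ θ^(2n) exp(-θ(1+s[1])) dθ
+#                 = exp(s[2]) * gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1).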
+lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
+
+ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
+# L2 (Euclidean) distance between simulated and observed summary statistics
+dist_l2(s_sim,s_obs) = norm(s_sim - s_obs)
+
+observations = simulate(m3)
+ss_observations = ss_func(observations)
+models = [m1, m2, m3]
+abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
+
+# True posterior probability of model idx_model given the summary statistics,
+# computed from the closed-form likelihoods.
+list_lh = [lh_m1, lh_m2, lh_m3]
+prob_model(ss, list_lh, idx_model) = list_lh[idx_model](ss) / sum(list_lh[i](ss) for i = eachindex(list_lh))
+prob_model(ss, idx_model) = prob_model(ss, list_lh, idx_model)
+
+#=
+using Plots
+p = plot(title="Test set")
+colors = ["black", "red", "green"]
+begin_idx = 1
+for i = 1:3
+    models_i = findall(x->x==i, abc_testset.models_indexes)
+    nbr_obs = length(models_i)
+    end_idx = begin_idx + nbr_obs - 1
+    lh = list_lh[i]
+    @show i
+    @show nbr_obs
+    @show begin_idx:end_idx
+    scatter!(p, begin_idx:end_idx, prob_model.(abc_testset.summary_stats_vector[models_i], 3),
+             color = colors[i], markersize = 3.0, markershape = :cross, label = "Model $i")
+    global begin_idx = end_idx + 1
+end
+savefig("set.svg")
+=#
+
+grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2])
+res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
+@show posterior_proba_model(res_rf_abc)
+println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X)))
+@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X))
+

-- 
GitLab