Add of distributed algorithm for the reference table creation in abc model...

Add of distributed algorithm for the reference table creation in abc model choice. Documentation for abc model choice methods.

Add of distributed algorithm for the reference table creation in abc model...
715f269f · Bentriou Mahmoud · 888e74c1 · 715f269f · 715f269f · 715f269f
Commit 715f269f authored 4 years ago by Bentriou Mahmoud
--- a/algorithms/abc_model_choice.jl
+++ b/algorithms/abc_model_choice.jl
@@ -22,16 +22,57 @@ function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
    end
 end
+"""
+    `abc_model_choice_dataset(models, models_prior,
+                              summary_stats_observations,
+                              summary_stats_func::Function, distance_func::Function,
+                              k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)`
+Creates a reference table for ABC model choice.
+The mandatory arguments are:
+* `models` is a list of objects inherited from `Model` or `ParametricModel`,
+* `models_prior`: the prior over the models (by default: discrete uniform distribution)
+* `summary_stats_observations` are the summary statitics of the observations,
+* `summary_stats_func::Function`: the function that computes the summary statistics over a model simulation,
+* `distance_func`: the distance function over the summary statistics space,
+* `N_ref`: the number of samples in the reference table,
+* `k`: the k nearest samples from the observations to keep in the reference table (k < N_ref).
+The result is a `AbcModelChoiceDataset` with fields:
+* `summary_stats_matrix`: the (N_stats, N_ref) features matrix. Accessible via `.X`.
+* `models_indexes`: the labels vector. Accessible via `.y`.
+If specified, `dir_results` is the directory where the summary statistics matrix and associated models are stored (CSV).
+"""
+function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
+                                  summary_stats_observations,
+                                  summary_stats_func::Function, distance_func::Function,
+                                  k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)
+    if nprocs() == 1
+        return _abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
+    end
+    return _distributed_abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
+end
+"""
+    `abc_model_choice_dataset(models, models_prior,
+                              summary_stats_observations,
+                              summary_stats_func::Function, distance_func::Function,
+                              k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)`
+Creates a reference table for ABC model choice with discrete uniform prior distribution over the models.
+"""
 function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
                                  summary_stats_observations,
                                  summary_stats_func::Function, distance_func::Function,
-                                  k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
+                                  k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)
    nbr_models = length(models)
    models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
-    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N; dir_results = dir_results)
+    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
 end
-function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
+function _abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
                                  summary_stats_observations,
                                  summary_stats_func::Function, distance_func::Function,
                                  k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
@@ -57,27 +98,117 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}
        distances[i] = distance_func(ss_i, summary_stats_observations)
    end
    k_nn = sortperm(distances, alg = QuickSort)[1:k]
+    knn_models_indexes = models_indexes[k_nn]
+    knn_summary_stats_matrix = summary_stats_matrix[:,k_nn]
+    if dir_results != nothing
+        dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
+        if !isdir(dir_results) mkdir(dir_results) end
+        writedlm(dir_results * "X_abc_dataset.csv", knn_summary_stats_matrix, ',')
+        writedlm(dir_results * "y_abc_dataset.csv", knn_models_indexes, ',')
+        file_cfg = open(dir_results * "config_abc_dataset.out", "w")
+        write(file_cfg, "Models: $(models) \n")
+        write(file_cfg, "N: $(N) \n")
+        write(file_cfg, "k: $(k) \n")
+        write(file_cfg, "tolerance epsilon: $(distances[k_nn[end]]) \n")
+        close(file_cfg)
+    end
+    return AbcModelChoiceDataset(knn_models_indexes, knn_summary_stats_matrix, distances[k_nn[end]])
+end
+function _distributed_abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
+                                              summary_stats_observations,
+                                              summary_stats_func::Function, distance_func::Function,
+                                              k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
+    @assert length(models) >= 2 "Should contain at least 2 models"
+    @assert ncategories(models_prior) == length(models) "Number of categories of models' prior and number of models do not equal"
+    models_indexes = SharedVector{Int}(N)
+    summary_stats_matrix = SharedMatrix{eltype(summary_stats_observations)}(length(summary_stats_observations), N)
+    distances = SharedVector{Float64}(N)
+    bool_parametric = typeof(models) <: Vector{ParametricModel} 
+    @sync @distributed for i = 1:N
+        current_idx_model = rand(models_prior)
+        models_indexes[i] = current_idx_model
+        if bool_parametric
+            param_model = models[current_idx_model]
+            vec_p = rand(param_model.distribution)
+            sim = simulate(param_model, vec_p)
+        else
+            sim = simulate(models[current_idx_model])
+        end
+        ss_i = summary_stats_func(sim)
+        summary_stats_matrix[:,i] = ss_i 
+        distances[i] = distance_func(ss_i, summary_stats_observations)
+    end
+    k_nn = sortperm(sdata(distances), alg = QuickSort)[1:k]
+    knn_models_indexes = sdata(models_indexes)[k_nn]
+    knn_summary_stats_matrix = sdata(summary_stats_matrix)[:,k_nn]
    if dir_results != nothing
        dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
+        if !isdir(dir_results) mkdir(dir_results) end
+        writedlm(dir_results * "X_abc_dataset.csv", knn_summary_stats_matrix, ',')
+        writedlm(dir_results * "y_abc_dataset.csv", knn_models_indexes, ',')
+        file_cfg = open(dir_results * "config_abc_dataset.out", "w")
+        write(file_cfg, "Models: $(models) \n")
+        write(file_cfg, "N: $(N) \n")
+        write(file_cfg, "k: $(k) \n")
+        write(file_cfg, "tolerance epsilon: $(distances[k_nn[end]]) \n")
+        close(file_cfg)
    end
-    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_matrix[:,k_nn], distances[k_nn[end]])
+    return AbcModelChoiceDataset(knn_models_indexes, knn_summary_stats_matrix, distances[k_nn[end]])
 end
+"""
+    `rf_abc_model_choice(models, summary_stats_observations,
+                         summary_stats_func::Function, N_ref::Int;
+                         k::Int = N_ref, distance_func::Function = (x,y) -> 1, 
+                         hyperparameters_range::Dict)`
+Run the Random Forest Approximate Bayesian Computation model choice method.
+The mandatory arguments are:
+* `models` is a list of objects inherited from `Model` or `ParametricModel`,
+* `summary_stats_observations` are the summary statitics of the observations
+* `N_ref`: the number of samples in the reference table.
+* `summary_stats_func::Function`: the function that computes the summary statistics over a model simulation.
+The optional arguments are:
+* `models_prior`: the prior over the models (by default: discrete uniform distribution)
+* `k`: the k nearest samples from the observations to keep in the reference table (by default: k = N_ref)
+* `distance_func`: the distance function, has to be defined if k < N_ref
+* `hyperparameters_range`: a dict with the hyperparameters range values for the cross validation
+fit of the Random Forest (by default: `Dict(:n_estimators => [200], :min_samples_leaf => [1], :min_samples_split => [2])`).
+See scikit-learn documentation of RandomForestClassifier for the hyperparameters name.
+The result is a `RandomForestABC` object with fields:
+* `reference_table` an AbcModelChoiceDataset that corresponds to the reference table of the algorithm, 
+* `clf` a random forest classifier (PyObject from scikit-learn),
+* `summary_stats_observations` are the summary statitics of the observations
+* `estim_model` is the underlying model of the observations inferred with the RF-ABC method.
+"""
 function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
                             summary_stats_observations,
                             summary_stats_func::Function, N_ref::Int;
+                             models_prior::DiscreteUnivariateDistribution = Categorical([1/length(models) for i = 1:length(models)]),
                             k::Int = N_ref, distance_func::Function = (x,y) -> 1, 
                             hyperparameters_range::Dict = Dict(:n_estimators => [200], :min_samples_leaf => [1],
                                                                :min_samples_split => [2]))
    @assert k <= N_ref
-    trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
+    trainset = abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
    gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
    fit!(gridsearch, transpose(trainset.X), trainset.y)
    best_rf = gridsearch.best_estimator_
    return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations]))
 end
+"""
+    `posterior_proba_model(rf_abc::RandomForestABC)`
+Estimates the posterior probability of the model with the Random Forest ABC method.
+"""
 # P(m = m^(ss_obs) | ss_obs) estimate
 function posterior_proba_model(rf_abc::RandomForestABC)
    oob_votes = rf_abc.clf.oob_decision_function_

--- a/core/MarkovProcesses.jl
+++ b/core/MarkovProcesses.jl
@@ -17,6 +17,7 @@ import Random: rand, rand!
 import ScikitLearn
 import ScikitLearn: fit!, predict, get_params
 import ScikitLearn.GridSearch: GridSearchCV
+import SharedArrays: SharedVector, SharedMatrix, sdata
 import StaticArrays: SVector, @SVector
 # Python objects import
 import PyCall: PyNULL

--- a/tests/abc_model_choice/toy_example.jl
+++ b/tests/abc_model_choice/toy_example.jl
-using SpecialFunctions
+@everywhere begin
-using LinearAlgebra
+    using Distributions
-using Random
+    using MarkovProcesses
-using Distributions
+    using SpecialFunctions
-using MarkovProcesses
+    using LinearAlgebra
+    using Random
+end
 using ScikitLearn
 @sk_import metrics: (accuracy_score, classification_report)
-global n = 20
+@everywhere begin
+    global n = 20
-struct Model1 <: Model end
+    struct Model1 <: Model end
-struct Model2 <: Model end
+    struct Model2 <: Model end
-struct Model3 <: Model end
+    struct Model3 <: Model end
-import MarkovProcesses: simulate
+    import MarkovProcesses: simulate
-function simulate(m::Model1)
+    function simulate(m::Model1)
-    param = rand(Exponential(1))
+        param = rand(Exponential(1))
-    return rand(Exponential(param), n)
+        return rand(Exponential(param), n)
-end
+    end
-function simulate(m::Model2)
+    function simulate(m::Model2)
-    param = rand(Normal())
+        param = rand(Normal())
-    return rand(LogNormal(param,1), n)
+        return rand(LogNormal(param,1), n)
-end
+    end
-function simulate(m::Model3)
+    function simulate(m::Model3)
-    param = rand(Exponential(1))
+        param = rand(Exponential(1))
-    return rand(Gamma(2,1/param), n)
+        return rand(Gamma(2,1/param), n)
-end
+    end
-m1, m2, m3 = Model1(), Model2(), Model3()
+    lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
-lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
+    lh_m2(s) = exp(-s[2]^2/(2n*(n+1)) - (s[3]^2)/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
-lh_m2(s) = exp(-s[2]^2/(2n*(n+1)) - (s[3]^2)/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
+    lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
-lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
-ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
+    ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
-dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
+    dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
+end
+m1, m2, m3 = Model1(), Model2(), Model3()
 observations = simulate(m3)
 ss_observations = ss_func(observations)
 models = [m1, m2, m3]
-abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
+println("Testset 10000 samples")
+@timev abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 10000, 10000; dir_results = "toy_ex")
 list_lh = [lh_m1, lh_m2, lh_m3]
 prob_model(ss, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])
@@ -65,9 +70,12 @@ savefig("set.svg")
 =#
 grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
+println("RF ABC")
 @timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
 @show posterior_proba_model(res_rf_abc)
 X_testset = transpose(abc_testset.X)
 println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
 @show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
+return true
--- a/tests/abc_model_choice/toy_example_ma.jl
+++ b/tests/abc_model_choice/toy_example_ma.jl
@@ -80,3 +80,5 @@ X_testset = transpose(abc_testset.X)
 println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
 @show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
+return true
--- a/tests/run_all.jl
+++ b/tests/run_all.jl
@@ -5,4 +5,5 @@ include("run_dist_lp.jl")
 include("run_automata.jl")
 include("run_cosmos.jl")
 include("run_abc_smc.jl")
+include("run_abc_model_choice.jl")