From 97df12305ac56161c5ee65dc5e93b631a70feed9 Mon Sep 17 00:00:00 2001
From: Mahmoud Bentriou <mahmoud.bentriou@centralesupelec.fr>
Date: Thu, 13 May 2021 13:21:52 +0200
Subject: [PATCH] Implementation of ABC model choice with random forest

---
 algorithms/abc_model_choice.jl        | 106 +++++++++++++++++++++++++++
 core/MarkovProcesses.jl               |  19 ++++-
 tests/abc_model_choice/toy_example.jl |  84 ++++++++++++++++++++
 3 files changed, 207 insertions(+), 2 deletions(-)
 create mode 100644 algorithms/abc_model_choice.jl
 create mode 100644 tests/abc_model_choice/toy_example.jl

diff --git a/algorithms/abc_model_choice.jl b/algorithms/abc_model_choice.jl
new file mode 100644
index 0000000..4ccdc99
--- /dev/null
+++ b/algorithms/abc_model_choice.jl
@@ -0,0 +1,106 @@
+
+# Reference table for ABC model choice: model indexes (the labels y) paired
+# with the summary statistics of the corresponding simulations (the features X).
+struct AbcModelChoiceDataset
+    models_indexes::Vector{Int}
+    summary_stats_vector::Vector
+    epsilon::Float64
+end
+
+struct RandomForestABC
+    reference_table::AbcModelChoiceDataset
+    clf
+    summary_stats_observations
+    estim_model
+end
+
+# Extend Base.getproperty so that dataset.X returns the features and
+# dataset.y the labels, following the usual machine learning notation.
+function Base.getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
+    if sym == :X
+        return dataset.summary_stats_vector
+    elseif sym == :y
+        return dataset.models_indexes
+    else
+        return getfield(dataset, sym)
+    end
+end
+
+function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
+                                  summary_stats_observations,
+                                  summary_stats_func::Function, distance_func::Function,
+                                  k::Int, N::Int)
+    nbr_models = length(models)
+    models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
+    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N)
+end
+
+function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
+                                  summary_stats_observations,
+                                  summary_stats_func::Function, distance_func::Function,
+                                  k::Int, N::Int)
+    @assert length(models) >= 2 "`models` must contain at least two models"
+    @assert ncategories(models_prior) == length(models) "The number of categories of the models' prior must equal the number of models"
+
+    models_indexes = zeros(Int, N)
+    summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N)
+    distances = zeros(N)
+    bool_parametric = typeof(models) <: Vector{ParametricModel}
+    for i = 1:N
+        # Draw a model index from the prior and simulate the corresponding model.
+        current_idx_model = rand(models_prior)
+        models_indexes[i] = current_idx_model
+        if bool_parametric
+            param_model = models[current_idx_model]
+            vec_p = rand(param_model.distribution)
+            sim = simulate(param_model, vec_p)
+        else
+            sim = simulate(models[current_idx_model])
+        end
+        summary_stats_vector[i] = summary_stats_func(sim)
+        distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations)
+    end
+    # Keep the k simulations nearest to the observed summary statistics;
+    # epsilon is the largest accepted distance.
+    k_nn = sortperm(distances, alg = QuickSort)[1:k]
+
+    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_vector[k_nn], distances[k_nn[end]])
+end
+
+function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
+                             summary_stats_observations,
+                             summary_stats_func::Function, N_ref::Int;
+                             k::Int = N_ref, distance_func::Function = (x,y) -> 1,
+                             hyperparameters_range::Dict = Dict(:n_estimators => [200], :min_samples_leaf => [1, 2],
+                                                                :min_samples_split => [2,5]))
+    @assert k <= N_ref
+    # With the defaults k = N_ref and a constant distance, no rejection occurs:
+    # the whole reference table is used to train the classifier.
+    trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
+    gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
+    fit!(gridsearch, trainset.X, trainset.y)
+    best_rf = gridsearch.best_estimator_
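+    # Bundle the reference table, the fitted forest, the observed summary
+    # statistics and the model index predicted for them (estim_model).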
+    return RandomForestABC(trainset, best_rf, summary_stats_observations,
+                           predict(best_rf, [summary_stats_observations]))
+end
+
+# Estimates P(m = m^(ss_obs) | ss_obs): the posterior probability of the model
+# selected by the random forest, obtained by regressing the classifier's
+# out-of-bag errors on the summary statistics (Pudlo et al., 2016).
+function posterior_proba_model(rf_abc::RandomForestABC)
+    oob_votes = rf_abc.clf.oob_decision_function_
+    y_pred_oob = argmax.([oob_votes[i,:] for i = 1:size(oob_votes, 1)])
+    # 1 if the i-th simulation is misclassified out-of-bag, 0 otherwise
+    y_oob_regression = y_pred_oob .!= rf_abc.reference_table.y
+    # Train the regressor with the same hyperparameters as the classifier.
+    dict_params = Dict()
+    for param in ["n_estimators", "min_samples_leaf", "min_samples_split", "oob_score"]
+        dict_params[Symbol(param)] = get_params(rf_abc.clf)[param]
+    end
+    rf_regressor = RandomForestRegressor(;dict_params...)
+    fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression)
+    return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1]
+end
+

diff --git a/core/MarkovProcesses.jl b/core/MarkovProcesses.jl
index 0165793..8b63974 100644
--- a/core/MarkovProcesses.jl
+++ b/core/MarkovProcesses.jl
@@ -8,12 +8,25 @@ import Base: setindex!, setproperty!, fill!, copyto!
 import Dates
 import Distributed: @everywhere, @distributed
 import Distributions: Product, Uniform, Normal
-import Distributions: Distribution, Univariate, Continuous, UnivariateDistribution,
+import Distributions: Categorical, Distribution, Univariate, Continuous,
+                      UnivariateDistribution, DiscreteUnivariateDistribution,
                       MultivariateDistribution, product_distribution
-import Distributions: insupport, isbounded, pdf
+import Distributions: insupport, isbounded, ncategories, pdf
 import FunctionWrappers: FunctionWrapper
 import Random: rand, rand!
+import ScikitLearn
+import ScikitLearn: fit!, predict, get_params
+import ScikitLearn.GridSearch: GridSearchCV
 import StaticArrays: SVector, @SVector
+# Python object imports: the sklearn random forests are loaded at module
+# initialization time, as required by PyCall.
+import PyCall: PyNULL
+const RandomForestClassifier = PyNULL()
+const RandomForestRegressor = PyNULL()
+function __init__()
+    (ScikitLearn.Skcore).import_sklearn()
+    copy!(RandomForestClassifier, (ScikitLearn.Skcore.pyimport("sklearn.ensemble")).RandomForestClassifier)
+    copy!(RandomForestRegressor, (ScikitLearn.Skcore.pyimport("sklearn.ensemble")).RandomForestRegressor)
+end
 
 ## Exports
 export Distribution, Product, Uniform, Normal
@@ -52,6 +65,7 @@ export load_model, load_automaton, load_plots
 
 # Algorithms
 export automaton_abc, abc_smc
+export abc_model_choice_dataset, rf_abc_model_choice, posterior_proba_model
 
 # About biochemical networks
 export @network_model
@@ -63,6 +77,7 @@ include("model.jl")
 include("utils.jl")
 include("network_model.jl")
 include("../algorithms/abc_smc.jl")
+include("../algorithms/abc_model_choice.jl")
 
 end
 
diff --git a/tests/abc_model_choice/toy_example.jl b/tests/abc_model_choice/toy_example.jl
new file mode 100644
index 0000000..6346711
--- /dev/null
+++ b/tests/abc_model_choice/toy_example.jl
@@ -0,0 +1,84 @@
+
+using SpecialFunctions
+using LinearAlgebra
+using Random
+using Distributions
+using MarkovProcesses
+using ScikitLearn
+@sk_import metrics: (accuracy_score, classification_report)
+
+global n = 20
+
+struct Model1 <: Model end
+struct Model2 <: Model end
+struct Model3 <: Model end
+import MarkovProcesses: simulate
+
+# Model 1: y_i ~ Exp(rate θ), θ ~ Exp(1). Julia's Exponential is parameterized
+# by its scale, hence the 1/param below so that lh_m1 matches the simulator.
+function simulate(m::Model1)
+    param = rand(Exponential(1))
+    return rand(Exponential(1/param), n)
+end
+# Model 2: y_i ~ LogNormal(μ, 1), μ ~ Normal(0, 1).
+function simulate(m::Model2)
+    param = rand(Normal())
+    return rand(LogNormal(param,1), n)
+end
+# Model 3: y_i ~ Gamma(2, rate θ), θ ~ Exp(1).
+function simulate(m::Model3)
+    param = rand(Exponential(1))
+    return rand(Gamma(2,1/param), n)
+end
+
+m1, m2, m3 = Model1(), Model2(), Model3()
+lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
+lh_m2(s) = exp(s[2]^2/(2*(n+1)) - s[3]/2 - s[2]) * (2pi)^(-n/2) * (n+1)^(-1/2)
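+# The marginal likelihoods lh_m1, lh_m2, lh_m3 are closed-form: each one is
+# the model's likelihood integrated against its parameter prior, written in
+# the summary statistics s = (sum(y), sum(log.(y)), sum(log.(y).^2)).
+# E.g. for Model 3: exp(s[2])/gamma(2)^n * ∫ θ^(2n) exp(-θ(1+s[1])) dθ
+#                 = exp(s[2]) * gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1).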
+lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
+
+ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
+# L2 (Euclidean) distance between simulated and observed summary statistics
+dist_l2(s_sim,s_obs) = norm(s_sim - s_obs)
+
+observations = simulate(m3)
+ss_observations = ss_func(observations)
+models = [m1, m2, m3]
+abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
+
+# True posterior probability of model idx_model given the summary statistics,
+# computed from the closed-form likelihoods.
+list_lh = [lh_m1, lh_m2, lh_m3]
+prob_model(ss, list_lh, idx_model) = list_lh[idx_model](ss) / sum(list_lh[i](ss) for i = eachindex(list_lh))
+prob_model(ss, idx_model) = prob_model(ss, list_lh, idx_model)
+
+#=
+using Plots
+p = plot(title="Test set")
+colors = ["black", "red", "green"]
+begin_idx = 1
+for i = 1:3
+    models_i = findall(x->x==i, abc_testset.models_indexes)
+    nbr_obs = length(models_i)
+    end_idx = begin_idx + nbr_obs - 1
+    lh = list_lh[i]
+    @show i
+    @show nbr_obs
+    @show begin_idx:end_idx
+    scatter!(p, begin_idx:end_idx, prob_model.(abc_testset.summary_stats_vector[models_i], 3),
+             color = colors[i], markersize = 3.0, markershape = :cross, label = "Model $i")
+    global begin_idx = end_idx + 1
+end
+savefig("set.svg")
+=#
+
+grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2])
+res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
+@show posterior_proba_model(res_rf_abc)
+println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X)))
+@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X))
+

-- 
GitLab