Skip to content
Snippets Groups Projects
Commit 715f269f authored by Bentriou Mahmoud's avatar Bentriou Mahmoud
Browse files

Add of distributed algorithm for the reference table creation in abc model...

Add of distributed algorithm for the reference table creation in abc model choice. Documentation for abc model choice methods.
parent 888e74c1
No related branches found
No related tags found
No related merge requests found
...@@ -22,16 +22,57 @@ function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol) ...@@ -22,16 +22,57 @@ function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
end end
end end
"""
`abc_model_choice_dataset(models, models_prior,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)`
Creates a reference table for ABC model choice.
The mandatory arguments are:
* `models` is a list of objects inherited from `Model` or `ParametricModel`,
* `models_prior`: the prior over the models (by default: discrete uniform distribution)
* `summary_stats_observations` are the summary statitics of the observations,
* `summary_stats_func::Function`: the function that computes the summary statistics over a model simulation,
* `distance_func`: the distance function over the summary statistics space,
* `N_ref`: the number of samples in the reference table,
* `k`: the k nearest samples from the observations to keep in the reference table (k < N_ref).
The result is a `AbcModelChoiceDataset` with fields:
* `summary_stats_matrix`: the (N_stats, N_ref) features matrix. Accessible via `.X`.
* `models_indexes`: the labels vector. Accessible via `.y`.
If specified, `dir_results` is the directory where the summary statistics matrix and associated models are stored (CSV).
"""
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)
if nprocs() == 1
return _abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
end
return _distributed_abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
end
"""
`abc_model_choice_dataset(models, models_prior,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)`
Creates a reference table for ABC model choice with discrete uniform prior distribution over the models.
"""
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
summary_stats_observations, summary_stats_observations,
summary_stats_func::Function, distance_func::Function, summary_stats_func::Function, distance_func::Function,
k::Int, N::Int; dir_results::Union{Nothing,String} = nothing) k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)
nbr_models = length(models) nbr_models = length(models)
models_prior = Categorical([1/nbr_models for i = 1:nbr_models]) models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N; dir_results = dir_results) return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
end end
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution, function _abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
summary_stats_observations, summary_stats_observations,
summary_stats_func::Function, distance_func::Function, summary_stats_func::Function, distance_func::Function,
k::Int, N::Int; dir_results::Union{Nothing,String} = nothing) k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
...@@ -57,27 +98,117 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}} ...@@ -57,27 +98,117 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}
distances[i] = distance_func(ss_i, summary_stats_observations) distances[i] = distance_func(ss_i, summary_stats_observations)
end end
k_nn = sortperm(distances, alg = QuickSort)[1:k] k_nn = sortperm(distances, alg = QuickSort)[1:k]
knn_models_indexes = models_indexes[k_nn]
knn_summary_stats_matrix = summary_stats_matrix[:,k_nn]
if dir_results != nothing
dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
if !isdir(dir_results) mkdir(dir_results) end
writedlm(dir_results * "X_abc_dataset.csv", knn_summary_stats_matrix, ',')
writedlm(dir_results * "y_abc_dataset.csv", knn_models_indexes, ',')
file_cfg = open(dir_results * "config_abc_dataset.out", "w")
write(file_cfg, "Models: $(models) \n")
write(file_cfg, "N: $(N) \n")
write(file_cfg, "k: $(k) \n")
write(file_cfg, "tolerance epsilon: $(distances[k_nn[end]]) \n")
close(file_cfg)
end
return AbcModelChoiceDataset(knn_models_indexes, knn_summary_stats_matrix, distances[k_nn[end]])
end
function _distributed_abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
@assert length(models) >= 2 "Should contain at least 2 models"
@assert ncategories(models_prior) == length(models) "Number of categories of models' prior and number of models do not equal"
models_indexes = SharedVector{Int}(N)
summary_stats_matrix = SharedMatrix{eltype(summary_stats_observations)}(length(summary_stats_observations), N)
distances = SharedVector{Float64}(N)
bool_parametric = typeof(models) <: Vector{ParametricModel}
@sync @distributed for i = 1:N
current_idx_model = rand(models_prior)
models_indexes[i] = current_idx_model
if bool_parametric
param_model = models[current_idx_model]
vec_p = rand(param_model.distribution)
sim = simulate(param_model, vec_p)
else
sim = simulate(models[current_idx_model])
end
ss_i = summary_stats_func(sim)
summary_stats_matrix[:,i] = ss_i
distances[i] = distance_func(ss_i, summary_stats_observations)
end
k_nn = sortperm(sdata(distances), alg = QuickSort)[1:k]
knn_models_indexes = sdata(models_indexes)[k_nn]
knn_summary_stats_matrix = sdata(summary_stats_matrix)[:,k_nn]
if dir_results != nothing if dir_results != nothing
dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
if !isdir(dir_results) mkdir(dir_results) end
writedlm(dir_results * "X_abc_dataset.csv", knn_summary_stats_matrix, ',')
writedlm(dir_results * "y_abc_dataset.csv", knn_models_indexes, ',')
file_cfg = open(dir_results * "config_abc_dataset.out", "w")
write(file_cfg, "Models: $(models) \n")
write(file_cfg, "N: $(N) \n")
write(file_cfg, "k: $(k) \n")
write(file_cfg, "tolerance epsilon: $(distances[k_nn[end]]) \n")
close(file_cfg)
end end
return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_matrix[:,k_nn], distances[k_nn[end]])
return AbcModelChoiceDataset(knn_models_indexes, knn_summary_stats_matrix, distances[k_nn[end]])
end end
"""
`rf_abc_model_choice(models, summary_stats_observations,
summary_stats_func::Function, N_ref::Int;
k::Int = N_ref, distance_func::Function = (x,y) -> 1,
hyperparameters_range::Dict)`
Run the Random Forest Approximate Bayesian Computation model choice method.
The mandatory arguments are:
* `models` is a list of objects inherited from `Model` or `ParametricModel`,
* `summary_stats_observations` are the summary statitics of the observations
* `N_ref`: the number of samples in the reference table.
* `summary_stats_func::Function`: the function that computes the summary statistics over a model simulation.
The optional arguments are:
* `models_prior`: the prior over the models (by default: discrete uniform distribution)
* `k`: the k nearest samples from the observations to keep in the reference table (by default: k = N_ref)
* `distance_func`: the distance function, has to be defined if k < N_ref
* `hyperparameters_range`: a dict with the hyperparameters range values for the cross validation
fit of the Random Forest (by default: `Dict(:n_estimators => [200], :min_samples_leaf => [1], :min_samples_split => [2])`).
See scikit-learn documentation of RandomForestClassifier for the hyperparameters name.
The result is a `RandomForestABC` object with fields:
* `reference_table` an AbcModelChoiceDataset that corresponds to the reference table of the algorithm,
* `clf` a random forest classifier (PyObject from scikit-learn),
* `summary_stats_observations` are the summary statitics of the observations
* `estim_model` is the underlying model of the observations inferred with the RF-ABC method.
"""
function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}}, function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
summary_stats_observations, summary_stats_observations,
summary_stats_func::Function, N_ref::Int; summary_stats_func::Function, N_ref::Int;
models_prior::DiscreteUnivariateDistribution = Categorical([1/length(models) for i = 1:length(models)]),
k::Int = N_ref, distance_func::Function = (x,y) -> 1, k::Int = N_ref, distance_func::Function = (x,y) -> 1,
hyperparameters_range::Dict = Dict(:n_estimators => [200], :min_samples_leaf => [1], hyperparameters_range::Dict = Dict(:n_estimators => [200], :min_samples_leaf => [1],
:min_samples_split => [2])) :min_samples_split => [2]))
@assert k <= N_ref @assert k <= N_ref
trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref) trainset = abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range) gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
fit!(gridsearch, transpose(trainset.X), trainset.y) fit!(gridsearch, transpose(trainset.X), trainset.y)
best_rf = gridsearch.best_estimator_ best_rf = gridsearch.best_estimator_
return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations])) return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations]))
end end
"""
`posterior_proba_model(rf_abc::RandomForestABC)`
Estimates the posterior probability of the model with the Random Forest ABC method.
"""
# P(m = m^(ss_obs) | ss_obs) estimate # P(m = m^(ss_obs) | ss_obs) estimate
function posterior_proba_model(rf_abc::RandomForestABC) function posterior_proba_model(rf_abc::RandomForestABC)
oob_votes = rf_abc.clf.oob_decision_function_ oob_votes = rf_abc.clf.oob_decision_function_
......
...@@ -17,6 +17,7 @@ import Random: rand, rand! ...@@ -17,6 +17,7 @@ import Random: rand, rand!
import ScikitLearn import ScikitLearn
import ScikitLearn: fit!, predict, get_params import ScikitLearn: fit!, predict, get_params
import ScikitLearn.GridSearch: GridSearchCV import ScikitLearn.GridSearch: GridSearchCV
import SharedArrays: SharedVector, SharedMatrix, sdata
import StaticArrays: SVector, @SVector import StaticArrays: SVector, @SVector
# Python objects import # Python objects import
import PyCall: PyNULL import PyCall: PyNULL
......
using SpecialFunctions @everywhere begin
using LinearAlgebra using Distributions
using Random using MarkovProcesses
using Distributions using SpecialFunctions
using MarkovProcesses using LinearAlgebra
using Random
end
using ScikitLearn using ScikitLearn
@sk_import metrics: (accuracy_score, classification_report) @sk_import metrics: (accuracy_score, classification_report)
global n = 20 @everywhere begin
global n = 20
struct Model1 <: Model end struct Model1 <: Model end
struct Model2 <: Model end struct Model2 <: Model end
struct Model3 <: Model end struct Model3 <: Model end
import MarkovProcesses: simulate import MarkovProcesses: simulate
function simulate(m::Model1) function simulate(m::Model1)
param = rand(Exponential(1)) param = rand(Exponential(1))
return rand(Exponential(param), n) return rand(Exponential(param), n)
end end
function simulate(m::Model2) function simulate(m::Model2)
param = rand(Normal()) param = rand(Normal())
return rand(LogNormal(param,1), n) return rand(LogNormal(param,1), n)
end end
function simulate(m::Model3) function simulate(m::Model3)
param = rand(Exponential(1)) param = rand(Exponential(1))
return rand(Gamma(2,1/param), n) return rand(Gamma(2,1/param), n)
end end
m1, m2, m3 = Model1(), Model2(), Model3() lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1])) lh_m2(s) = exp(-s[2]^2/(2n*(n+1)) - (s[3]^2)/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
lh_m2(s) = exp(-s[2]^2/(2n*(n+1)) - (s[3]^2)/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2) lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)] ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
dist_l2(s_sim,s_obs) = norm(s_sim-s_obs) dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
end
m1, m2, m3 = Model1(), Model2(), Model3()
observations = simulate(m3) observations = simulate(m3)
ss_observations = ss_func(observations) ss_observations = ss_func(observations)
models = [m1, m2, m3] models = [m1, m2, m3]
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000) println("Testset 10000 samples")
@timev abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 10000, 10000; dir_results = "toy_ex")
list_lh = [lh_m1, lh_m2, lh_m3] list_lh = [lh_m1, lh_m2, lh_m3]
prob_model(ss, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)]) prob_model(ss, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])
...@@ -65,9 +70,12 @@ savefig("set.svg") ...@@ -65,9 +70,12 @@ savefig("set.svg")
=# =#
grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8]) grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
println("RF ABC")
@timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid) @timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc) @show posterior_proba_model(res_rf_abc)
X_testset = transpose(abc_testset.X) X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset))) println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset)) @show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
return true
...@@ -80,3 +80,5 @@ X_testset = transpose(abc_testset.X) ...@@ -80,3 +80,5 @@ X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset))) println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset)) @show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
return true
...@@ -5,4 +5,5 @@ include("run_dist_lp.jl") ...@@ -5,4 +5,5 @@ include("run_dist_lp.jl")
include("run_automata.jl") include("run_automata.jl")
include("run_cosmos.jl") include("run_cosmos.jl")
include("run_abc_smc.jl") include("run_abc_smc.jl")
include("run_abc_model_choice.jl")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment