Commit 715f269f authored by Bentriou Mahmoud's avatar Bentriou Mahmoud
Browse files

Add of distributed algorithm for the reference table creation in abc model...

Add of distributed algorithm for the reference table creation in abc model choice. Documentation for abc model choice methods.
parent 888e74c1
......@@ -22,16 +22,57 @@ function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
end
end
"""
`abc_model_choice_dataset(models, models_prior,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)`
Creates a reference table for ABC model choice.
The mandatory arguments are:
* `models` is a list of objects inherited from `Model` or `ParametricModel`,
* `models_prior`: the prior over the models (by default: discrete uniform distribution)
* `summary_stats_observations` are the summary statitics of the observations,
* `summary_stats_func::Function`: the function that computes the summary statistics over a model simulation,
* `distance_func`: the distance function over the summary statistics space,
* `N_ref`: the number of samples in the reference table,
* `k`: the k nearest samples from the observations to keep in the reference table (k < N_ref).
The result is a `AbcModelChoiceDataset` with fields:
* `summary_stats_matrix`: the (N_stats, N_ref) features matrix. Accessible via `.X`.
* `models_indexes`: the labels vector. Accessible via `.y`.
If specified, `dir_results` is the directory where the summary statistics matrix and associated models are stored (CSV).
"""
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)
if nprocs() == 1
return _abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
end
return _distributed_abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
end
"""
`abc_model_choice_dataset(models, models_prior,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)`
Creates a reference table for ABC model choice with discrete uniform prior distribution over the models.
"""
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
k::Int, N_ref::Int; dir_results::Union{Nothing,String} = nothing)
nbr_models = length(models)
models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N; dir_results = dir_results)
return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref; dir_results = dir_results)
end
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
function _abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
......@@ -57,27 +98,117 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}
distances[i] = distance_func(ss_i, summary_stats_observations)
end
k_nn = sortperm(distances, alg = QuickSort)[1:k]
knn_models_indexes = models_indexes[k_nn]
knn_summary_stats_matrix = summary_stats_matrix[:,k_nn]
if dir_results != nothing
dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
if !isdir(dir_results) mkdir(dir_results) end
writedlm(dir_results * "X_abc_dataset.csv", knn_summary_stats_matrix, ',')
writedlm(dir_results * "y_abc_dataset.csv", knn_models_indexes, ',')
file_cfg = open(dir_results * "config_abc_dataset.out", "w")
write(file_cfg, "Models: $(models) \n")
write(file_cfg, "N: $(N) \n")
write(file_cfg, "k: $(k) \n")
write(file_cfg, "tolerance epsilon: $(distances[k_nn[end]]) \n")
close(file_cfg)
end
return AbcModelChoiceDataset(knn_models_indexes, knn_summary_stats_matrix, distances[k_nn[end]])
end
function _distributed_abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
@assert length(models) >= 2 "Should contain at least 2 models"
@assert ncategories(models_prior) == length(models) "Number of categories of models' prior and number of models do not equal"
models_indexes = SharedVector{Int}(N)
summary_stats_matrix = SharedMatrix{eltype(summary_stats_observations)}(length(summary_stats_observations), N)
distances = SharedVector{Float64}(N)
bool_parametric = typeof(models) <: Vector{ParametricModel}
@sync @distributed for i = 1:N
current_idx_model = rand(models_prior)
models_indexes[i] = current_idx_model
if bool_parametric
param_model = models[current_idx_model]
vec_p = rand(param_model.distribution)
sim = simulate(param_model, vec_p)
else
sim = simulate(models[current_idx_model])
end
ss_i = summary_stats_func(sim)
summary_stats_matrix[:,i] = ss_i
distances[i] = distance_func(ss_i, summary_stats_observations)
end
k_nn = sortperm(sdata(distances), alg = QuickSort)[1:k]
knn_models_indexes = sdata(models_indexes)[k_nn]
knn_summary_stats_matrix = sdata(summary_stats_matrix)[:,k_nn]
if dir_results != nothing
dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
if !isdir(dir_results) mkdir(dir_results) end
writedlm(dir_results * "X_abc_dataset.csv", knn_summary_stats_matrix, ',')
writedlm(dir_results * "y_abc_dataset.csv", knn_models_indexes, ',')
file_cfg = open(dir_results * "config_abc_dataset.out", "w")
write(file_cfg, "Models: $(models) \n")
write(file_cfg, "N: $(N) \n")
write(file_cfg, "k: $(k) \n")
write(file_cfg, "tolerance epsilon: $(distances[k_nn[end]]) \n")
close(file_cfg)
end
return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_matrix[:,k_nn], distances[k_nn[end]])
return AbcModelChoiceDataset(knn_models_indexes, knn_summary_stats_matrix, distances[k_nn[end]])
end
"""
`rf_abc_model_choice(models, summary_stats_observations,
summary_stats_func::Function, N_ref::Int;
k::Int = N_ref, distance_func::Function = (x,y) -> 1,
hyperparameters_range::Dict)`
Run the Random Forest Approximate Bayesian Computation model choice method.
The mandatory arguments are:
* `models` is a list of objects inherited from `Model` or `ParametricModel`,
* `summary_stats_observations` are the summary statitics of the observations
* `N_ref`: the number of samples in the reference table.
* `summary_stats_func::Function`: the function that computes the summary statistics over a model simulation.
The optional arguments are:
* `models_prior`: the prior over the models (by default: discrete uniform distribution)
* `k`: the k nearest samples from the observations to keep in the reference table (by default: k = N_ref)
* `distance_func`: the distance function, has to be defined if k < N_ref
* `hyperparameters_range`: a dict with the hyperparameters range values for the cross validation
fit of the Random Forest (by default: `Dict(:n_estimators => [200], :min_samples_leaf => [1], :min_samples_split => [2])`).
See scikit-learn documentation of RandomForestClassifier for the hyperparameters name.
The result is a `RandomForestABC` object with fields:
* `reference_table` an AbcModelChoiceDataset that corresponds to the reference table of the algorithm,
* `clf` a random forest classifier (PyObject from scikit-learn),
* `summary_stats_observations` are the summary statitics of the observations
* `estim_model` is the underlying model of the observations inferred with the RF-ABC method.
"""
function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
summary_stats_observations,
summary_stats_func::Function, N_ref::Int;
models_prior::DiscreteUnivariateDistribution = Categorical([1/length(models) for i = 1:length(models)]),
k::Int = N_ref, distance_func::Function = (x,y) -> 1,
hyperparameters_range::Dict = Dict(:n_estimators => [200], :min_samples_leaf => [1],
:min_samples_split => [2]))
@assert k <= N_ref
trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
trainset = abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
fit!(gridsearch, transpose(trainset.X), trainset.y)
best_rf = gridsearch.best_estimator_
return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations]))
end
"""
`posterior_proba_model(rf_abc::RandomForestABC)`
Estimates the posterior probability of the model with the Random Forest ABC method.
"""
# P(m = m^(ss_obs) | ss_obs) estimate
function posterior_proba_model(rf_abc::RandomForestABC)
oob_votes = rf_abc.clf.oob_decision_function_
......
......@@ -17,6 +17,7 @@ import Random: rand, rand!
import ScikitLearn
import ScikitLearn: fit!, predict, get_params
import ScikitLearn.GridSearch: GridSearchCV
import SharedArrays: SharedVector, SharedMatrix, sdata
import StaticArrays: SVector, @SVector
# Python objects import
import PyCall: PyNULL
......
using SpecialFunctions
using LinearAlgebra
using Random
using Distributions
using MarkovProcesses
@everywhere begin
using Distributions
using MarkovProcesses
using SpecialFunctions
using LinearAlgebra
using Random
end
using ScikitLearn
@sk_import metrics: (accuracy_score, classification_report)
global n = 20
@everywhere begin
global n = 20
struct Model1 <: Model end
struct Model2 <: Model end
struct Model3 <: Model end
import MarkovProcesses: simulate
struct Model1 <: Model end
struct Model2 <: Model end
struct Model3 <: Model end
import MarkovProcesses: simulate
function simulate(m::Model1)
param = rand(Exponential(1))
return rand(Exponential(param), n)
end
function simulate(m::Model2)
param = rand(Normal())
return rand(LogNormal(param,1), n)
end
function simulate(m::Model3)
param = rand(Exponential(1))
return rand(Gamma(2,1/param), n)
end
function simulate(m::Model1)
param = rand(Exponential(1))
return rand(Exponential(param), n)
end
function simulate(m::Model2)
param = rand(Normal())
return rand(LogNormal(param,1), n)
end
function simulate(m::Model3)
param = rand(Exponential(1))
return rand(Gamma(2,1/param), n)
end
m1, m2, m3 = Model1(), Model2(), Model3()
lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
lh_m2(s) = exp(-s[2]^2/(2n*(n+1)) - (s[3]^2)/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
lh_m2(s) = exp(-s[2]^2/(2n*(n+1)) - (s[3]^2)/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
end
m1, m2, m3 = Model1(), Model2(), Model3()
observations = simulate(m3)
ss_observations = ss_func(observations)
models = [m1, m2, m3]
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
println("Testset 10000 samples")
@timev abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 10000, 10000; dir_results = "toy_ex")
list_lh = [lh_m1, lh_m2, lh_m3]
prob_model(ss, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])
......@@ -65,9 +70,12 @@ savefig("set.svg")
=#
grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
println("RF ABC")
@timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc)
X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
return true
......@@ -80,3 +80,5 @@ X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
return true
......@@ -5,4 +5,5 @@ include("run_dist_lp.jl")
include("run_automata.jl")
include("run_cosmos.jl")
include("run_abc_smc.jl")
include("run_abc_model_choice.jl")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment