Commit 97df1230 authored by Bentriou Mahmoud

Implementation of ABC model choice with random forest

parent 26228335
struct AbcModelChoiceDataset
    models_indexes::Vector{Int}
    summary_stats_vector::Vector
    epsilon::Float64
end

struct RandomForestABC
    reference_table::AbcModelChoiceDataset
    clf
    summary_stats_observations
    estim_model
end

function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
    if sym == :X
        return dataset.summary_stats_vector
    elseif sym == :y
        return dataset.models_indexes
    else
        return getfield(dataset, sym)
    end
end
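# The :X/:y aliases above let a dataset be passed directly to scikit-learn
# style calls, e.g. fit!(clf, dataset.X, dataset.y): X holds the summary
# statistics and y the model indexes.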
# Builds a reference table with a uniform prior over the candidate models.
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
                                  summary_stats_observations,
                                  summary_stats_func::Function, distance_func::Function,
                                  k::Int, N::Int)
    nbr_models = length(models)
    models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
    return abc_model_choice_dataset(models, models_prior, summary_stats_observations,
                                    summary_stats_func, distance_func, k, N)
end
# Builds a reference table of N simulations and keeps the k whose summary
# statistics are nearest to the observed ones.
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
                                  models_prior::DiscreteUnivariateDistribution,
                                  summary_stats_observations,
                                  summary_stats_func::Function, distance_func::Function,
                                  k::Int, N::Int)
    @assert length(models) >= 2 "Should contain at least 2 models"
    @assert ncategories(models_prior) == length(models) "The number of categories of the models' prior must equal the number of models"
    models_indexes = zeros(Int, N)
    summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N)
    distances = zeros(N)
    bool_parametric = typeof(models) <: Vector{ParametricModel}
    for i = 1:N
        # Draw a model index from the prior and simulate from that model.
        current_idx_model = rand(models_prior)
        models_indexes[i] = current_idx_model
        if bool_parametric
            param_model = models[current_idx_model]
            vec_p = rand(param_model.distribution)
            sim = simulate(param_model, vec_p)
        else
            sim = simulate(models[current_idx_model])
        end
        summary_stats_vector[i] = summary_stats_func(sim)
        distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations)
    end
    # Keep the k nearest simulations; epsilon is the distance of the k-th one.
    k_nn = sortperm(distances, alg = QuickSort)[1:k]
    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_vector[k_nn], distances[k_nn[end]])
end
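#= Hedged usage sketch (illustrative names): the overload above can be called
   directly with a non-uniform model prior, e.g. to over-represent the first
   of two simulable models `m1`, `m2` in the reference table:
prior = Categorical([2/3, 1/3])
ref_table = abc_model_choice_dataset([m1, m2], prior, ss_obs, ss_func, dist_l2, 500, 1000)
=#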
# Trains a random forest classifier on a reference table. By default the
# distance is constant and k = N_ref, i.e. no rejection step is performed.
function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
                             summary_stats_observations,
                             summary_stats_func::Function, N_ref::Int;
                             k::Int = N_ref, distance_func::Function = (x,y) -> 1,
                             hyperparameters_range::Dict = Dict(:n_estimators => [200], :min_samples_leaf => [1, 2],
                                                                :min_samples_split => [2, 5]))
    @assert k <= N_ref
    trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
    # Tune the forest by grid search; oob_score is required later by
    # posterior_proba_model.
    gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
    fit!(gridsearch, trainset.X, trainset.y)
    best_rf = gridsearch.best_estimator_
    return RandomForestABC(trainset, best_rf, summary_stats_observations,
                           predict(best_rf, [summary_stats_observations]))
end
# Estimate of P(m = m^(ss_obs) | ss_obs), the posterior probability of the model
# selected for the observations: a regression forest is fit to the out-of-bag
# misclassification indicator of the classifier over the reference table
# (cf. Pudlo et al., 2016).
function posterior_proba_model(rf_abc::RandomForestABC)
    # Out-of-bag class votes give an OOB prediction for each reference point.
    oob_votes = rf_abc.clf.oob_decision_function_
    y_pred_oob = argmax.([oob_votes[i,:] for i = 1:size(oob_votes)[1]])
    y_oob_regression = y_pred_oob .!= rf_abc.reference_table.y
    # Reuse the classifier's hyperparameters for the regression forest.
    dict_params = Dict()
    for param in ["n_estimators", "min_samples_leaf", "min_samples_split", "oob_score"]
        dict_params[Symbol(param)] = get_params(rf_abc.clf)[param]
    end
    rf_regressor = RandomForestRegressor(;dict_params...)
    fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression)
    return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1]
end
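#= Hedged usage sketch (illustrative names): given simulable models `m1`, `m2`
   and observed summary statistics `ss_obs`,
res = rf_abc_model_choice([m1, m2], ss_obs, ss_func, 10_000)
res.estim_model             # model predicted for the observations
posterior_proba_model(res)  # estimated posterior probability of that choice
=#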
@@ -8,12 +8,25 @@ import Base: setindex!, setproperty!, fill!, copyto!
import Dates
import Distributed: @everywhere, @distributed
import Distributions: Product, Uniform, Normal
import Distributions: Distribution, Univariate, Continuous,
                      UnivariateDistribution, DiscreteUnivariateDistribution,
                      MultivariateDistribution, product_distribution
import Distributions: insupport, isbounded, ncategories, pdf
import FunctionWrappers: FunctionWrapper
import Random: rand, rand!
import ScikitLearn
import ScikitLearn: fit!, predict, get_params
import ScikitLearn.GridSearch: GridSearchCV
import StaticArrays: SVector, @SVector
# Python object imports (bound at runtime, see __init__ below)
import PyCall: PyNULL
const RandomForestClassifier = PyNULL()
const RandomForestRegressor = PyNULL()
function __init__()
    # Python objects cannot be stored at precompilation time, so the sklearn
    # classes are bound to the PyNULL placeholders when the module is loaded.
    (ScikitLearn.Skcore).import_sklearn()
    copy!(RandomForestClassifier, (ScikitLearn.Skcore.pyimport("sklearn.ensemble")).RandomForestClassifier)
    copy!(RandomForestRegressor, (ScikitLearn.Skcore.pyimport("sklearn.ensemble")).RandomForestRegressor)
end
## Exports
export Distribution, Product, Uniform, Normal
@@ -52,6 +65,7 @@ export load_model, load_automaton, load_plots
# Algorithms
export automaton_abc, abc_smc
export abc_model_choice_dataset, rf_abc_model_choice, posterior_proba_model
# About biochemical networks
export @network_model
@@ -63,6 +77,7 @@ include("model.jl")
include("utils.jl")
include("network_model.jl")
include("../algorithms/abc_smc.jl")
include("../algorithms/abc_model_choice.jl")
end
using SpecialFunctions
using LinearAlgebra
using Random
using Distributions
using MarkovProcesses
using ScikitLearn
@sk_import metrics: (accuracy_score, classification_report)
global n = 20
struct Model1 <: Model end
struct Model2 <: Model end
struct Model3 <: Model end
import MarkovProcesses: simulate
function simulate(m::Model1)
    param = rand(Exponential(1))
    return rand(Exponential(param), n)
end
function simulate(m::Model2)
    param = rand(Normal())
    return rand(LogNormal(param, 1), n)
end
function simulate(m::Model3)
    param = rand(Exponential(1))
    return rand(Gamma(2, 1/param), n)
end
m1, m2, m3 = Model1(), Model2(), Model3()
# Closed-form marginal likelihoods of the three models, written as functions of
# the summary statistics s = (sum(y), sum(log.(y)), sum(log.(y).^2)).
lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
lh_m2(s) = exp(-s[2]^2/(2n*(n+1)) - s[3]/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
dist_l2(s_sim, s_obs) = sqrt(dot(s_sim - s_obs, s_sim - s_obs))
observations = simulate(m3)
ss_observations = ss_func(observations)
models = [m1, m2, m3]
# Test set: with k = N = 1000 every simulation is kept (no rejection step).
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
list_lh = [lh_m1, lh_m2, lh_m3]
prob_model(ss, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])
prob_model(ss, idx_model) = prob_model(ss, list_lh, idx_model)
#=
using Plots
p = plot(title="Trainset")
colors = ["black", "red", "green"]
begin_idx = 1
for i = 1:3
    models_i = findall(x->x==i, abc_trainset.models_indexes)
    nbr_obs = length(models_i)
    end_idx = begin_idx + nbr_obs - 1
    lh = list_lh[i]
    @show i
    @show nbr_obs
    @show begin_idx:end_idx
    scatter!(p, begin_idx:end_idx, prob_model.(abc_trainset.summary_stats_vector[models_i], 3),
             color = colors[i], markersize = 3.0, markershape = :cross, label = "Model $i")
    global begin_idx = end_idx + 1
end
savefig("set.svg")
=#
grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2])
res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X))