Commit 97df1230 authored by Bentriou Mahmoud

Implementation of ABC model choice with random forest

parent 26228335
struct AbcModelChoiceDataset
    models_indexes::Vector{Int}
    summary_stats_vector::Vector
    epsilon::Float64
end
struct RandomForestABC
    reference_table::AbcModelChoiceDataset
    clf
    summary_stats_observations
    estim_model
end
# Aliases so the dataset exposes the usual machine-learning notation:
# dataset.X are the summary statistics, dataset.y the model indexes.
function Base.getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
    if sym == :X
        return dataset.summary_stats_vector
    elseif sym == :y
        return dataset.models_indexes
    else
        return getfield(dataset, sym)
    end
end
# Builds an ABC reference table for model choice; the models' prior defaults to
# a uniform Categorical distribution over the models.
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
                                  summary_stats_observations,
                                  summary_stats_func::Function, distance_func::Function,
                                  k::Int, N::Int)
    nbr_models = length(models)
    models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N)
end
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
                                  summary_stats_observations,
                                  summary_stats_func::Function, distance_func::Function,
                                  k::Int, N::Int)
    @assert length(models) >= 2 "Should contain at least 2 models"
    @assert ncategories(models_prior) == length(models) "Number of categories of the models' prior does not equal the number of models"
    models_indexes = zeros(Int, N)
    summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N)
    distances = zeros(N)
    bool_parametric = typeof(models) <: Vector{ParametricModel}
    for i = 1:N
        # Draw a model index from the models' prior, then simulate from that model
        current_idx_model = rand(models_prior)
        models_indexes[i] = current_idx_model
        if bool_parametric
            param_model = models[current_idx_model]
            vec_p = rand(param_model.distribution)
            sim = simulate(param_model, vec_p)
        else
            sim = simulate(models[current_idx_model])
        end
        summary_stats_vector[i] = summary_stats_func(sim)
        distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations)
    end
    # Keep the k simulations closest to the observed summary statistics;
    # epsilon is the distance of the k-th nearest one.
    k_nn = sortperm(distances, alg = QuickSort)[1:k]
    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_vector[k_nn], distances[k_nn[end]])
end
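#=
Illustrative usage sketch (hypothetical models `m1`, `m2` and user-supplied
`ss_observations`, `ss_func`, `dist_l2`): draw 10_000 simulations under a
uniform prior over the models and keep the 100 whose summary statistics are
closest to the observed ones.

    ref_table = abc_model_choice_dataset([m1, m2], ss_observations,
                                         ss_func, dist_l2, 100, 10_000)
    ref_table.y  # model index of each retained simulation
    ref_table.X  # corresponding summary statistics
=#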
# Trains a random forest classifier (tuned by grid search) on an ABC reference
# table in order to choose among the candidate models.
function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
                             summary_stats_observations,
                             summary_stats_func::Function, N_ref::Int;
                             k::Int = N_ref, distance_func::Function = (x,y) -> 1,
                             hyperparameters_range::Dict = Dict(:n_estimators => [200], :min_samples_leaf => [1, 2],
                                                                :min_samples_split => [2, 5]))
    @assert k <= N_ref
    trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
    gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
    fit!(gridsearch, trainset.X, trainset.y)
    best_rf = gridsearch.best_estimator_
    return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations]))
end
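#=
Illustrative usage sketch (same hypothetical `m1`, `m2`, `ss_observations`,
`ss_func` as above): build a reference table of 10_000 simulations, tune the
random forest by grid search and read the model predicted for the observation.

    res = rf_abc_model_choice([m1, m2], ss_observations, ss_func, 10_000)
    res.estim_model  # prediction of the best classifier for ss_observations
=#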
# Estimate of P(m = m^(ss_obs) | ss_obs), the posterior probability that the
# model selected by the random forest is the true one: a regression forest is
# fitted on the out-of-bag classification errors and its prediction at the
# observed summary statistics gives the error probability.
function posterior_proba_model(rf_abc::RandomForestABC)
    oob_votes = rf_abc.clf.oob_decision_function_
    y_pred_oob = argmax.([oob_votes[i,:] for i = 1:size(oob_votes)[1]])
    # 1 if the out-of-bag prediction differs from the true model index, 0 otherwise
    y_oob_regression = y_pred_oob .!= rf_abc.reference_table.y
    # Reuse the classifier's hyperparameters for the regression forest
    dict_params = Dict()
    for param in ["n_estimators", "min_samples_leaf", "min_samples_split", "oob_score"]
        dict_params[Symbol(param)] = get_params(rf_abc.clf)[param]
    end
    rf_regressor = RandomForestRegressor(;dict_params...)
    fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression)
    return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1]
end
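#=
Illustrative sketch: for a `RandomForestABC` result `res` as above,

    posterior_proba_model(res)

estimates P(m = m^(ss_obs) | ss_obs) as 1 minus the regressed out-of-bag error
evaluated at the observed summary statistics.
=#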
@@ -8,12 +8,25 @@ import Base: setindex!, setproperty!, fill!, copyto!
import Dates
import Distributed: @everywhere, @distributed
import Distributions: Product, Uniform, Normal
-import Distributions: Distribution, Univariate, Continuous, UnivariateDistribution,
+import Distributions: Distribution, Univariate, Continuous,
+                      UnivariateDistribution, DiscreteUnivariateDistribution,
                       MultivariateDistribution, product_distribution
-import Distributions: insupport, isbounded, pdf
+import Distributions: insupport, isbounded, ncategories, pdf
import FunctionWrappers: FunctionWrapper
import Random: rand, rand!
import ScikitLearn
import ScikitLearn: fit!, predict, get_params
import ScikitLearn.GridSearch: GridSearchCV
import StaticArrays: SVector, @SVector
# Python objects import: Python objects cannot be stored in the precompiled
# module image, so the scikit-learn classes are created as PyNULL() placeholders
# and filled at runtime in __init__().
import PyCall: PyNULL
const RandomForestClassifier = PyNULL()
const RandomForestRegressor = PyNULL()
function __init__()
    (ScikitLearn.Skcore).import_sklearn()
    copy!(RandomForestClassifier, (ScikitLearn.Skcore.pyimport("sklearn.ensemble")).RandomForestClassifier)
    copy!(RandomForestRegressor, (ScikitLearn.Skcore.pyimport("sklearn.ensemble")).RandomForestRegressor)
end
## Exports
export Distribution, Product, Uniform, Normal
@@ -52,6 +65,7 @@ export load_model, load_automaton, load_plots
# Algorithms
export automaton_abc, abc_smc
export abc_model_choice_dataset, rf_abc_model_choice, posterior_proba_model
# About biochemical networks
export @network_model
@@ -63,6 +77,7 @@ include("model.jl")
include("utils.jl")
include("network_model.jl")
include("../algorithms/abc_smc.jl")
include("../algorithms/abc_model_choice.jl")
end
using SpecialFunctions
using LinearAlgebra
using Random
using Distributions
using MarkovProcesses
using ScikitLearn
@sk_import metrics: (accuracy_score, classification_report)
global n = 20
struct Model1 <: Model end
struct Model2 <: Model end
struct Model3 <: Model end
import MarkovProcesses: simulate
function simulate(m::Model1)
    param = rand(Exponential(1))
    return rand(Exponential(param), n)
end
function simulate(m::Model2)
    param = rand(Normal())
    return rand(LogNormal(param,1), n)
end
function simulate(m::Model3)
    param = rand(Exponential(1))
    return rand(Gamma(2,1/param), n)
end
m1, m2, m3 = Model1(), Model2(), Model3()
# Marginal likelihoods of the three models as functions of the summary statistics
# s = (sum(y), sum(log.(y)), sum(log.(y).^2)), with the parameter integrated out
# under its prior.
lh_m1(s) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
lh_m2(s) = exp(-s[2]^2/(2n*(n+1)) - s[3]/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
lh_m3(s) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
dist_l2(s_sim, s_obs) = norm(s_sim - s_obs)
observations = simulate(m3)
ss_observations = ss_func(observations)
models = [m1, m2, m3]
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
list_lh = [lh_m1, lh_m2, lh_m3]
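# Exact posterior probability of model `idx_model` given the summary statistics
# (uniform prior over models), computed from the marginal likelihoods above and
# used as a reference in the commented plotting block below.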
prob_model(ss, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])
prob_model(ss, idx_model) = prob_model(ss, list_lh, idx_model)
#=
using Plots
p = plot(title="Trainset")
colors = ["black", "red", "green"]
begin_idx = 1
for i = 1:3
    models_i = findall(x->x==i, abc_trainset.models_indexes)
    nbr_obs = length(models_i)
    end_idx = begin_idx + nbr_obs - 1
    lh = list_lh[i]
    @show i
    @show nbr_obs
    @show begin_idx:end_idx
    scatter!(p, begin_idx:end_idx, prob_model.(abc_trainset.summary_stats_vector[models_i], 3),
             color = colors[i], markersize = 3.0, markershape = :cross, label = "Model $i")
    global begin_idx = end_idx + 1
end
savefig("set.svg")
=#
grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2])
res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X))