Commit 888e74c1 authored by Bentriou Mahmoud's avatar Bentriou Mahmoud
Browse files

change abc dataset to matrix

parent 7d3b7089
struct AbcModelChoiceDataset struct AbcModelChoiceDataset
models_indexes::Vector{Int} models_indexes::Vector{Int}
summary_stats_vector::Vector summary_stats_matrix::Matrix
epsilon::Float64 epsilon::Float64
end end
...@@ -14,7 +14,7 @@ end ...@@ -14,7 +14,7 @@ end
function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol) function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
if sym == :X if sym == :X
return dataset.summary_stats_vector return dataset.summary_stats_matrix
elseif sym == :y elseif sym == :y
return dataset.models_indexes return dataset.models_indexes
else else
...@@ -25,21 +25,21 @@ end ...@@ -25,21 +25,21 @@ end
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
summary_stats_observations, summary_stats_observations,
summary_stats_func::Function, distance_func::Function, summary_stats_func::Function, distance_func::Function,
k::Int, N::Int) k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
nbr_models = length(models) nbr_models = length(models)
models_prior = Categorical([1/nbr_models for i = 1:nbr_models]) models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N) return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N; dir_results = dir_results)
end end
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution, function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
summary_stats_observations, summary_stats_observations,
summary_stats_func::Function, distance_func::Function, summary_stats_func::Function, distance_func::Function,
k::Int, N::Int) k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
@assert length(models) >= 2 "Should contain at least 2 models" @assert length(models) >= 2 "Should contain at least 2 models"
@assert ncategories(models_prior) == length(models) "Number of categories of models' prior and number of models do not equal" @assert ncategories(models_prior) == length(models) "Number of categories of models' prior and number of models do not equal"
models_indexes = zeros(Int, N) models_indexes = zeros(Int, N)
summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N) summary_stats_matrix = zeros(eltype(summary_stats_observations), length(summary_stats_observations), N)
distances = zeros(N) distances = zeros(N)
bool_parametric = typeof(models) <: Vector{ParametricModel} bool_parametric = typeof(models) <: Vector{ParametricModel}
for i = 1:N for i = 1:N
...@@ -52,12 +52,16 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}} ...@@ -52,12 +52,16 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}
else else
sim = simulate(models[current_idx_model]) sim = simulate(models[current_idx_model])
end end
summary_stats_vector[i] = summary_stats_func(sim) ss_i = summary_stats_func(sim)
distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations) summary_stats_matrix[:,i] = ss_i
distances[i] = distance_func(ss_i, summary_stats_observations)
end end
k_nn = sortperm(distances, alg = QuickSort)[1:k] k_nn = sortperm(distances, alg = QuickSort)[1:k]
return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_vector[k_nn], distances[k_nn[end]]) if dir_results != nothing
dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
end
return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_matrix[:,k_nn], distances[k_nn[end]])
end end
function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}}, function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
...@@ -69,7 +73,7 @@ function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}}, ...@@ -69,7 +73,7 @@ function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
@assert k <= N_ref @assert k <= N_ref
trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref) trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range) gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
fit!(gridsearch, trainset.X, trainset.y) fit!(gridsearch, transpose(trainset.X), trainset.y)
best_rf = gridsearch.best_estimator_ best_rf = gridsearch.best_estimator_
return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations])) return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations]))
end end
...@@ -84,7 +88,7 @@ function posterior_proba_model(rf_abc::RandomForestABC) ...@@ -84,7 +88,7 @@ function posterior_proba_model(rf_abc::RandomForestABC)
dict_params[Symbol(param)] = get_params(rf_abc.clf)[param] dict_params[Symbol(param)] = get_params(rf_abc.clf)[param]
end end
rf_regressor = RandomForestRegressor(;dict_params...) rf_regressor = RandomForestRegressor(;dict_params...)
fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression) fit!(rf_regressor, transpose(rf_abc.reference_table.X), y_oob_regression)
return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1] return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1]
end end
...@@ -66,7 +66,8 @@ savefig("set.svg") ...@@ -66,7 +66,8 @@ savefig("set.svg")
grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8]) grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
@timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid) @timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc) @show posterior_proba_model(res_rf_abc)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X))) X_testset = transpose(abc_testset.X)
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X)) println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
# From Pudlo: Reliable ABC model choice, 2016, Appendix B
using ARFIMA
using Random
using LinearAlgebra
using MarkovProcesses
using Distributions
using ScikitLearn
@sk_import metrics: (accuracy_score, classification_report)
using StatsBase: autocor
# Marker type: `simulate` dispatches on it to draw a first-order moving-average series.
struct MA1 <: Model
end
# Marker type: `simulate` dispatches on it to draw a second-order moving-average series.
struct MA2 <: Model
end
import MarkovProcesses: simulate
# Series length; presumably meant to match the length of the trajectories produced by the
# `simulate` methods of this script — TODO confirm.
global N_tml = 100
# Noise scale parameter read by the `simulate` methods when they build the Normal noise
# distribution.
global σ = 1.0
# Field-less bivariate distribution type; its `rand` method draws the (θ1, θ2) pair used by
# the MA2 simulator.
struct TriangleDist <: ContinuousMultivariateDistribution
end
"""
    rand(d::TriangleDist)

Draw a two-element vector `[θ1, θ2]`.

`θ1` is drawn from `Uniform(-2, 2)`; `θ2` is then drawn from `Uniform(abs(θ1) - 1, 1)`,
i.e. the lower bound of `θ2` depends on the value obtained for `θ1`.
"""
function Distributions.rand(d::TriangleDist)
    θ1 = rand(Uniform(-2, 2))
    # -θ1 - 1 for negative θ1 and θ1 - 1 otherwise are both equal to abs(θ1) - 1.
    θ2_lower = abs(θ1) - 1
    θ2 = rand(Uniform(θ2_lower, 1))
    return [θ1, θ2]
end
"""
    rand!(d::TriangleDist, p::AbstractVector)

Overwrite the contents of `p` with a fresh draw from `d` and return the drawn vector.
"""
function Distributions.rand!(d::TriangleDist, p::AbstractVector)
    draw = rand(d)
    p[:] = draw
    return draw
end
"""
    length(d::TriangleDist)

Return the dimension of a draw from `d`, which is always 2.
"""
function Distributions.length(d::TriangleDist)
    return 2
end
"""
    pdf(d::TriangleDist, p::AbstractVector)

Return the constant `1/8`, whatever the value of `p`.
"""
function Distributions.pdf(d::TriangleDist, p::AbstractVector)
    # NOTE(review): `p` is never inspected, so the same value is returned for points
    # that `rand(d)` cannot produce — confirm that the constant and the missing support
    # check are intended.
    return 1/8
end
"""
    simulate(m::MA1)

Draw one trajectory of length `N_tml` from a first-order moving-average process.

The coefficient `θ1` is drawn from `Uniform(-1, 1)` on every call. The noise terms are
independent `Normal(0, σ)` draws, `σ` being the script-level noise standard deviation.
The first sample is the initial noise term alone (it has no lagged contribution).

Returns a `Vector{Float64}` of length `N_tml`.
"""
function simulate(m::MA1)
    θ1 = rand(Uniform(-1, 1))
    # `Normal` is parameterised by the standard deviation, so `σ` is passed as is
    # (squaring it would hand a variance to a standard-deviation slot).
    noise = Normal(0, σ)
    x = zeros(N_tml)
    ϵ_prev = rand(noise)
    x[1] = ϵ_prev
    for t in 2:N_tml
        ϵ_t = rand(noise)
        x[t] = ϵ_t - θ1 * ϵ_prev
        ϵ_prev = ϵ_t
    end
    return x
end
"""
    simulate(m::MA2)

Draw one trajectory of length `N_tml` from a second-order moving-average process.

The coefficients `(θ1, θ2)` are drawn from `TriangleDist()` on every call. The noise terms
are independent `Normal(0, σ)` draws, `σ` being the script-level noise standard deviation.
The first two samples are built from the two initial noise terms only.

Returns a `Vector{Float64}` of length `N_tml`.
"""
function simulate(m::MA2)
    θ1, θ2 = rand(TriangleDist())
    # `Normal` is parameterised by the standard deviation, so `σ` is passed as is
    # (squaring it would hand a variance to a standard-deviation slot).
    noise = Normal(0, σ)
    x = zeros(N_tml)
    ϵ_lag1 = rand(noise)
    ϵ_lag2 = rand(noise)
    x[1] = ϵ_lag2
    x[2] = ϵ_lag1 - θ1 * ϵ_lag2
    for t in 3:N_tml
        ϵ_t = rand(noise)
        x[t] = ϵ_t - θ1 * ϵ_lag1 - θ2 * ϵ_lag2
        # Shift the two-step noise history forward.
        ϵ_lag2 = ϵ_lag1
        ϵ_lag1 = ϵ_t
    end
    return x
end
#=
function simulate(m::MA1)
θ1 = rand(Uniform(-1, 1))
return arma(N_tml, σ, nothing, SVector(θ1))
end
function simulate(m::MA2)
θ = rand(TriangleDist())
return arma(N_tml, σ, nothing, SVector(θ[1],θ[2]))
end
=#
# Driver script: build the two candidate models, simulate pseudo-observations from MA1,
# then train and evaluate the random-forest ABC model choice.
m1, m2 = MA1(), MA2()
models = [m1, m2]
# Summary statistics: autocorrelations of the series at lags 1 to 7.
ss_func(y) = autocor(y, 1:7)
# Euclidean distance between simulated and observed summary statistics.
dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
# The "observed" data are themselves a draw from the MA1 simulator.
observations = simulate(m1)
ss_observations = ss_func(observations)
# The two trailing integers are the same value (10000); they are passed as the last two
# positional arguments of the dataset builder.
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 10000, 10000)
# Hyperparameter grid with a single candidate value per parameter.
grid = Dict(:n_estimators => [300], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 10000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc)
# The classifier is given the transposed feature matrix of the test set.
X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
%% Cell type:markdown id: tags:
# Setup models, dataset
%% Cell type:code id: tags:
``` julia
using SpecialFunctions
using LinearAlgebra
using Random
using Distributions
using MarkovProcesses
# Number of values drawn per simulated dataset: it is the sample count passed to `rand`
# in the `simulate` methods and it also appears in the `lh_m*` formulas.
global n = 20
# Marker type for the simulator that draws from an Exponential distribution.
struct Model1 <: Model
end
# Marker type for the simulator that draws from a LogNormal distribution.
struct Model2 <: Model
end
# Marker type for the simulator that draws from a Gamma distribution.
struct Model3 <: Model
end
import MarkovProcesses: simulate
"""
    simulate(m::Model1)

Draw `n` exponential values whose rate is itself drawn from `Exponential(1)`.

The drawn parameter is used as a *rate*, which is the convention `lh_m1` integrates over
and the one `simulate(::Model3)` already follows with `Gamma(2, 1/param)`.
"""
function simulate(m::Model1)
    rate = rand(Exponential(1))
    # Distributions.jl's `Exponential` takes a scale (the mean), hence the inversion.
    return rand(Exponential(1 / rate), n)
end
"""
    simulate(m::Model2)

Draw `n` values from `LogNormal(μ, 1)`, where `μ` is first drawn from a standard `Normal()`.
"""
function simulate(m::Model2)
    μ = rand(Normal())
    sampling_dist = LogNormal(μ, 1)
    return rand(sampling_dist, n)
end
"""
    simulate(m::Model3)

Draw `n` values from `Gamma(2, 1/θ)`, where `θ` is first drawn from `Exponential(1)`.
"""
function simulate(m::Model3)
    θ = rand(Exponential(1))
    sampling_dist = Gamma(2, 1/θ)
    return rand(sampling_dist, n)
end
# One instance of each candidate model; they are passed to `simulate` and collected into
# the `models` vector further down.
m1, m2, m3 = Model1(), Model2(), Model3()
"""
    lh_m1(s::Vector)

Evaluate `gamma(n+1) / (1+s[1])^(n+1)`, computed on the log scale and exponentiated.
Only the first summary statistic `s[1]` is used. Presumably the marginal likelihood of
Model1 — verify against the reference derivation.
"""
function lh_m1(s::Vector)
    log_constant = log(gamma(n+1))
    log_decay = (n+1)*log(1+s[1])
    return exp(log_constant - log_decay)
end
"""
    lh_m2(s::Vector)

Closed-form weight of Model2 as a function of the summary statistics, where
`s[2] = sum(log.(y))` and `s[3] = sum(log.(y).^2)` (see `ss_func`).

Integrating the `LogNormal(θ, 1)` likelihood of `n` values against a standard normal prior
on `θ` gives an exponent that is linear in the sum of squared logs, i.e. `-s[3]/2`.
Because `s[3]` already holds that sum of squares, it must not be squared again.
"""
function lh_m2(s::Vector)
    log_kernel = -s[2]^2/(2n*(n+1)) - s[3]/2 + (s[2]^2)/(2n) - s[2]
    return exp(log_kernel) * (2pi)^(-n/2)*(n+1)^(-1/2)
end
"""
    lh_m3(s::Vector)

Evaluate `exp(s[2]) * gamma(2n+1) / gamma(2)^n * (1+s[1])^(-2n-1)` from the first two
summary statistics. Presumably the marginal likelihood of Model3 — verify against the
reference derivation.
"""
function lh_m3(s::Vector)
    numerator = exp(s[2])*gamma(2n+1)
    denominator = gamma(2)^n
    decay = (1+s[1])^(-2n-1)
    return numerator/denominator * decay
end
"""
    ss_func(y)

Return the three summary statistics of `y`: the sum of the values, the sum of their
logarithms, and the sum of their squared logarithms.
"""
function ss_func(y)
    log_y = log.(y)
    return [sum(y), sum(log_y), sum(log_y.^2)]
end
"""
    dist_l2(s_sim, s_obs)

Return the Euclidean (L2) distance between the simulated and the observed summary
statistics. It is zero when both vectors are equal and never negative.
"""
dist_l2(s_sim, s_obs) = norm(s_sim - s_obs)
# The "observed" data are themselves a draw from the Model3 simulator.
observations = simulate(m3)
ss_observations = ss_func(observations)
models = [m1, m2, m3]
# For both datasets the two trailing integers are identical (29000 and 1000 respectively);
# they are passed as the last two positional arguments of the dataset builder.
abc_trainset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 29000, 29000)
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
# Functions listed in the same order as `models`, so the position in this list is the
# model index.
list_lh = [lh_m1, lh_m2, lh_m3]
"""
    prob_model(ss::Vector, list_lh, idx_model)
    prob_model(ss::Vector, idx_model)

Evaluate every function of `list_lh` at `ss` and return the share of the total that belongs
to entry `idx_model`. The two-argument method uses the script-level `list_lh`.
"""
function prob_model(ss::Vector, list_lh, idx_model)
    weights = map(lh -> lh(ss), list_lh)
    return weights[idx_model] / sum(weights)
end
function prob_model(ss::Vector, idx_model)
    return prob_model(ss, list_lh, idx_model)
end
# Share attributed to the third entry of the script-level `list_lh`.
function prob_model3(ss::Vector)
    return prob_model(ss, list_lh, 3)
end
```
%% Cell type:markdown id: tags:
# Plot
%% Cell type:code id: tags:
``` julia
using Plots
# Scatter plot of `prob_model3` evaluated on every column of the test-set summary
# statistics, grouped (and coloured) by the model index stored with each column.
# The title names the test set because that is the only dataset read below.
p = plot(title="Testset")
colors = ["black", "red", "green"]
# Running x-position, so that each model occupies its own contiguous range of abscissas.
begin_idx = 1
for i in eachindex(colors)
    models_i = findall(==(i), abc_testset.models_indexes)
    end_idx = begin_idx + length(models_i) - 1
    scatter!(p, begin_idx:end_idx,
             vec(mapslices(prob_model3, abc_testset.summary_stats_matrix[:,models_i], dims = 1)),
             color = colors[i], markersize = 3.0, markershape = :cross, label = "Model $i")
    global begin_idx = end_idx + 1
end
p
```
%% Cell type:markdown id: tags:
# Classification models
%% Cell type:code id: tags:
``` julia
using ScikitLearn
@sk_import linear_model: LogisticRegression
@sk_import ensemble: RandomForestClassifier
@sk_import metrics: (classification_report, confusion_matrix)
@sk_import neighbors: KNeighborsClassifier
# Fit three classifiers on the training set and print a classification report for each
# on the test set. The classifiers are given the transposed feature matrices.
X_trainset = transpose(abc_trainset.X)
X_testset = transpose(abc_testset.X)

# Logistic regression
logit_reg = fit!(LogisticRegression(), X_trainset, abc_trainset.y)
y_pred_logit = predict(logit_reg, X_testset)
println(classification_report(y_pred = y_pred_logit, y_true = abc_testset.y))

# Random forest
rf_clf = fit!(RandomForestClassifier(n_estimators=500), X_trainset, abc_trainset.y)
y_pred_rf = predict(rf_clf, X_testset)
println(classification_report(y_pred = y_pred_rf, y_true = abc_testset.y))

# k-nearest neighbours: predict with the k-NN classifier itself and report its own
# predictions (not those of the random forest).
knn_clf = fit!(KNeighborsClassifier(n_neighbors=20), X_trainset, abc_trainset.y)
y_pred_knn = predict(knn_clf, X_testset)
println(classification_report(y_pred = y_pred_knn, y_true = abc_testset.y))
```
%% Cell type:markdown id: tags:
# RF ABC
%% Cell type:code id: tags:
``` julia
# Run the random-forest ABC model choice with a one-point hyperparameter grid, then print
# the classification report and the confusion matrix of its classifier on the test set.
rf_abc_grid = Dict(:n_estimators => [500])
res_rf = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = rf_abc_grid)
y_pred_rf_abc = predict(res_rf.clf, X_testset)
println(classification_report(y_pred = y_pred_rf_abc, y_true = abc_testset.y))
println(confusion_matrix(y_pred = y_pred_rf_abc, y_true = abc_testset.y))
```
%% Cell type:code id: tags:
``` julia
# Copy the fitted classifier's parameters into a Symbol-keyed dictionary and splat it as
# keyword arguments into a new RandomForestClassifier.
clf_params = get_params(res_rf.clf)
dict_params = Dict()
for (param_name, param_value) in clf_params
    dict_params[Symbol(param_name)] = param_value
end
RandomForestClassifier(;dict_params...)
```
%% Cell type:code id: tags:
``` julia
# Turn each row of the out-of-bag decision function into the index of its largest entry,
# then compare the resulting labels with the reference-table labels and show the
# classifier's own out-of-bag score next to it.
oob_votes = res_rf.clf.oob_decision_function_
y_pred_oob = [argmax(votes_row) for votes_row in eachrow(oob_votes)]
@show mean(y_pred_oob .== res_rf.reference_table.y)
@show res_rf.clf.oob_score_
```
%% Cell type:code id: tags:
``` julia
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment