Commit 888e74c1 authored by Bentriou Mahmoud's avatar Bentriou Mahmoud

change abc dataset to matrix

parent 7d3b7089
 struct AbcModelChoiceDataset
     models_indexes::Vector{Int}
-    summary_stats_vector::Vector
+    summary_stats_matrix::Matrix
     epsilon::Float64
 end
@@ -14,7 +14,7 @@ end
 function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
     if sym == :X
-        return dataset.summary_stats_vector
+        return dataset.summary_stats_matrix
     elseif sym == :y
         return dataset.models_indexes
     else
@@ -25,21 +25,21 @@ end
 function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
                                   summary_stats_observations,
                                   summary_stats_func::Function, distance_func::Function,
-                                  k::Int, N::Int)
+                                  k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
     nbr_models = length(models)
     models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
-    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N)
+    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N; dir_results = dir_results)
 end
 function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
                                   summary_stats_observations,
                                   summary_stats_func::Function, distance_func::Function,
-                                  k::Int, N::Int)
+                                  k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
     @assert length(models) >= 2 "Should contain at least 2 models"
     @assert ncategories(models_prior) == length(models) "Number of categories of the models' prior must equal the number of models"
     models_indexes = zeros(Int, N)
-    summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N)
+    summary_stats_matrix = zeros(eltype(summary_stats_observations), length(summary_stats_observations), N)
     distances = zeros(N)
     bool_parametric = typeof(models) <: Vector{ParametricModel}
     for i = 1:N
@@ -52,12 +52,16 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}
         else
             sim = simulate(models[current_idx_model])
         end
-        summary_stats_vector[i] = summary_stats_func(sim)
-        distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations)
+        ss_i = summary_stats_func(sim)
+        summary_stats_matrix[:,i] = ss_i
+        distances[i] = distance_func(ss_i, summary_stats_observations)
     end
     k_nn = sortperm(distances, alg = QuickSort)[1:k]
-    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_vector[k_nn], distances[k_nn[end]])
+    if dir_results != nothing
+        dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
+    end
+    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_matrix[:,k_nn], distances[k_nn[end]])
 end
 function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
@@ -69,7 +73,7 @@ function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
     @assert k <= N_ref
     trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
     gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
-    fit!(gridsearch, trainset.X, trainset.y)
+    fit!(gridsearch, transpose(trainset.X), trainset.y)
     best_rf = gridsearch.best_estimator_
     return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations]))
 end
@@ -84,7 +88,7 @@ function posterior_proba_model(rf_abc::RandomForestABC)
         dict_params[Symbol(param)] = get_params(rf_abc.clf)[param]
     end
     rf_regressor = RandomForestRegressor(;dict_params...)
-    fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression)
+    fit!(rf_regressor, transpose(rf_abc.reference_table.X), y_oob_regression)
     return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1]
 end
@@ -66,7 +66,8 @@ savefig("set.svg")
 grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
 @timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
 @show posterior_proba_model(res_rf_abc)
-println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X)))
-@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X))
+X_testset = transpose(abc_testset.X)
+println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
+@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
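
A note on the layout this commit introduces: `summary_stats_matrix` stores one accepted simulation per column (statistics × samples), while scikit-learn estimators expect samples × features, which is why the hunks above wrap `trainset.X` in `transpose`. A minimal sketch of the convention (shapes are hypothetical, not from the commit):

```julia
# Hypothetical shapes for illustration only.
ss_matrix = rand(7, 1000)      # 7 summary statistics x 1000 accepted simulations
models_idx = rand(1:2, 1000)   # model index (the y labels), one per column
# Each column is one sample, so a scikit-learn fit takes the transpose:
# fit!(clf, transpose(ss_matrix), models_idx)
```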
# From Pudlo et al., "Reliable ABC model choice via random forests", 2016, Appendix B
using ARFIMA
using Random
using LinearAlgebra
using MarkovProcesses
using Distributions
using ScikitLearn
@sk_import metrics: (accuracy_score, classification_report)
using StatsBase: autocor
struct MA1 <: Model end
struct MA2 <: Model end
import MarkovProcesses: simulate
global N_tml = 100
global σ = 1.0
struct TriangleDist <: ContinuousMultivariateDistribution end
function Distributions.rand(d::TriangleDist)
    θ1 = rand(Uniform(-2, 2))
    θ2 = (θ1 < 0) ? rand(Uniform(-θ1-1, 1)) : rand(Uniform(θ1-1, 1))
    return [θ1, θ2]
end
Distributions.rand!(d::TriangleDist, p::AbstractVector) = p[:] = rand(d)
Distributions.length(d::TriangleDist) = 2
# Uniform density over the triangle with vertices (-2,1), (0,-1), (2,1), whose area is 4.
Distributions.pdf(d::TriangleDist, p::AbstractVector) = 1/4
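# A hypothetical sanity check (not part of the original script): every draw from
# TriangleDist should satisfy the MA(2) triangle constraints θ1 ∈ (-2, 2) and
# |θ1| - 1 <= θ2 < 1, which the two Uniform branches above encode.
let d = TriangleDist()
    for _ in 1:10_000
        θ1, θ2 = rand(d)
        @assert -2 < θ1 < 2 && abs(θ1) - 1 <= θ2 < 1
    end
end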
function simulate(m::MA1)
    θ1 = rand(Uniform(-1, 1))
    x = zeros(N_tml)
    # Normal(μ, σ): the second argument of Distributions.Normal is the standard deviation.
    ϵtm1 = rand(Normal(0, σ))
    x[1] = ϵtm1
    for t = 2:N_tml
        ϵt = rand(Normal(0, σ))
        x[t] = ϵt - θ1*ϵtm1
        ϵtm1 = ϵt
    end
    return x
end
function simulate(m::MA2)
    θ1, θ2 = rand(TriangleDist())
    x = zeros(N_tml)
    # The first two noise draws play the role of ε_1 and ε_2 (zero pre-sample noise).
    ϵtm1 = rand(Normal(0, σ))
    ϵtm2 = rand(Normal(0, σ))
    x[1] = ϵtm2
    x[2] = ϵtm1 - θ1*ϵtm2
    for t = 3:N_tml
        ϵt = rand(Normal(0, σ))
        x[t] = ϵt - θ1*ϵtm1 - θ2*ϵtm2
        ϵtm2 = ϵtm1
        ϵtm1 = ϵt
    end
    return x
end
#=
function simulate(m::MA1)
    θ1 = rand(Uniform(-1, 1))
    return arma(N_tml, σ, nothing, SVector(θ1))
end
function simulate(m::MA2)
    θ = rand(TriangleDist())
    return arma(N_tml, σ, nothing, SVector(θ[1], θ[2]))
end
=#
m1, m2 = MA1(), MA2()
models = [m1, m2]
ss_func(y) = autocor(y, 1:7)
dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
observations = simulate(m1)
ss_observations = ss_func(observations)
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 10000, 10000)
grid = Dict(:n_estimators => [300], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 10000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc)
X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
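# Note (an observation, not part of the original script): since k == N above, the
# sortperm(...)[1:k] selection in abc_model_choice_dataset keeps every simulation,
# so abc_testset is a plain prior-predictive sample. A hypothetical equivalent:
function prior_predictive_sample(models, ss_func, N)
    idx = rand(1:length(models), N)                          # uniform model prior
    ss = hcat((ss_func(simulate(models[i])) for i in idx)...)
    return idx, ss                                           # labels, stats x N matrix
end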
%% Cell type:markdown id: tags:
# Setup models, dataset
%% Cell type:code id: tags:
``` julia
using SpecialFunctions
using LinearAlgebra
using Random
using Distributions
using MarkovProcesses
global n = 20
struct Model1 <: Model end
struct Model2 <: Model end
struct Model3 <: Model end
import MarkovProcesses: simulate
function simulate(m::Model1)
    param = rand(Exponential(1))
    return rand(Exponential(param), n)
end
function simulate(m::Model2)
    param = rand(Normal())
    return rand(LogNormal(param, 1), n)
end
function simulate(m::Model3)
    param = rand(Exponential(1))
    return rand(Gamma(2, 1/param), n)
end
m1, m2, m3 = Model1(), Model2(), Model3()
lh_m1(s::Vector) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
lh_m2(s::Vector) = exp(-s[2]^2/(2n*(n+1)) - s[3]/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
lh_m3(s::Vector) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
dist_l2(s_sim, s_obs) = norm(s_sim - s_obs)
observations = simulate(m3)
ss_observations = ss_func(observations)
models = [m1, m2, m3]
abc_trainset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 29000, 29000)
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
list_lh = [lh_m1, lh_m2, lh_m3]
prob_model(ss::Vector, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])
prob_model(ss::Vector, idx_model) = prob_model(ss, list_lh, idx_model)
prob_model3(ss::Vector) = prob_model(ss, list_lh, 3)
```
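%% Cell type:markdown id: tags:
The `prob_model` helpers above compute the exact posterior probability of each model under the uniform model prior (a restatement of the code, added for clarity):

$$P(\mathcal{M} = m_j \mid s) = \frac{lh_{m_j}(s)}{\sum_i lh_{m_i}(s)}$$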
%% Cell type:markdown id: tags:
# Plot
%% Cell type:code id: tags:
``` julia
using Plots
p = plot(title="Testset")
colors = ["black", "red", "green"]
begin_idx = 1
for i = 1:3
    models_i = findall(x -> x == i, abc_testset.models_indexes)
    nbr_obs = length(models_i)
    end_idx = begin_idx + nbr_obs - 1
    lh = list_lh[i]
    scatter!(p, begin_idx:end_idx,
             vec(mapslices(prob_model3, abc_testset.summary_stats_matrix[:,models_i], dims = 1)),
             color = colors[i], markersize = 3.0, markershape = :cross, label = "Model $i")
    global begin_idx = end_idx + 1
end
p
```
%% Cell type:markdown id: tags:
# Classification models
%% Cell type:code id: tags:
``` julia
using ScikitLearn
@sk_import linear_model: LogisticRegression
@sk_import ensemble: RandomForestClassifier
@sk_import metrics: (classification_report, confusion_matrix)
@sk_import neighbors: KNeighborsClassifier
X_trainset = transpose(abc_trainset.X)
X_testset = transpose(abc_testset.X)
logit_reg = fit!(LogisticRegression(), X_trainset, abc_trainset.y)
y_pred_logit = predict(logit_reg, X_testset)
println(classification_report(y_pred = y_pred_logit, y_true = abc_testset.y))
rf_clf = fit!(RandomForestClassifier(n_estimators=500), X_trainset, abc_trainset.y)
y_pred_rf = predict(rf_clf, X_testset)
println(classification_report(y_pred = y_pred_rf, y_true = abc_testset.y))
knn_clf = fit!(KNeighborsClassifier(n_neighbors=20), X_trainset, abc_trainset.y)
y_pred_knn = predict(knn_clf, X_testset)
println(classification_report(y_pred = y_pred_knn, y_true = abc_testset.y))
```
%% Cell type:markdown id: tags:
# RF ABC
%% Cell type:code id: tags:
``` julia
res_rf = rf_abc_model_choice(models, ss_observations, ss_func, 29000;
hyperparameters_range = Dict(:n_estimators => [500]))
println(classification_report(y_pred = predict(res_rf.clf, X_testset), y_true = abc_testset.y))
println(confusion_matrix(y_pred = predict(res_rf.clf, X_testset), y_true = abc_testset.y))
```
%% Cell type:code id: tags:
``` julia
dict_params = Dict()
for param in keys(get_params(res_rf.clf))
    dict_params[Symbol(param)] = get_params(res_rf.clf)[param]
end
RandomForestClassifier(;dict_params...)
```
%% Cell type:code id: tags:
``` julia
oob_votes = res_rf.clf.oob_decision_function_
y_pred_oob = argmax.([oob_votes[i,:] for i = 1:size(oob_votes)[1]])
@show mean(y_pred_oob .== res_rf.reference_table.y)
@show res_rf.clf.oob_score_
```
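%% Cell type:markdown id: tags:
The cell above checks that the row-wise argmax of the out-of-bag vote matrix reproduces scikit-learn's `oob_score_`, i.e.

$$\mathtt{oob\_score\_} = \frac{1}{N}\sum_{i=1}^{N}\mathbf{1}\!\left[\operatorname{argmax}_j\, \mathtt{oob\_votes}_{ij} = y_i\right]$$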