Commit 888e74c1 authored by Bentriou Mahmoud's avatar Bentriou Mahmoud

change abc dataset to matrix

parent 7d3b7089
struct AbcModelChoiceDataset
models_indexes::Vector{Int}
summary_stats_vector::Vector
summary_stats_matrix::Matrix
epsilon::Float64
end
......@@ -14,7 +14,7 @@ end
function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
if sym == :X
return dataset.summary_stats_vector
return dataset.summary_stats_matrix
elseif sym == :y
return dataset.models_indexes
else
......@@ -25,21 +25,21 @@ end
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N::Int)
k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
nbr_models = length(models)
models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N)
return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N; dir_results = dir_results)
end
function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
summary_stats_observations,
summary_stats_func::Function, distance_func::Function,
k::Int, N::Int)
k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
@assert length(models) >= 2 "Should contain at least 2 models"
@assert ncategories(models_prior) == length(models) "Number of categories of models' prior and number of models do not equal"
models_indexes = zeros(Int, N)
summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N)
summary_stats_matrix = zeros(eltype(summary_stats_observations), length(summary_stats_observations), N)
distances = zeros(N)
bool_parametric = typeof(models) <: Vector{ParametricModel}
for i = 1:N
......@@ -52,12 +52,16 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}
else
sim = simulate(models[current_idx_model])
end
summary_stats_vector[i] = summary_stats_func(sim)
distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations)
ss_i = summary_stats_func(sim)
summary_stats_matrix[:,i] = ss_i
distances[i] = distance_func(ss_i, summary_stats_observations)
end
k_nn = sortperm(distances, alg = QuickSort)[1:k]
return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_vector[k_nn], distances[k_nn[end]])
if dir_results != nothing
dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
end
return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_matrix[:,k_nn], distances[k_nn[end]])
end
function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
......@@ -69,7 +73,7 @@ function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
@assert k <= N_ref
trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
fit!(gridsearch, trainset.X, trainset.y)
fit!(gridsearch, transpose(trainset.X), trainset.y)
best_rf = gridsearch.best_estimator_
return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations]))
end
......@@ -84,7 +88,7 @@ function posterior_proba_model(rf_abc::RandomForestABC)
dict_params[Symbol(param)] = get_params(rf_abc.clf)[param]
end
rf_regressor = RandomForestRegressor(;dict_params...)
fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression)
fit!(rf_regressor, transpose(rf_abc.reference_table.X), y_oob_regression)
return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1]
end
......@@ -67,6 +67,7 @@ savefig("set.svg")
grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
@timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X))
X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
# From Pudlo: Reliable ABC model choice, 2016, Appendix B
using ARFIMA
using Random
using LinearAlgebra
using MarkovProcesses
using Distributions
using ScikitLearn
@sk_import metrics: (accuracy_score, classification_report)
using StatsBase: autocor
struct MA1 <: Model end
struct MA2 <: Model end
import MarkovProcesses: simulate
global N_tml = 100
global σ = 1.0
struct TriangleDist <: ContinuousMultivariateDistribution end
function Distributions.rand(d::TriangleDist)
θ1 = rand(Uniform(-2, 2))
θ2 = (θ1 < 0) ? rand(Uniform(-θ1-1,1)) : rand(Uniform(θ1-1,1))
return [θ1, θ2]
end
Distributions.rand!(d::TriangleDist, p::AbstractVector) = p[:] = rand(d)
Distributions.length(d::TriangleDist) = 2
Distributions.pdf(d::TriangleDist, p::AbstractVector) = 1/8
function simulate(m::MA1)
θ1 = rand(Uniform(-1, 1))
x = zeros(100)
ϵtm1 = rand(Normal(0,σ^2))
x[1] = ϵtm1
for t = 2:100
ϵt = rand(Normal(0,σ^2))
x[t] = ϵt - θ1*ϵtm1
ϵtm1 = ϵt
end
return x
end
function simulate(m::MA2)
θ1, θ2 = rand(TriangleDist())
x = zeros(100)
ϵtm1 = rand(Normal(0,σ^2))
ϵtm2 = rand(Normal(0,σ^2))
x[1] = ϵtm2
x[2] = ϵtm1 - θ1*ϵtm2
for t = 3:100
ϵt = rand(Normal(0,σ^2))
x[t] = ϵt - θ1*ϵtm1 - θ2*ϵtm2
ϵtm2 = ϵtm1
ϵtm1 = ϵt
end
return x
end
#=
function simulate(m::MA1)
θ1 = rand(Uniform(-1, 1))
return arma(N_tml, σ, nothing, SVector(θ1))
end
function simulate(m::MA2)
θ = rand(TriangleDist())
return arma(N_tml, σ, nothing, SVector(θ[1],θ[2]))
end
=#
m1, m2 = MA1(), MA2()
models = [m1, m2]
ss_func(y) = autocor(y, 1:7)
dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
observations = simulate(m1)
ss_observations = ss_func(observations)
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 10000, 10000)
grid = Dict(:n_estimators => [300], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 10000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc)
X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup models, dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"using SpecialFunctions\n",
"using LinearAlgebra\n",
"using Random\n",
"using Distributions\n",
"using MarkovProcesses\n",
"\n",
"global n = 20\n",
"\n",
"struct Model1 <: Model end\n",
"struct Model2 <: Model end\n",
"struct Model3 <: Model end\n",
"import MarkovProcesses: simulate\n",
"\n",
"function simulate(m::Model1)\n",
" param = rand(Exponential(1))\n",
" return rand(Exponential(param), n)\n",
"end\n",
"function simulate(m::Model2)\n",
" param = rand(Normal())\n",
" return rand(LogNormal(param,1), n)\n",
"end\n",
"function simulate(m::Model3)\n",
" param = rand(Exponential(1))\n",
" return rand(Gamma(2,1/param), n)\n",
"end\n",
"\n",
"m1, m2, m3 = Model1(), Model2(), Model3()\n",
"lh_m1(s::Vector) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))\n",
"lh_m2(s::Vector) = exp(-s[2]^2/(2n*(n+1)) - (s[3]^2)/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)\n",
"lh_m3(s::Vector) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)\n",
"\n",
"ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]\n",
"dist_l2(s_sim,s_obs) = sqrt(dot(s_sim,s_obs))\n",
"\n",
"observations = simulate(m3)\n",
"ss_observations = ss_func(observations)\n",
"models = [m1, m2, m3]\n",
"abc_trainset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 29000, 29000)\n",
"abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)\n",
"\n",
"list_lh = [lh_m1, lh_m2, lh_m3]\n",
"prob_model(ss::Vector, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])\n",
"prob_model(ss::Vector, idx_model) = prob_model(ss, list_lh, idx_model)\n",
"prob_model3(ss::Vector) = prob_model(ss, list_lh, 3)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Plot"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"using Plots\n",
"\n",
"p = plot(title=\"Trainset\")\n",
"colors = [\"black\", \"red\", \"green\"]\n",
"begin_idx = 1\n",
"for i = 1:3\n",
" models_i = findall(x->x==i, abc_testset.models_indexes)\n",
" nbr_obs = length(models_i)\n",
" end_idx = begin_idx + nbr_obs - 1\n",
" lh = list_lh[i]\n",
" scatter!(p, begin_idx:end_idx, \n",
" vec(mapslices(prob_model3, abc_testset.summary_stats_matrix[:,models_i], dims = 1)), \n",
" color = colors[i], markersize = 3.0, markershape = :cross, label = \"Model $i\")\n",
" global begin_idx = end_idx + 1\n",
"end\n",
"p"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Classification models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"using ScikitLearn\n",
"@sk_import linear_model: LogisticRegression\n",
"@sk_import ensemble: RandomForestClassifier\n",
"@sk_import metrics: (classification_report, confusion_matrix)\n",
"@sk_import neighbors: KNeighborsClassifier\n",
"\n",
"X_trainset = transpose(abc_trainset.X)\n",
"X_testset = transpose(abc_testset.X)\n",
"\n",
"logit_reg = fit!(LogisticRegression(), X_trainset, abc_trainset.y)\n",
"y_pred_logit = predict(logit_reg, X_testset)\n",
"println(classification_report(y_pred = y_pred_logit, y_true = abc_testset.y))\n",
"\n",
"rf_clf = fit!(RandomForestClassifier(n_estimators=500), X_trainset, abc_trainset.y)\n",
"y_pred_rf = predict(rf_clf, X_testset)\n",
"println(classification_report(y_pred = y_pred_rf, y_true = abc_testset.y))\n",
"\n",
"knn_clf = fit!(KNeighborsClassifier(n_neighbors=20), X_trainset, abc_trainset.y)\n",
"y_pred_knn = predict(rf_clf, X_testset)\n",
"println(classification_report(y_pred = y_pred_rf, y_true = abc_testset.y))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RF ABC"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res_rf = rf_abc_model_choice(models, ss_observations, ss_func, 29000; \n",
" hyperparameters_range = Dict(:n_estimators => [500]))\n",
"println(classification_report(y_pred = predict(res_rf.clf, X_testset), y_true = abc_testset.y))\n",
"println(confusion_matrix(y_pred = predict(res_rf.clf, X_testset), y_true = abc_testset.y))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dict_params = Dict()\n",
"for param in keys(get_params(res_rf.clf))\n",
" dict_params[Symbol(param)] = get_params(res_rf.clf)[param]\n",
"end\n",
"RandomForestClassifier(;dict_params...)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"oob_votes = res_rf.clf.oob_decision_function_\n",
"y_pred_oob = argmax.([oob_votes[i,:] for i = 1:size(oob_votes)[1]])\n",
"@show mean(y_pred_oob .== res_rf.reference_table.y)\n",
"@show res_rf.clf.oob_score_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 1.5.3",
"language": "julia",
"name": "julia-1.5"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment