diff --git a/algorithms/abc_model_choice.jl b/algorithms/abc_model_choice.jl
index 44ee0ce776773ecf833542463096a613352f7a17..c5e0272696e6b0cefdf24b192263f52f316fbf17 100644
--- a/algorithms/abc_model_choice.jl
+++ b/algorithms/abc_model_choice.jl
@@ -1,7 +1,7 @@
 
 struct AbcModelChoiceDataset
     models_indexes::Vector{Int}
-    summary_stats_vector::Vector
+    summary_stats_matrix::Matrix
     epsilon::Float64
 end
 
@@ -14,7 +14,7 @@ end
 
 function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
     if sym == :X
-        return dataset.summary_stats_vector
+        return dataset.summary_stats_matrix
     elseif sym == :y
         return dataset.models_indexes
     else
@@ -25,21 +25,21 @@ end
 function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
                                   summary_stats_observations,
                                   summary_stats_func::Function, distance_func::Function,
-                                  k::Int, N::Int)
+                                  k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
     nbr_models = length(models)
     models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
-    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N)
+    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N; dir_results = dir_results)
 end
 
 function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
                                   models_prior::DiscreteUnivariateDistribution,
                                   summary_stats_observations,
                                   summary_stats_func::Function, distance_func::Function,
-                                  k::Int, N::Int)
+                                  k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
     @assert length(models) >= 2 "Should contain at least 2 models"
     @assert ncategories(models_prior) == length(models) "Number of categories of models' prior and number of models do not equal"
     models_indexes = zeros(Int, N)
-    summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N)
+    summary_stats_matrix = zeros(eltype(summary_stats_observations), length(summary_stats_observations), N)
     distances = zeros(N)
     bool_parametric = typeof(models) <: Vector{ParametricModel}
     for i = 1:N
@@ -52,12 +52,16 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}
         else
             sim = simulate(models[current_idx_model])
         end
-        summary_stats_vector[i] = summary_stats_func(sim)
-        distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations)
+        ss_i = summary_stats_func(sim)
+        summary_stats_matrix[:,i] = ss_i
+        distances[i] = distance_func(ss_i, summary_stats_observations)
     end
 
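+    # Keep the k simulations whose summary statistics are closest to the observed ones;
+    # the epsilon stored in the dataset is the distance of the k-th nearest neighbour.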
dir_results * "/" : dir_results + end + return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_matrix[:,k_nn], distances[k_nn[end]]) end function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}}, @@ -69,7 +73,7 @@ function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}}, @assert k <= N_ref trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref) gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range) - fit!(gridsearch, trainset.X, trainset.y) + fit!(gridsearch, transpose(trainset.X), trainset.y) best_rf = gridsearch.best_estimator_ return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations])) end @@ -84,7 +88,7 @@ function posterior_proba_model(rf_abc::RandomForestABC) dict_params[Symbol(param)] = get_params(rf_abc.clf)[param] end rf_regressor = RandomForestRegressor(;dict_params...) - fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression) + fit!(rf_regressor, transpose(rf_abc.reference_table.X), y_oob_regression) return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1] end diff --git a/tests/abc_model_choice/toy_example.jl b/tests/abc_model_choice/toy_example.jl index 3cdef11021ac3013765fcc21e5bdbbe67550e4a0..0e7dfa01154737a1fead6d30c989cf3f1b748690 100644 --- a/tests/abc_model_choice/toy_example.jl +++ b/tests/abc_model_choice/toy_example.jl @@ -66,7 +66,8 @@ savefig("set.svg") grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8]) @timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid) -@show posterior_proba_model(res_rf_abc) -println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X))) -@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X)) +@show posterior_proba_model(res_rf_abc) +X_testset = transpose(abc_testset.X) +println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset))) +@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset)) diff --git a/tests/abc_model_choice/toy_example_ma.jl b/tests/abc_model_choice/toy_example_ma.jl new file mode 100644 index 0000000000000000000000000000000000000000..39a3a7d7e1b392f0e237ee7ddb6bb7d5750cbf7d --- /dev/null +++ b/tests/abc_model_choice/toy_example_ma.jl @@ -0,0 +1,82 @@ +# From Pudlo: Reliable ABC model choice, 2016, Appendix B + +using ARFIMA +using Random +using LinearAlgebra +using MarkovProcesses +using Distributions +using ScikitLearn +@sk_import metrics: (accuracy_score, classification_report) +using StatsBase: autocor + +struct MA1 <: Model end +struct MA2 <: Model end +import MarkovProcesses: simulate + +global N_tml = 100 +global σ = 1.0 + +struct TriangleDist <: ContinuousMultivariateDistribution end +function Distributions.rand(d::TriangleDist) + θ1 = rand(Uniform(-2, 2)) + θ2 = (θ1 < 0) ? 
+struct TriangleDist <: ContinuousMultivariateDistribution end
+function Distributions.rand(d::TriangleDist)
+    θ1 = rand(Uniform(-2, 2))
+    θ2 = (θ1 < 0) ? rand(Uniform(-θ1-1,1)) : rand(Uniform(θ1-1,1))
+    return [θ1, θ2]
+end
+Distributions.rand!(d::TriangleDist, p::AbstractVector) = p[:] = rand(d)
+Distributions.length(d::TriangleDist) = 2
+Distributions.pdf(d::TriangleDist, p::AbstractVector) = 1/8
+function simulate(m::MA1)
+    θ1 = rand(Uniform(-1, 1))
+    x = zeros(100)
+    ϵtm1 = rand(Normal(0,σ^2))
+    x[1] = ϵtm1
+    for t = 2:100
+        ϵt = rand(Normal(0,σ^2))
+        x[t] = ϵt - θ1*ϵtm1
+        ϵtm1 = ϵt
+    end
+    return x
+end
+function simulate(m::MA2)
+    θ1, θ2 = rand(TriangleDist())
+    x = zeros(100)
+    ϵtm1 = rand(Normal(0,σ^2))
+    ϵtm2 = rand(Normal(0,σ^2))
+    x[1] = ϵtm2
+    x[2] = ϵtm1 - θ1*ϵtm2
+    for t = 3:100
+        ϵt = rand(Normal(0,σ^2))
+        x[t] = ϵt - θ1*ϵtm1 - θ2*ϵtm2
+        ϵtm2 = ϵtm1
+        ϵtm1 = ϵt
+    end
+    return x
+end
+#=
+function simulate(m::MA1)
+    θ1 = rand(Uniform(-1, 1))
+    return arma(N_tml, σ, nothing, SVector(θ1))
+end
+function simulate(m::MA2)
+    θ = rand(TriangleDist())
+    return arma(N_tml, σ, nothing, SVector(θ[1],θ[2]))
+end
+=#
+
+m1, m2 = MA1(), MA2()
+models = [m1, m2]
+
+ss_func(y) = autocor(y, 1:7)
+dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
+
+observations = simulate(m1)
+ss_observations = ss_func(observations)
+abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 10000, 10000)
+
+grid = Dict(:n_estimators => [300], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
+res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 10000; hyperparameters_range = grid)
+@show posterior_proba_model(res_rf_abc)
+X_testset = transpose(abc_testset.X)
+println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
+@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
+
diff --git a/tests/abc_model_choice/toy_model.ipynb b/tests/abc_model_choice/toy_model.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..3387eb23c2f99ff130117f5a8368c989ba99ca9c
--- /dev/null
+++ b/tests/abc_model_choice/toy_model.ipynb
@@ -0,0 +1,195 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Setup models, dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "using SpecialFunctions\n",
+    "using LinearAlgebra\n",
+    "using Random\n",
+    "using Distributions\n",
+    "using MarkovProcesses\n",
+    "\n",
+    "global n = 20\n",
+    "\n",
+    "struct Model1 <: Model end\n",
+    "struct Model2 <: Model end\n",
+    "struct Model3 <: Model end\n",
+    "import MarkovProcesses: simulate\n",
+    "\n",
+    "function simulate(m::Model1)\n",
+    "    param = rand(Exponential(1))\n",
+    "    return rand(Exponential(param), n)\n",
+    "end\n",
+    "function simulate(m::Model2)\n",
+    "    param = rand(Normal())\n",
+    "    return rand(LogNormal(param,1), n)\n",
+    "end\n",
+    "function simulate(m::Model3)\n",
+    "    param = rand(Exponential(1))\n",
+    "    return rand(Gamma(2,1/param), n)\n",
+    "end\n",
+    "\n",
+    "m1, m2, m3 = Model1(), Model2(), Model3()\n",
+    "lh_m1(s::Vector) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))\n",
+    "lh_m2(s::Vector) = exp(-s[2]^2/(2n*(n+1)) - (s[3]^2)/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)\n",
+    "lh_m3(s::Vector) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)\n",
+    "\n",
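+    "# Summary statistics s = (sum(y), sum(log.(y)), sum(log.(y).^2)); the analytic likelihoods lh_m1, lh_m2, lh_m3 above are written in terms of these components.\n",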
+    "ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]\n",
+    "dist_l2(s_sim,s_obs) = norm(s_sim - s_obs)\n",
+    "\n",
+    "observations = simulate(m3)\n",
+    "ss_observations = ss_func(observations)\n",
+    "models = [m1, m2, m3]\n",
+    "abc_trainset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 29000, 29000)\n",
+    "abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)\n",
+    "\n",
+    "list_lh = [lh_m1, lh_m2, lh_m3]\n",
+    "prob_model(ss::Vector, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])\n",
+    "prob_model(ss::Vector, idx_model) = prob_model(ss, list_lh, idx_model)\n",
+    "prob_model3(ss::Vector) = prob_model(ss, list_lh, 3)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "using Plots\n",
+    "\n",
+    "p = plot(title=\"Trainset\")\n",
+    "colors = [\"black\", \"red\", \"green\"]\n",
+    "begin_idx = 1\n",
+    "for i = 1:3\n",
+    "    models_i = findall(x->x==i, abc_testset.models_indexes)\n",
+    "    nbr_obs = length(models_i)\n",
+    "    end_idx = begin_idx + nbr_obs - 1\n",
+    "    lh = list_lh[i]\n",
+    "    scatter!(p, begin_idx:end_idx, \n",
+    "             vec(mapslices(prob_model3, abc_testset.summary_stats_matrix[:,models_i], dims = 1)), \n",
+    "             color = colors[i], markersize = 3.0, markershape = :cross, label = \"Model $i\")\n",
+    "    global begin_idx = end_idx + 1\n",
+    "end\n",
+    "p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Classification models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "using ScikitLearn\n",
+    "@sk_import linear_model: LogisticRegression\n",
+    "@sk_import ensemble: RandomForestClassifier\n",
+    "@sk_import metrics: (classification_report, confusion_matrix)\n",
+    "@sk_import neighbors: KNeighborsClassifier\n",
+    "\n",
+    "X_trainset = transpose(abc_trainset.X)\n",
+    "X_testset = transpose(abc_testset.X)\n",
+    "\n",
+    "logit_reg = fit!(LogisticRegression(), X_trainset, abc_trainset.y)\n",
+    "y_pred_logit = predict(logit_reg, X_testset)\n",
+    "println(classification_report(y_pred = y_pred_logit, y_true = abc_testset.y))\n",
+    "\n",
+    "rf_clf = fit!(RandomForestClassifier(n_estimators=500), X_trainset, abc_trainset.y)\n",
+    "y_pred_rf = predict(rf_clf, X_testset)\n",
+    "println(classification_report(y_pred = y_pred_rf, y_true = abc_testset.y))\n",
+    "\n",
+    "knn_clf = fit!(KNeighborsClassifier(n_neighbors=20), X_trainset, abc_trainset.y)\n",
+    "y_pred_knn = predict(knn_clf, X_testset)\n",
+    "println(classification_report(y_pred = y_pred_knn, y_true = abc_testset.y))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RF ABC"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res_rf = rf_abc_model_choice(models, ss_observations, ss_func, 29000; \n",
+    "                             hyperparameters_range = Dict(:n_estimators => [500]))\n",
+    "println(classification_report(y_pred = predict(res_rf.clf, X_testset), y_true = abc_testset.y))\n",
+    "println(confusion_matrix(y_pred = predict(res_rf.clf, X_testset), y_true = abc_testset.y))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dict_params = Dict()\n",
+    "for param in keys(get_params(res_rf.clf))\n",
+    "    dict_params[Symbol(param)] = get_params(res_rf.clf)[param]\n",
+    "end\n",
+    "RandomForestClassifier(;dict_params...)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
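+    "# oob_decision_function_ holds the out-of-bag class probabilities: row i is the OOB vote distribution\n",
+    "# for reference-table point i, so the per-row argmax recovers the OOB-predicted model index.\n",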
"@show mean(y_pred_oob .== res_rf.reference_table.y)\n", + "@show res_rf.clf.oob_score_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 1.5.3", + "language": "julia", + "name": "julia-1.5" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}