Commit 888e74c1 authored by Bentriou Mahmoud's avatar Bentriou Mahmoud

change abc dataset to matrix

parent 7d3b7089
 struct AbcModelChoiceDataset
     models_indexes::Vector{Int}
-    summary_stats_vector::Vector
+    summary_stats_matrix::Matrix
     epsilon::Float64
 end
@@ -14,7 +14,7 @@ end
 function getproperty(dataset::AbcModelChoiceDataset, sym::Symbol)
     if sym == :X
-        return dataset.summary_stats_vector
+        return dataset.summary_stats_matrix
     elseif sym == :y
         return dataset.models_indexes
     else
@@ -25,21 +25,21 @@ end
 function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}},
                                   summary_stats_observations,
                                   summary_stats_func::Function, distance_func::Function,
-                                  k::Int, N::Int)
+                                  k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
     nbr_models = length(models)
     models_prior = Categorical([1/nbr_models for i = 1:nbr_models])
-    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N)
+    return abc_model_choice_dataset(models, models_prior, summary_stats_observations, summary_stats_func, distance_func, k, N; dir_results = dir_results)
 end
 function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}, models_prior::DiscreteUnivariateDistribution,
                                   summary_stats_observations,
                                   summary_stats_func::Function, distance_func::Function,
-                                  k::Int, N::Int)
+                                  k::Int, N::Int; dir_results::Union{Nothing,String} = nothing)
     @assert length(models) >= 2 "Should contain at least 2 models"
     @assert ncategories(models_prior) == length(models) "Number of categories of the models' prior must equal the number of models"
     models_indexes = zeros(Int, N)
-    summary_stats_vector = Vector{typeof(summary_stats_observations)}(undef, N)
+    summary_stats_matrix = zeros(eltype(summary_stats_observations), length(summary_stats_observations), N)
     distances = zeros(N)
     bool_parametric = typeof(models) <: Vector{ParametricModel}
     for i = 1:N
@@ -52,12 +52,16 @@ function abc_model_choice_dataset(models::Vector{<:Union{Model,ParametricModel}}
         else
             sim = simulate(models[current_idx_model])
         end
-        summary_stats_vector[i] = summary_stats_func(sim)
-        distances[i] = distance_func(summary_stats_vector[i], summary_stats_observations)
+        ss_i = summary_stats_func(sim)
+        summary_stats_matrix[:,i] = ss_i
+        distances[i] = distance_func(ss_i, summary_stats_observations)
     end
     k_nn = sortperm(distances, alg = QuickSort)[1:k]
-    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_vector[k_nn], distances[k_nn[end]])
+    if dir_results != nothing
+        dir_results = basename(dir_results) != "" ? dir_results * "/" : dir_results
+    end
+    return AbcModelChoiceDataset(models_indexes[k_nn], summary_stats_matrix[:,k_nn], distances[k_nn[end]])
 end
 function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
@@ -69,7 +73,7 @@ function rf_abc_model_choice(models::Vector{<:Union{Model,ParametricModel}},
     @assert k <= N_ref
     trainset = abc_model_choice_dataset(models, summary_stats_observations, summary_stats_func, distance_func, k, N_ref)
     gridsearch = GridSearchCV(RandomForestClassifier(oob_score=true), hyperparameters_range)
-    fit!(gridsearch, trainset.X, trainset.y)
+    fit!(gridsearch, transpose(trainset.X), trainset.y)
     best_rf = gridsearch.best_estimator_
     return RandomForestABC(trainset, best_rf, summary_stats_observations, predict(best_rf, [summary_stats_observations]))
 end
@@ -84,7 +88,7 @@ function posterior_proba_model(rf_abc::RandomForestABC)
         dict_params[Symbol(param)] = get_params(rf_abc.clf)[param]
     end
     rf_regressor = RandomForestRegressor(;dict_params...)
-    fit!(rf_regressor, rf_abc.reference_table.X, y_oob_regression)
+    fit!(rf_regressor, transpose(rf_abc.reference_table.X), y_oob_regression)
     return 1 - predict(rf_regressor, [rf_abc.summary_stats_observations])[1]
 end
@@ -66,7 +66,8 @@ savefig("set.svg")
 grid = Dict(:n_estimators => [500], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
 @timev res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 29000; hyperparameters_range = grid)
 @show posterior_proba_model(res_rf_abc)
-println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, abc_testset.X)))
-@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, abc_testset.X))
+X_testset = transpose(abc_testset.X)
+println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
+@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
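
A note on the layout this commit introduces: `summary_stats_matrix` stores one accepted simulation per column (statistics × samples), while scikit-learn estimators expect samples × features, which is why the hunks above wrap `trainset.X` in `transpose`. A minimal sketch of the convention (shapes are hypothetical, not from the commit):

```julia
# Hypothetical shapes for illustration only.
ss_matrix = rand(7, 1000)      # 7 summary statistics x 1000 accepted simulations
models_idx = rand(1:2, 1000)   # model index (the y labels), one per column
# Each column is one sample, so a scikit-learn fit takes the transpose:
# fit!(clf, transpose(ss_matrix), models_idx)
```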
# From Pudlo et al., "Reliable ABC model choice via random forests", 2016, Appendix B
using ARFIMA
using Random
using LinearAlgebra
using MarkovProcesses
using Distributions
using ScikitLearn
@sk_import metrics: (accuracy_score, classification_report)
using StatsBase: autocor
struct MA1 <: Model end
struct MA2 <: Model end
import MarkovProcesses: simulate
global N_tml = 100
global σ = 1.0
struct TriangleDist <: ContinuousMultivariateDistribution end
function Distributions.rand(d::TriangleDist)
    θ1 = rand(Uniform(-2, 2))
    θ2 = (θ1 < 0) ? rand(Uniform(-θ1-1, 1)) : rand(Uniform(θ1-1, 1))
    return [θ1, θ2]
end
Distributions.rand!(d::TriangleDist, p::AbstractVector) = p[:] = rand(d)
Distributions.length(d::TriangleDist) = 2
# Uniform density over the triangle with vertices (-2,1), (0,-1), (2,1), whose area is 4.
Distributions.pdf(d::TriangleDist, p::AbstractVector) = 1/4
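# A hypothetical sanity check (not part of the original script): every draw from
# TriangleDist should satisfy the MA(2) triangle constraints θ1 ∈ (-2, 2) and
# |θ1| - 1 <= θ2 < 1, which the two Uniform branches above encode.
let d = TriangleDist()
    for _ in 1:10_000
        θ1, θ2 = rand(d)
        @assert -2 < θ1 < 2 && abs(θ1) - 1 <= θ2 < 1
    end
end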
function simulate(m::MA1)
    θ1 = rand(Uniform(-1, 1))
    x = zeros(N_tml)
    # Normal(μ, σ): the second argument of Distributions.Normal is the standard deviation.
    ϵtm1 = rand(Normal(0, σ))
    x[1] = ϵtm1
    for t = 2:N_tml
        ϵt = rand(Normal(0, σ))
        x[t] = ϵt - θ1*ϵtm1
        ϵtm1 = ϵt
    end
    return x
end
function simulate(m::MA2)
    θ1, θ2 = rand(TriangleDist())
    x = zeros(N_tml)
    # The first two noise draws play the role of ε_1 and ε_2 (zero pre-sample noise).
    ϵtm1 = rand(Normal(0, σ))
    ϵtm2 = rand(Normal(0, σ))
    x[1] = ϵtm2
    x[2] = ϵtm1 - θ1*ϵtm2
    for t = 3:N_tml
        ϵt = rand(Normal(0, σ))
        x[t] = ϵt - θ1*ϵtm1 - θ2*ϵtm2
        ϵtm2 = ϵtm1
        ϵtm1 = ϵt
    end
    return x
end
#=
function simulate(m::MA1)
    θ1 = rand(Uniform(-1, 1))
    return arma(N_tml, σ, nothing, SVector(θ1))
end
function simulate(m::MA2)
    θ = rand(TriangleDist())
    return arma(N_tml, σ, nothing, SVector(θ[1], θ[2]))
end
=#
m1, m2 = MA1(), MA2()
models = [m1, m2]
ss_func(y) = autocor(y, 1:7)
dist_l2(s_sim,s_obs) = norm(s_sim-s_obs)
observations = simulate(m1)
ss_observations = ss_func(observations)
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 10000, 10000)
grid = Dict(:n_estimators => [300], :min_samples_leaf => [1], :min_samples_split => [2], :n_jobs => [8])
res_rf_abc = rf_abc_model_choice(models, ss_observations, ss_func, 10000; hyperparameters_range = grid)
@show posterior_proba_model(res_rf_abc)
X_testset = transpose(abc_testset.X)
println(classification_report(y_true = abc_testset.y, y_pred = predict(res_rf_abc.clf, X_testset)))
@show accuracy_score(abc_testset.y, predict(res_rf_abc.clf, X_testset))
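# Note (an observation, not part of the original script): since k == N above, the
# sortperm(...)[1:k] selection in abc_model_choice_dataset keeps every simulation,
# so abc_testset is a plain prior-predictive sample. A hypothetical equivalent:
function prior_predictive_sample(models, ss_func, N)
    idx = rand(1:length(models), N)                          # uniform model prior
    ss = hcat((ss_func(simulate(models[i])) for i in idx)...)
    return idx, ss                                           # labels, stats x N matrix
end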
%% Cell type:markdown id: tags:
# Setup models, dataset
%% Cell type:code id: tags:
``` julia
using SpecialFunctions
using LinearAlgebra
using Random
using Distributions
using MarkovProcesses
global n = 20
struct Model1 <: Model end
struct Model2 <: Model end
struct Model3 <: Model end
import MarkovProcesses: simulate
function simulate(m::Model1)
    param = rand(Exponential(1))
    return rand(Exponential(param), n)
end
function simulate(m::Model2)
    param = rand(Normal())
    return rand(LogNormal(param, 1), n)
end
function simulate(m::Model3)
    param = rand(Exponential(1))
    return rand(Gamma(2, 1/param), n)
end
m1, m2, m3 = Model1(), Model2(), Model3()
lh_m1(s::Vector) = exp(log(gamma(n+1)) - (n+1)*log(1+s[1]))
lh_m2(s::Vector) = exp(-s[2]^2/(2n*(n+1)) - s[3]/2 + (s[2]^2)/(2n) - s[2]) * (2pi)^(-n/2)*(n+1)^(-1/2)
lh_m3(s::Vector) = exp(s[2])*gamma(2n+1)/gamma(2)^n * (1+s[1])^(-2n-1)
ss_func(y) = [sum(y), sum(log.(y)), sum(log.(y).^2)]
dist_l2(s_sim, s_obs) = norm(s_sim - s_obs)
observations = simulate(m3)
ss_observations = ss_func(observations)
models = [m1, m2, m3]
abc_trainset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 29000, 29000)
abc_testset = abc_model_choice_dataset(models, ss_observations, ss_func, dist_l2, 1000, 1000)
list_lh = [lh_m1, lh_m2, lh_m3]
prob_model(ss::Vector, list_lh, idx_model) = list_lh[idx_model](ss) / sum([list_lh[i](ss) for i = eachindex(list_lh)])
prob_model(ss::Vector, idx_model) = prob_model(ss, list_lh, idx_model)
prob_model3(ss::Vector) = prob_model(ss, list_lh, 3)
```
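%% Cell type:markdown id: tags:
The `prob_model` helpers above compute the exact posterior probability of each model under the uniform model prior (a restatement of the code, added for clarity):

$$P(\mathcal{M} = m_j \mid s) = \frac{lh_{m_j}(s)}{\sum_i lh_{m_i}(s)}$$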
%% Cell type:markdown id: tags:
# Plot
%% Cell type:code id: tags:
``` julia
using Plots
p = plot(title="Testset")
colors = ["black", "red", "green"]
begin_idx = 1
for i = 1:3
    models_i = findall(x -> x == i, abc_testset.models_indexes)
    nbr_obs = length(models_i)
    end_idx = begin_idx + nbr_obs - 1
    lh = list_lh[i]
    scatter!(p, begin_idx:end_idx,
             vec(mapslices(prob_model3, abc_testset.summary_stats_matrix[:,models_i], dims = 1)),
             color = colors[i], markersize = 3.0, markershape = :cross, label = "Model $i")
    global begin_idx = end_idx + 1
end
p
```
%% Cell type:markdown id: tags:
# Classification models
%% Cell type:code id: tags:
``` julia
using ScikitLearn
@sk_import linear_model: LogisticRegression
@sk_import ensemble: RandomForestClassifier
@sk_import metrics: (classification_report, confusion_matrix)
@sk_import neighbors: KNeighborsClassifier
X_trainset = transpose(abc_trainset.X)
X_testset = transpose(abc_testset.X)
logit_reg = fit!(LogisticRegression(), X_trainset, abc_trainset.y)
y_pred_logit = predict(logit_reg, X_testset)
println(classification_report(y_pred = y_pred_logit, y_true = abc_testset.y))
rf_clf = fit!(RandomForestClassifier(n_estimators=500), X_trainset, abc_trainset.y)
y_pred_rf = predict(rf_clf, X_testset)
println(classification_report(y_pred = y_pred_rf, y_true = abc_testset.y))
knn_clf = fit!(KNeighborsClassifier(n_neighbors=20), X_trainset, abc_trainset.y)
y_pred_knn = predict(knn_clf, X_testset)
println(classification_report(y_pred = y_pred_knn, y_true = abc_testset.y))
```
%% Cell type:markdown id: tags:
# RF ABC
%% Cell type:code id: tags:
``` julia
res_rf = rf_abc_model_choice(models, ss_observations, ss_func, 29000;
hyperparameters_range = Dict(:n_estimators => [500]))
println(classification_report(y_pred = predict(res_rf.clf, X_testset), y_true = abc_testset.y))
println(confusion_matrix(y_pred = predict(res_rf.clf, X_testset), y_true = abc_testset.y))
```
%% Cell type:code id: tags:
``` julia
dict_params = Dict()
for param in keys(get_params(res_rf.clf))
    dict_params[Symbol(param)] = get_params(res_rf.clf)[param]
end
RandomForestClassifier(;dict_params...)
```
%% Cell type:code id: tags:
``` julia
oob_votes = res_rf.clf.oob_decision_function_
y_pred_oob = argmax.([oob_votes[i,:] for i = 1:size(oob_votes)[1]])
@show mean(y_pred_oob .== res_rf.reference_table.y)
@show res_rf.clf.oob_score_
```
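%% Cell type:markdown id: tags:
The cell above checks that the row-wise argmax of the out-of-bag vote matrix reproduces scikit-learn's `oob_score_`, i.e.

$$\mathtt{oob\_score\_} = \frac{1}{N}\sum_{i=1}^{N}\mathbf{1}\!\left[\operatorname{argmax}_j\, \mathtt{oob\_votes}_{ij} = y_i\right]$$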