Commit c4cc65e3 authored by Galtier Virginie's avatar Galtier Virginie
Browse files

updated version of the code resources

parent 90a4f6df
Pipeline #19466 failed with stage
in 10 seconds
This diff is collapsed.
auto_sklearn==0.14.5
PipelineProfiler==0.1.17
scikit_learn==1.0.2
This diff is collapsed.
%% Cell type:markdown id: tags:
# MLFlow Example
## 1. Data and uninstrumented code
Iris dataset
Random Forest model
Scikit-learn library
%% Cell type:code id: tags:
``` python
# Iris dataset: load the features/targets, then hold out a test split.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the full dataset (features and class labels).
iris_dataset = load_iris()
X, y = iris_dataset.data, iris_dataset.target

# Split into train/test subsets (scikit-learn default split sizes).
X_train, X_test, y_train, y_test = train_test_split(X, y)
```
%% Cell type:code id: tags:
``` python
# Train a random-forest classifier, then rebuild the usual metrics by hand
# from the confusion matrix and cross-check them against scikit-learn.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, accuracy_score
import numpy as np

# Hyperparameters of the forest.
n_estimators = 10  # number of trees in the forest
max_depth = 4  # maximum depth of each tree
max_features = 3  # number of features considered when looking for the best split
bootstrap = True  # whether bootstrap samples are used when building trees
max_samples = 5  # if bootstrap is True, number of samples drawn from X per base estimator

randomForestClassifier_model = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    bootstrap=bootstrap,
    max_samples=max_samples,
)
randomForestClassifier_model.fit(X_train, y_train)

# Model evaluation on the held-out test set.
y_pred = randomForestClassifier_model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
diag = np.diag(cm)  # per-class true positives sit on the diagonal

tp_per_class = diag  # true positives
print("true positives: ", tp_per_class, " --> ", tp_per_class.sum())
TP = tp_per_class.sum()
fp_per_class = cm.sum(axis=0) - diag  # false positives (column total minus diagonal)
print("false positives: ", fp_per_class, " --> ", fp_per_class.sum())
FP = fp_per_class.sum()
fn_per_class = cm.sum(axis=1) - diag  # false negatives (row total minus diagonal)
print("false negatives: ", fn_per_class, " --> ", fn_per_class.sum())
FN = fn_per_class.sum()
tn_per_class = cm.sum() - (fp_per_class + fn_per_class + diag)  # true negatives
print("true negatives: ", tn_per_class, " --> ", tn_per_class.sum())
TN = tn_per_class.sum()

# Derived metrics from the aggregated counts, printed next to the
# scikit-learn micro-averaged equivalents for comparison.
recall = TP/(TP+FN)
print("recall = sensitivity = hit rate = true positive rate = TP/(TP+FN) = ", recall_score(y_test, y_pred, average="micro"), " ", recall)
specificity = TN/(TN+FP)
print("specificity = true negative rate = TN/(TN+FP) = ", specificity)
precision = TP/(TP+FP)
print("precision = positive predictive value = TP/(TP+FP) = ", precision_score(y_test, y_pred, average="micro"), " ", precision)
fall_out = FP/(FP+TN)
print("1-specificity = fall out = false positive rate = FP/(FP+TN) = ", fall_out)
miss_rate = FN/(TP+FN)
print("miss rate = false negative rate = FN/(TP+FN) = ", miss_rate)
accuracy = (TP+TN)/(TP+FP+FN+TN)
# NOTE: on multiclass data this aggregated formula includes TN counts, so it
# differs from accuracy_score — both values are printed for comparison.
print("accuracy: ", accuracy_score(y_test, y_pred), " ", accuracy)

# Use the trained model on one new observation.
X_new = [[5.7, 2.8, 4.5, 1.3]]
y_pred = randomForestClassifier_model.predict(X_new)
print("class of the flower described by ", X_new, ": ", y_pred)
```
%%%% Output: stream
true positives: [ 9 9 17] --> 35
false positives: [0 1 2] --> 3
false negatives: [0 2 1] --> 3
true negatives: [29 26 18] --> 73
recall = sensitivity = hit rate = true positive rate = TP/(TP+FN) = 0.9210526315789473 0.9210526315789473
specificity = true negative rate = TN/(TN+FP) = 0.9605263157894737
precision = positive predictive value = TP/(TP+FP) = 0.9210526315789473 0.9210526315789473
1-specificity = fall out = false positive rate = FP/(FP+TN) = 0.039473684210526314
miss rate = false negative rate = FN/(TP+FN) = 0.07894736842105263
accuracy: 0.9210526315789473 0.9473684210526315
class of the flower described by [[5.7, 2.8, 4.5, 1.3]] : [1]
%% Cell type:markdown id: tags:
## 2. MLFlow
### 2.1. Configuration of the backend store and artifact store
For this local-execution example, both stores are written as files into a "mlruns" directory.
See https://www.mlflow.org/docs/latest/tracking.html#scenario-1-mlflow-on-localhost.
%% Cell type:code id: tags:
``` python
import mlflow

# Point MLflow at a local file-based backend/artifact store (the "mlruns" directory).
mlflow.set_tracking_uri("file:./mlruns")

# Two experiments cannot share the same name, so the name is suffixed with the
# current date and time; this keeps it unique when the demo is run several times.
from datetime import datetime
timestamp = datetime.now().strftime("%d/%m/%Y_%H:%M:%S")
experiment_id = mlflow.create_experiment("expe_" + timestamp)
# Note: if runs are deleted from the UI, it might be necessary to add a
# .trash directory inside the mlruns directory.
```
%% Cell type:markdown id: tags:
### 2.2. First mlflow record
We copy the uninstrumented code and add mlflow instructions to record:
- hyperparameters values
- trained model
- performance metrics
Once the code is run, to visualize the results:
- change directory to the parent directory of "mlruns"
- launch the `mlflow ui` command
- open a browser to `localhost:5000`
%% Cell type:code id: tags:
``` python
with mlflow.start_run(experiment_id=experiment_id):
    # Random Forest Classifier training
    #----------------------------------
    from sklearn.ensemble import RandomForestClassifier
    n_estimators = 10   # number of trees in the forest
    max_depth = 4       # maximum depth of each tree
    max_features = 3    # number of features considered when looking for the best split
    bootstrap = True    # whether bootstrap samples are used when building trees
    max_samples = 5     # if bootstrap is True, number of samples drawn from X per base estimator
    randomForestClassifier_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                          max_features=max_features, bootstrap=bootstrap,
                                                          max_samples=max_samples)
    # Record the hyperparameter values.
    # Fix: log the variables rather than hardcoded literals, so the recorded
    # values cannot drift from the ones the model was actually built with.
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("bootstrap", bootstrap)
    mlflow.log_param("max_samples", max_samples)
    randomForestClassifier_model.fit(X_train, y_train)
    # Record the trained model (logged through the scikit-learn model flavor).
    mlflow.sklearn.log_model(sk_model=randomForestClassifier_model, artifact_path="model")
    # Model evaluation
    #-----------------
    # Aggregate TP/FP/FN/TN over all classes from the confusion matrix.
    y_pred = randomForestClassifier_model.predict(X_test)
    from sklearn.metrics import confusion_matrix
    import numpy as np
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
    diag = np.diag(cm)
    TP = diag.sum()                     # true positives
    FP = (cm.sum(axis=0) - diag).sum()  # false positives
    FN = (cm.sum(axis=1) - diag).sum()  # false negatives
    TN = (cm.sum() - (cm.sum(axis=0) - diag + cm.sum(axis=1) - diag + diag)).sum()  # true negatives
    recall = TP / (TP + FN)
    specificity = TN / (TN + FP)
    precision = TP / (TP + FP)
    fall_out = FP / (FP + TN)
    miss_rate = FN / (TP + FN)
    accuracy = (TP + TN) / (TP + FP + FN + TN)
    # Record the metrics.
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("specificity", specificity)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("fall out", fall_out)
    mlflow.log_metric("miss rate", miss_rate)
    mlflow.log_metric("accuracy", accuracy)
```
%% Cell type:markdown id: tags:
### 2.3. Record multiple runs, with various hyperparameters values
%% Cell type:code id: tags:
``` python
# Grid over hyperparameter combinations, recording one MLflow run for each.
# Fix: the imports are hoisted out of the innermost loop body (they were
# re-executed on every iteration) and unused metric imports are dropped.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import numpy as np

for n_estimators in [1, 10, 100]:             # number of trees in the forest
    for max_depth in [2, None]:               # maximum depth of each tree
        for max_features in [2, 4]:           # features considered when looking for the best split
            for bootstrap in [True, False]:   # whether bootstrap samples are used when building trees
                for max_samples in [5, 50, 100]:  # if bootstrap is True, samples drawn per base estimator
                    with mlflow.start_run(experiment_id=experiment_id):
                        # Train the forest for this hyperparameter combination.
                        randomForestClassifier_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                                              max_features=max_features, bootstrap=bootstrap,
                                                                              max_samples=max_samples)
                        # Record the hyperparameter values.
                        mlflow.log_param("n_estimators", n_estimators)
                        mlflow.log_param("max_depth", max_depth)
                        mlflow.log_param("max_features", max_features)
                        mlflow.log_param("bootstrap", bootstrap)
                        mlflow.log_param("max_samples", max_samples)
                        randomForestClassifier_model.fit(X_train, y_train)
                        # Record the trained model (scikit-learn model flavor).
                        mlflow.sklearn.log_model(sk_model=randomForestClassifier_model, artifact_path="model")
                        # Model evaluation: aggregate TP/FP/FN/TN over all classes.
                        y_pred = randomForestClassifier_model.predict(X_test)
                        cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
                        diag = np.diag(cm)
                        TP = diag.sum()                     # true positives
                        FP = (cm.sum(axis=0) - diag).sum()  # false positives
                        FN = (cm.sum(axis=1) - diag).sum()  # false negatives
                        TN = (cm.sum() - (cm.sum(axis=0) - diag + cm.sum(axis=1) - diag + diag)).sum()  # true negatives
                        recall = TP / (TP + FN)
                        specificity = TN / (TN + FP)
                        precision = TP / (TP + FP)
                        fall_out = FP / (FP + TN)
                        miss_rate = FN / (TP + FN)
                        accuracy = (TP + TN) / (TP + FP + FN + TN)
                        # Record the metrics.
                        mlflow.log_metric("recall", recall)
                        mlflow.log_metric("specificity", specificity)
                        mlflow.log_metric("precision", precision)
                        mlflow.log_metric("fall out", fall_out)
                        mlflow.log_metric("miss rate", miss_rate)
                        mlflow.log_metric("accuracy", accuracy)
```
%% Cell type:markdown id: tags:
# MLFlow Example
Import and use a model saved by mlflow
%% Cell type:code id: tags:
``` python
import mlflow
import pandas as pd

# URI of a model previously logged by MLflow (run id / artifact path).
logged_model = 'runs:/cb4e243740a8454e9c0d1e85b76ea75a/model'
# Load it back as a generic PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a pandas DataFrame built from a single new observation.
X_new = [[5.7, 2.8, 4.5, 1.3]]
y_pred = loaded_model.predict(pd.DataFrame(X_new))
print("class of the flower described by ", X_new, ": ", y_pred)
```
%%%% Output: stream
class of the flower described by [[5.7, 2.8, 4.5, 1.3]] : [1]
mlflow==1.22.0
numpy==1.21.1
pandas==1.3.0
scikit_learn==1.0.2
This diff is collapsed.
matplotlib==3.4.2
numpy==1.21.1
scikit_learn==1.0.2
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>ONNXExample</groupId>
<artifactId>ONNXExample</artifactId>
<version>0.0.1-SNAPSHOT</version>
<build>
<sourceDirectory>src</sourceDirectory>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
<configuration>
<release>11</release>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>com.microsoft.onnxruntime</groupId>
<artifactId>onnxruntime</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>com.microsoft.onnxruntime</groupId>
<artifactId>onnxruntime_gpu</artifactId>
<version>1.10.0</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package onnx_example;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import ai.onnxruntime.OnnxTensor;
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtSession;
import ai.onnxruntime.OrtSession.Result;
/**
 * Loads the exported random-forest model (rf_iris.onnx) with ONNX Runtime
 * and runs one inference on a single Iris observation.
 *
 * Fix: the OrtSession and the inference Result were never closed; both are
 * AutoCloseable, so they are now managed with try-with-resources to avoid
 * leaking native resources.
 */
public class ONNXImportExample {

	public static void main(String[] args) throws OrtException {
		OrtEnvironment env = OrtEnvironment.getEnvironment();
		try (OrtSession inferenceSession = env.createSession("../rf_iris.onnx", new OrtSession.SessionOptions())) {
			// First declared input/output names of the graph.
			String inputName = inferenceSession.getInputNames().toArray()[0].toString();
			String labelName = inferenceSession.getOutputNames().toArray()[0].toString();

			// One observation: the four Iris features, as float32.
			float[][] X_new_array = new float[][] { { 5.7f, 2.8f, 4.5f, 1.3f } };
			try (OnnxTensor X_new_tensor = OnnxTensor.createTensor(env, X_new_array)) {
				Map<String, OnnxTensor> inputs = Map.of(inputName, X_new_tensor);
				// Request only the label output.
				Set<String> requestedOutputs = new HashSet<>();
				requestedOutputs.add(labelName);
				// Closing the Result also releases the tensors it contains.
				try (Result results = inferenceSession.run(inputs, requestedOutputs)) {
					OnnxTensor predictions = (OnnxTensor) results.get(0);
					// NOTE(review): assumes the label buffer is array-backed — confirm
					// with the ONNX Runtime Java API docs for getLongBuffer().
					long[] predictionsJava = predictions.getLongBuffer().array();
					System.out.print(Arrays.toString(predictionsJava));
				}
			}
		}
	}
}
%% Cell type:markdown id: tags:
# ONNX Export Example
## 1. Random Forest Model Training on the Iris Dataset
%% Cell type:code id: tags:
``` python
# Train a random forest with default hyperparameters on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the data and hold out a test split.
iris_dataset = load_iris()
X, y = iris_dataset.data, iris_dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit the default RandomForestClassifier.
randomForestClassifier_default_model = RandomForestClassifier()
randomForestClassifier_default_model.fit(X_train, y_train)
```
%%%% Output: execute_result
RandomForestClassifier()
%% Cell type:markdown id: tags:
## 2. Scikit-learn Model Conversion to ONNX Format
See [`skl2onnx convert_sklearn` function](http://onnx.ai/sklearn-onnx/api_summary.html#converters)
%% Cell type:code id: tags:
``` python
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Declare the model input: an unknown number of rows (`None`) of 4 float
# values (the Iris features), hence a FloatTensorType of shape [None, 4].
initial_types = [('features_quadruplet_float_input', FloatTensorType([None, 4]))]

# Convert the scikit-learn model to an ONNX graph.
onnx_model = convert_sklearn(model=randomForestClassifier_default_model, initial_types=initial_types)

# Persist the serialized model to a .onnx file.
with open("rf_iris.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

# Display the first and last lines of the serialized model.
print(str(onnx_model)[:260] + "\n...")
print(str(onnx_model)[-760:-550])
```
%%%% Output: stream
ir_version: 8
producer_name: "skl2onnx"
producer_version: "1.10.4"
domain: "ai.onnx"
model_version: 0
doc_string: ""
graph {
node {
input: "features_quadruplet_float_input"
output: "label"
output: "probabilities"
name: "TreeEnsembleClassifier
...
features_quadruplet_float_input"
type {
tensor_type {
elem_type: 1
shape {
dim {
}
dim {
dim_value: 4
}
}
}
}
}
%% Cell type:markdown id: tags:
Alternatively, `skl2onnx.to_onnx` calls `convert_sklearn()` with simplified parameters: if the training set is provided, it is used to infer the input types.
%% Cell type:code id: tags:
``` python
from skl2onnx import to_onnx

# `to_onnx` wraps `convert_sklearn()` with simplified parameters: when the
# training set is supplied, the input types are inferred from it.
onnx_model_bis = to_onnx(model=randomForestClassifier_default_model, X=X_train)
# Display the first and last lines of the serialized model.
# Fix: display the model converted in THIS cell (`onnx_model_bis`) — the
# original printed `onnx_model` from the previous cell instead.
print(str(onnx_model_bis)[:260] + "\n...")
print(str(onnx_model_bis)[-760:-550])
```
%%%% Output: stream
ir_version: 8
producer_name: "skl2onnx"
producer_version: "1.10.4"
domain: "ai.onnx"
model_version: 0
doc_string: ""
graph {
node {
input: "features_quadruplet_float_input"
output: "label"
output: "probabilities"
name: "TreeEnsembleClassifier
...
features_quadruplet_float_input"
type {
tensor_type {
elem_type: 1
shape {
dim {
}
dim {
dim_value: 4
}
}
}
}
}
%% Cell type:markdown id: tags:
# ONNX Import Example
%% Cell type:code id: tags:
``` python
import onnxruntime as rt

# Open an inference session on the exported model and describe its interface.
session = rt.InferenceSession("rf_iris.onnx")
for model_input in session.get_inputs():
    print(model_input)
for model_output in session.get_outputs():
    print(model_output)

# Run the model on one new observation, requesting only the label output.
input_name = session.get_inputs()[0].name
label_name = session.get_outputs()[0].name
X_new = [[5.7, 2.8, 4.5, 1.3]]
y_pred = session.run(output_names=[label_name], input_feed={input_name: X_new})
print(y_pred[0])
```
%%%% Output: stream
NodeArg(name='features_quadruplet_float_input', type='tensor(float)', shape=[None, 4])
NodeArg(name='output_label', type='tensor(int64)', shape=[None])
NodeArg(name='output_probability', type='seq(map(int64,tensor(float)))', shape=[])
[1]
onnxruntime==1.10.0
scikit_learn==1.0.2
skl2onnx==1.10.4
This diff is collapsed.
## 1. The data
#-------------
import sklearn
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
X, Y = make_classification(n_samples=1000, n_features=20, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=100)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.6, shuffle=False)

## 2. AutoML
#-----------
# pip install auto-sklearn
import autosklearn
import autosklearn.classification
# time_left_for_this_task: time limit in seconds for the search of appropriate models.
# per_run_time_limit: time limit for a single call to the machine learning model.
#   Model fitting is terminated if the algorithm runs over the time limit; set it
#   high enough so that typical machine learning algorithms can fit the training data.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
)
automl.fit(X_train, Y_train)

## 3. Display the AutoML results
#-------------------------------
# Ranking of the evaluated models.
print(automl.leaderboard())
# Description of the models.
from pprint import pprint
pprint(automl.show_models(), indent=4)
# Prediction of the best model and its accuracy.
# Fix: use an explicit `from sklearn.metrics import accuracy_score` — a bare
# `import sklearn` does not guarantee the `sklearn.metrics` submodule is loaded.
Y_pred = automl.predict(X_test)
print("Accuracy score:", accuracy_score(Y_test, Y_pred))
\ No newline at end of file