Commit e0cc1705 authored by Simon's avatar Simon

removed features download option in training notebook

parent bf56e495
%% Cell type:markdown id: tags:
# Convolutional neural network and transfer learning for urban sound tagging
In this notebook, you will build and train a convolutional neural network (CNN) to perform urban sound tagging with [Keras](https://keras.io/). Using transfer learning, your CNN will build upon a model called [VGGish](https://github.com/tensorflow/models/tree/master/research/audioset/vggish). It was trained on [AudioSet](https://github.com/tensorflow/models/tree/master/research/audioset), a dataset of over 2 million human-labeled 10-second YouTube video soundtracks, with labels taken from an ontology of more than 600 audio event classes. This represents more than 5 thousand hours of audio.
The method you will implement here is based on ["Convolutional Neural Networks with Transfer Learning for Urban Sound Tagging"](http://dcase.community/documents/challenge2019/technical_reports/DCASE2019_Kim_107.pdf), proposed by Bongjun Kim (Department of Computer Science, Northwestern University, Evanston, Illinois, USA), which obtained the 3rd best score at the [DCASE 2019 Challenge, task 5](http://dcase.community/challenge2019/task-urban-sound-tagging).
**Before working on the rest of this notebook, take some time to read about and understand VGGish, AudioSet, and the above-mentioned method submitted to the DCASE 2019 challenge.**
%% Cell type:markdown id: tags:
## Environment setup
### Mount Google Drive
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Cell type:markdown id: tags:
### Get some utilities
%% Cell type:code id: tags:
``` python
import os
if not os.path.exists('mel_features.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/mel_features.py
if not os.path.exists('utils.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/utils.py
if not os.path.exists('vggish_params.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/vggish_params.py
```
%% Cell type:markdown id: tags:
### Define important paths
%% Cell type:code id: tags:
``` python
ust_data_dir = './drive/My Drive/data/ust-data'
dataset_dir = os.path.join(ust_data_dir, 'sonyc-ust')
annotation_file = os.path.join(dataset_dir, 'annotations.csv')
taxonomy_file = os.path.join(dataset_dir, 'dcase-ust-taxonomy.yaml')
log_mel_spec_dir = os.path.join(ust_data_dir, 'log-mel-spectrograms')
output_training_dir = os.path.join(ust_data_dir, 'output_training')
output_prediction_dir = os.path.join(ust_data_dir, 'output_prediction')
```
%% Cell type:markdown id: tags:
### Download features (optional)
If you haven't finished extracting the features yet, you can uncomment and run the cell below to download them and continue working.
%% Cell type:code id: tags:
``` python
# if not(os.path.isdir(log_mel_spec_dir)):
#     os.makedirs(log_mel_spec_dir)
# %pushd /content/drive/My\ Drive/data/ust-data/log-mel-spectrograms
# !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/data.npy
# %popd
```
%% Cell type:markdown id: tags:
### Install missing packages
%% Cell type:code id: tags:
``` python
!pip install oyaml
```
%% Cell type:markdown id: tags:
## Define parameters
%% Cell type:code id: tags:
``` python
%tensorflow_version 1.x
import os
import pandas as pd
import oyaml as yaml
import numpy as np
from utils import get_file_targets, get_subset_split
import datetime
import pytz
import json
import keras
from keras.models import Model
from keras.layers import Flatten, Dense, Input, Conv2D, MaxPooling2D, GlobalMaxPooling1D, Reshape
from keras.optimizers import Adam
import resampy
import vggish_params
from IPython.display import clear_output
```
%% Cell type:markdown id: tags:
In the following cell, you have to set several hyperparameters of the learning algorithm.
%% Cell type:code id: tags:
``` python
model_name = 'my_model'
learning_rate = ? # learning rate for gradient descent (actually, Adam optimization method)
batch_size = ? # size of the mini-batches
num_epochs = ? # number of epochs
patience = ? # early stopping patience
tz_Paris = pytz.timezone('Europe/Paris')
datetime_Paris = datetime.datetime.now(tz_Paris)
timestamp = datetime_Paris.strftime("%Y-%m-%d-%Hh%Mm%Ss")
exp_id = model_name + '_' + timestamp
print(exp_id)
```
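%% Cell type:markdown id: tags:
If you want an order of magnitude to start from, the commented values below are purely illustrative assumptions (they are not tuned for this task); pick and justify your own values in the cell above.
%% Cell type:code id: tags:
``` python
# Purely illustrative starting points (assumptions, not tuned for this task):
# learning_rate = 1e-4   # small step size, since part of the network is pre-trained
# batch_size = 64        # mini-batch size
# num_epochs = 100       # upper bound; early stopping usually ends training earlier
# patience = 20          # epochs without val_loss improvement before stopping
```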
%% Cell type:code id: tags:
``` python
# save parameters to disk
params = {'annotation_file': annotation_file,
          'taxonomy_file': taxonomy_file,
          'exp_id': exp_id,
          'log_mel_spec_dir': log_mel_spec_dir,
          'output_dir': output_training_dir,
          'learning_rate': learning_rate,
          'batch_size': batch_size,
          'num_epochs': num_epochs,
          'patience': patience}
results_dir = os.path.join(output_training_dir, exp_id)
os.makedirs(results_dir, exist_ok=True)
kwarg_file = os.path.join(results_dir, "hyper_params.json")
with open(kwarg_file, 'w') as f:
    json.dump(params, f, indent=2)
```
%% Cell type:markdown id: tags:
## Load annotations and taxonomy
%% Cell type:code id: tags:
``` python
# Create a Pandas DataFrame from the annotation CSV file
annotation_data = pd.read_csv(annotation_file).sort_values('audio_filename')
# List of all audio files
file_list = annotation_data['audio_filename'].unique().tolist()
# Load taxonomy
with open(taxonomy_file, 'r') as f:
    taxonomy = yaml.load(f, Loader=yaml.Loader)
# get list of labels from taxonomy
labels = ["_".join([str(k), v]) for k,v in taxonomy['coarse'].items()]
# list of one-hot encoded labels for all audio files
target_list = get_file_targets(annotation_data, labels)
# list of indices for the training, validation and test subsets
train_file_idxs, val_file_idxs, test_file_idxs = get_subset_split(annotation_data)
# number of classes
n_classes = len(labels)
```
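%% Cell type:markdown id: tags:
Optionally, you can run a quick sanity check on what was just loaded; the cell below only prints the sizes of the objects defined above.
%% Cell type:code id: tags:
``` python
# Optional sanity check on the annotations and taxonomy
print('Number of audio files:', len(file_list))
print('Coarse labels:', labels)
print('Train/val/test sizes:', len(train_file_idxs), len(val_file_idxs), len(test_file_idxs))
```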
%% Cell type:markdown id: tags:
## Load log-Mel spectrograms
%% Cell type:code id: tags:
``` python
how_saved = 'global' # 'individual' or 'global'
if how_saved == 'global':
    log_mel_spec_list = list(np.load(os.path.join(log_mel_spec_dir, 'data.npy')))
elif how_saved == 'individual':
    # Create a list of log-Mel spectrograms of size 998 frames × 64 Mel-frequency bins
    log_mel_spec_list = []
    for idx, filename in enumerate(file_list):
        clear_output(wait=True)
        log_mel_path = os.path.join(log_mel_spec_dir, os.path.splitext(filename)[0] + '.npy')
        log_mel_spec = np.load(log_mel_path)
        log_mel_spec_list.append(log_mel_spec)
        print('({}/{})'.format(idx+1, len(file_list)))
```
%% Cell type:code id: tags:
``` python
# Create training set (input, output) pairs
train_x = []
train_y = []
for idx in train_file_idxs:
    train_x.append(log_mel_spec_list[idx])
    train_y.append(target_list[idx])
perm_train_idxs = np.random.permutation(len(train_x))
train_x = np.array(train_x)[perm_train_idxs]
train_y = np.array(train_y)[perm_train_idxs]
# Create validation set (input, output) pairs
val_x = []
val_y = []
for idx in val_file_idxs:
    val_x.append(log_mel_spec_list[idx])
    val_y.append(target_list[idx])
perm_val_idxs = np.random.permutation(len(val_x))
val_x = np.array(val_x)[perm_val_idxs]
val_y = np.array(val_y)[perm_val_idxs]
```
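%% Cell type:markdown id: tags:
Before building the model, it is worth checking the array shapes; with the feature extraction used previously, each clip should yield a log-Mel spectrogram of 998 frames × 64 Mel bands.
%% Cell type:code id: tags:
``` python
# Each input should be a log-Mel spectrogram of shape (998, 64)
print('train_x:', train_x.shape, ' train_y:', train_y.shape)
print('val_x:  ', val_x.shape, ' val_y:  ', val_y.shape)
```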
%% Cell type:markdown id: tags:
## VGGish
%% Cell type:markdown id: tags:
[VGGish](https://github.com/tensorflow/models/tree/master/research/audioset/vggish) is a variant of the [VGG](https://arxiv.org/abs/1409.1556) model, in
particular Configuration A with 11 weight layers. Specifically, here are the
changes that were made:
* The input size was changed to 96x64 for log mel spectrogram audio inputs.
* The last group of convolutional and maxpool layers was dropped, so we now have
only four groups of convolution/maxpool layers instead of five.
* Instead of a 1000-wide fully connected layer at the end, a 128-wide
fully connected layer was used. This acts as a compact embedding layer.
You will have access to a pre-trained VGGish Keras model. It was trained on [AudioSet](https://github.com/tensorflow/models/tree/master/research/audioset), a dataset of over 2 million human-labeled 10-second YouTube video soundtracks, with labels taken from an ontology of more than 600 audio event classes. This represents more than 5 thousand hours of audio.
In the following cell, you have to define the VGGish model in Keras, using the [Functional API](https://keras.io/models/model/). Look for the information you need in the [VGG](https://arxiv.org/abs/1409.1556) paper, and take the above-mentioned modifications into account.
Hint: Look at the imports to know which Keras layers you need. A possible reference sketch is given after the next cell.
%% Cell type:code id: tags:
``` python
input_shape = (96, 64, 1) # see vggish_params.py
img_input = Input( shape=input_shape)
x = # TODO: Define the VGGish model in Keras, looking at the VGG paper
# and taking into account the above-mentioned changes.
vggish_model = Model(img_input, x, name='vggish')
```
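%% Cell type:markdown id: tags:
As a reference for the cell above, here is one possible sketch of the VGGish architecture written with the Functional API. It is only an illustration: the layer names (`conv1`, `conv2`, `conv3_1`, `conv3_2`, `conv4_1`, `conv4_2`, `fc1_1`, `fc1_2`, `fc2`) and the 4096-wide fully-connected layers are assumptions on our part, so check them against `vggish_params.py` and the layer shapes expected by the provided checkpoint. The sketch uses its own function name so that it does not overwrite your `vggish_model`.
%% Cell type:code id: tags:
``` python
# A possible sketch of VGGish (layer names and FC sizes are assumptions; verify against vggish_params.py)
def make_vggish_sketch(input_shape=(96, 64, 1)):
    inp = Input(shape=input_shape)
    # Group 1: 64 filters
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv1')(inp)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool1')(x)
    # Group 2: 128 filters
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool2')(x)
    # Group 3: two 256-filter convolutions
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_1')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool3')(x)
    # Group 4: two 512-filter convolutions (the fifth VGG group is dropped)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_1')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool4')(x)
    # Fully-connected part ending in a 128-dimensional embedding
    x = Flatten(name='flatten')(x)
    x = Dense(4096, activation='relu', name='fc1_1')(x)
    x = Dense(4096, activation='relu', name='fc1_2')(x)
    x = Dense(128, activation='relu', name='fc2')(x)
    return Model(inp, x, name='vggish_sketch')

# make_vggish_sketch().summary()  # uncomment to inspect the layer shapes
```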
%% Cell type:markdown id: tags:
Your goal is to use the pre-trained VGGish model for transfer learning, i.e. adapting the model to your specific task and dataset.
In the following cell, you will load the pre-trained weights into your model. It will not work correctly if you did not define the proper architecture. You can look at the architecture of your model with `vggish_model.summary()`.
%% Cell type:code id: tags:
``` python
vggish_weights_file = 'vggish_weights.ckpt'
if not os.path.exists(vggish_weights_file):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/vggish_weights.ckpt
vggish_model.load_weights(vggish_weights_file)
```
%% Cell type:markdown id: tags:
## Model adaptation
You cannot directly use the previous VGGish model. You have to define a new model that is inspired by VGGish but also satisfies the following requirements:
- Input layer: It should match the dimension of your audio clips.
- Convolutional layers (and intermediate max-pooling layers): You will use the same layers as in VGGish, up to ```conv4_2```. You will have to initialize the convolutional layers of your new model with the parameters from the pre-trained VGGish model. During training, you will freeze these parameters.
- Temporal pooling: We want to make clip-level predictions for arbitrary clip durations, so frame-level feature maps should be pooled (e.g. with a max pooling) along the time axis.
- Fully-connected layers: You will then have a few fully-connected layers that are randomly initialized and whose parameters are learned on the training set. Choose the output layer to perform 8-class prediction.
---
### Question
Why do we keep the convolutional layers only, and drop the fully-connected ones?
---
In the following cell, you first have to define this new model, then you will transfer some weights from VGGish. You can use ```my_init``` to initialize the parameters of the dense layers (see the documentation). A possible illustration is given after the cell.
%% Cell type:code id: tags:
``` python
my_init = keras.initializers.VarianceScaling(scale=1/3,
mode='fan_in',
distribution='uniform',
seed=None)
complete_model = # TODO
complete_model.summary()
```
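%% Cell type:markdown id: tags:
For illustration only, here is one way such an adapted model could look. Several choices below are assumptions rather than requirements: the input is taken as 998 frames × 64 bins, the convolutional stack stops right after `conv4_2` (whether to keep the following pooling layer is a design choice), temporal pooling is done with `Reshape` followed by `GlobalMaxPooling1D`, and the head has a single hidden dense layer before an 8-class sigmoid output. The convolutional layer names mirror the VGGish sketch above so that weights can later be copied layer by layer.
%% Cell type:code id: tags:
``` python
# One possible adapted model (a sketch, not the unique correct answer)
def make_adapted_model_sketch(n_frames=998, n_mels=64, n_classes=8):
    # Same convolutional stack as the VGGish sketch above, up to conv4_2,
    # but with an input size matching the 10-second clips (assumption: 998 x 64)
    inp = Input(shape=(n_frames, n_mels, 1))
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv1')(inp)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool1')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool2')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_1')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool3')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_1')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_2')(x)
    # Temporal pooling: merge the frequency and channel axes, then max-pool over time
    x = Reshape((-1, (n_mels // 8) * 512))(x)   # after 3 poolings, 64 bins -> 8
    x = GlobalMaxPooling1D(name='temporal_pooling')(x)
    # Randomly initialized classification head (sizes are a design choice)
    x = Dense(512, activation='relu', kernel_initializer=my_init, name='dense1')(x)
    out = Dense(n_classes, activation='sigmoid', kernel_initializer=my_init, name='output')(x)
    return Model(inp, out, name='adapted_vggish_sketch')

# complete_model = make_adapted_model_sketch()  # one possibility; adapt it to your own choices
```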
%% Cell type:markdown id: tags:
You now have to perform transfer learning. The first 6 convolutional layers are initialized with parameters from convolutional layers
of the pre-trained VGGish model. During training, the first four convolutional layers are fixed (not updated) and the remaining ones
are fine-tuned on the training set. Use the ```get_weights()```, ```set_weights()``` methods and the ```.trainable``` property of Keras layers.
%% Cell type:code id: tags:
``` python
# List of layers to copy (use the name of the layers)
layers_to_copy = []
for layer in layers_to_copy:
    # copy the weights, see 'get_layer', 'set_weights' and 'get_weights' in Keras
    pass  # TODO
# List of layers to freeze
layers_to_freeze = []
for layer in layers_to_freeze:
    # freeze the weights by setting the 'trainable' property to False
    pass  # TODO
```
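%% Cell type:markdown id: tags:
Here is a hedged sketch of what the transfer could look like, assuming your adapted model reuses the convolutional layer names `conv1`, `conv2`, `conv3_1`, `conv3_2`, `conv4_1` and `conv4_2` from the pre-trained VGGish model; adapt the lists to your own layer names. Note that changes to `trainable` only take effect when the model is (re)compiled, which is done in the training cell below.
%% Cell type:code id: tags:
``` python
# Sketch assuming both models use the same convolutional layer names
conv_layer_names = ['conv1', 'conv2', 'conv3_1', 'conv3_2', 'conv4_1', 'conv4_2']

# Copy the pre-trained convolutional weights into the adapted model
for name in conv_layer_names:
    pretrained_weights = vggish_model.get_layer(name).get_weights()
    complete_model.get_layer(name).set_weights(pretrained_weights)

# Freeze the first four convolutional layers; conv4_1 and conv4_2 stay trainable (fine-tuned)
for name in ['conv1', 'conv2', 'conv3_1', 'conv3_2']:
    complete_model.get_layer(name).trainable = False
```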
%% Cell type:markdown id: tags:
## Training
In Edit > Notebook settings or Runtime > Change runtime type, select GPU as the hardware accelerator.
---
### Question
What loss should you use?
---
%% Cell type:code id: tags:
``` python
loss = # TODO
# Set up callbacks for early stopping and monitoring the loss
cb = []
# checkpoint
model_weight_file = os.path.join(results_dir, 'best_model_weights.h5')
cb.append(keras.callbacks.ModelCheckpoint(model_weight_file,
save_weights_only=True,
save_best_only=True,
monitor='val_loss'))
# early stopping
cb.append(keras.callbacks.EarlyStopping(monitor='val_loss',
patience=patience))
# monitor losses
history_csv_file = os.path.join(results_dir, 'history.csv')
cb.append(keras.callbacks.CSVLogger(history_csv_file, append=True,
separator=','))
# Compile model using Adam optimizer
complete_model.compile(Adam(lr=learning_rate), loss=loss)
history = complete_model.fit(
x=train_x[:,:,:,np.newaxis], y=train_y, batch_size=batch_size, epochs=num_epochs,
validation_data=(val_x[:,:,:,np.newaxis], val_y), callbacks=cb, verbose=2)
# save architecture
with open(os.path.join(results_dir, 'model_architecture.json'), 'w') as json_file:
    json_file.write(complete_model.to_json())
```
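%% Cell type:markdown id: tags:
Optionally, once training has finished you can restore the weights of the best epoch saved by the `ModelCheckpoint` callback before any further evaluation (a small sketch, assuming the checkpoint file was written during training).
%% Cell type:code id: tags:
``` python
# Restore the weights of the epoch with the lowest validation loss
complete_model.load_weights(model_weight_file)
```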
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
train_loss, = plt.plot(history.history['loss'], label='training loss')
val_loss, = plt.plot(history.history['val_loss'], label='validation loss')
plt.legend(handles=[train_loss, val_loss])
plt.xlabel('epochs')
plt.ylabel('loss')
```