Commit 17b61d66 authored by Simon

paths to resources on gitlab updated

parent 60c067fc
%% Cell type:markdown id: tags:
# Preliminaries
%% Cell type:markdown id: tags:
## Environment setup
### Mount Google Drive
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Cell type:markdown id: tags:
### Get some utilities
%% Cell type:code id: tags:
``` python
import os
if not os.path.exists('mel_features.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/mel_features.py
if not os.path.exists('utils.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/utils.py
if not os.path.exists('vggish_params.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/vggish_params.py
```
%% Cell type:markdown id: tags:
### Define important paths
%% Cell type:code id: tags:
``` python
ust_data_dir = './drive/My Drive/data/ust-data'
dataset_dir = os.path.join(ust_data_dir, 'sonyc-ust')
annotation_file = os.path.join(dataset_dir, 'annotations.csv')
taxonomy_file = os.path.join(dataset_dir, 'dcase-ust-taxonomy.yaml')
log_mel_spec_dir = os.path.join(ust_data_dir, 'log-mel-spectrograms')
output_training_dir = os.path.join(ust_data_dir, 'output_training')
output_prediction_dir = os.path.join(ust_data_dir, 'output_prediction')
```
%% Cell type:markdown id: tags:
### Install missing packages
%% Cell type:code id: tags:
``` python
!pip install oyaml
```
%% Cell type:markdown id: tags:
## Exploring the dataset
%% Cell type:markdown id: tags:
We will use [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) to manipulate the dataset.
%% Cell type:code id: tags:
``` python
import pandas as pd
import oyaml as yaml
from utils import get_file_targets, get_subset_split
import numpy as np
```
%% Cell type:code id: tags:
``` python
# Create a Pandas DataFrame from the annotation CSV file
annotation_data = pd.read_csv(annotation_file).sort_values('audio_filename')
# You can view the top rows of the frame with
annotation_data.head()
```
%% Cell type:code id: tags:
``` python
# List of all audio files
file_list = annotation_data['audio_filename'].unique().tolist()
```
%% Cell type:code id: tags:
``` python
# Load taxonomy
with open(taxonomy_file, 'r') as f:
    taxonomy = yaml.load(f, Loader=yaml.Loader)
# get list of labels from taxonomy
labels = ["_".join([str(k), v]) for k,v in taxonomy['coarse'].items()]
# number of classes
n_classes = len(labels)
print(labels)
```
%% Cell type:code id: tags:
``` python
# get list of one-hot encoded labels for all audio files
target_list = get_file_targets(annotation_data, labels)
# get list of indices for the training, validation and test subsets
train_file_idxs, val_file_idxs, test_file_idxs = get_subset_split(annotation_data)
```
%% Cell type:markdown id: tags:
For each split (training, validation, test) and each label, we compute the proportion of files that contain this label.
%% Cell type:code id: tags:
``` python
train_proportions = np.sum(target_list[train_file_idxs,:],
                           axis=0)/len(train_file_idxs)
val_proportions = np.sum(target_list[val_file_idxs,:],
                         axis=0)/len(val_file_idxs)
test_proportions = np.sum(target_list[test_file_idxs,:],
                          axis=0)/len(test_file_idxs)
print('Distribution of classes in the training set:')
for idx, label in enumerate(labels):
    print(label+': {:.2%}'.format(train_proportions[idx]))
print('\n')
print('Distribution of classes in the validation set:')
for idx, label in enumerate(labels):
    print(label+': {:.2%}'.format(val_proportions[idx]))
print('\n')
print('Distribution of classes in the test set:')
for idx, label in enumerate(labels):
    print(label+': {:.2%}'.format(test_proportions[idx]))
```
%% Cell type:markdown id: tags:
---
### Question
What conclusions can we draw from the distribution of classes in the training set?
---
%% Cell type:markdown id: tags:
We have only a few examples for `dog`, `music`, and `non-machinery-impact`. We can expect lower performance in the detection of these classes.
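%% Cell type:markdown id: tags:
As an optional sanity check, we can plot these proportions as a grouped bar chart. This is a minimal sketch reusing `labels`, `train_proportions`, `val_proportions` and `test_proportions` computed above:
%% Cell type:code id: tags:
``` python
import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart of the per-class proportion of files in each split
idx = np.arange(len(labels))
width = 0.25
plt.figure(figsize=(14, 4))
plt.bar(idx - width, train_proportions, width, label='train')
plt.bar(idx, val_proportions, width, label='validate')
plt.bar(idx + width, test_proportions, width, label='test')
plt.xticks(idx, labels, rotation=45, ha='right')
plt.ylabel('proportion of files')
plt.title('class proportions per split')
plt.legend()
plt.tight_layout()
```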
%% Cell type:markdown id: tags:
## Audio basics
We will use two libraries for loading and playing audio signals:
1. [Librosa](https://librosa.github.io/librosa/index.html) is a Python package for music and audio processing.
2. [IPython.display.Audio](https://ipython.org/ipython-doc/stable/api/generated/IPython.display.html#IPython.display.Audio) lets you play audio directly in notebooks.
%% Cell type:markdown id: tags:
### Reading audio
Use [`librosa.load`](https://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load) to load an audio file into an audio array. Return both the audio array as well as the sample rate:
%% Cell type:code id: tags:
``` python
import librosa
# get a file in the training set
training_file_list = [file_list[ind] for ind in train_file_idxs]
audio_file = os.path.join(dataset_dir, 'audio-dev/train',
training_file_list[10])
x, sr = librosa.load(audio_file, mono=True, sr=None)
```
%% Cell type:markdown id: tags:
Display the length of the audio array and sample rate:
%% Cell type:code id: tags:
``` python
print(x.shape)
print(sr)
```
%% Cell type:code id: tags:
``` python
import resampy
import vggish_params
# Resample to the sample rate used by the VGGish features (16 kHz)
old_sr = sr
sr = vggish_params.SAMPLE_RATE
x = resampy.resample(x, old_sr, sr)
```
%% Cell type:markdown id: tags:
### Visualizing Audio
%% Cell type:markdown id: tags:
In order to display plots inside the notebook, run the following command:
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
```
%% Cell type:code id: tags:
``` python
time_axis = np.arange(x.shape[0]) / sr  # one time stamp per sample, in seconds
plt.figure(figsize=(7, 3))
plt.plot(time_axis, x)
plt.title('waveform')
plt.ylabel('amplitude')
plt.xlabel('time (s)')
```
%% Cell type:markdown id: tags:
### Playing Audio
%% Cell type:markdown id: tags:
Using [`IPython.display.Audio`](http://ipython.org/ipython-doc/2/api/generated/IPython.lib.display.html#IPython.lib.display.Audio), you can play an audio file:
%% Cell type:code id: tags:
``` python
import IPython.display as ipd
ipd.Audio(x, rate=sr) # play the NumPy array x at sampling rate sr
```
%% Cell type:markdown id: tags:
### Writing Audio
%% Cell type:markdown id: tags:
[`librosa.output.write_wav`](https://librosa.github.io/librosa/generated/librosa.output.write_wav.html#librosa.output.write_wav) saves a NumPy array to a WAV file. (In librosa 0.8 and later, `librosa.output` was removed; `soundfile.write('example.wav', x, sr)` is the usual replacement.)
%% Cell type:code id: tags:
``` python
librosa.output.write_wav('example.wav', x, sr)
```
%% Cell type:markdown id: tags:
## Mel spectrogram
%% Cell type:markdown id: tags:
In this project, we will work with a time-frequency representation of audio signals called the Mel spectrogram. It is computed as follows:
%% Cell type:markdown id: tags:
#### Framing
The waveform is converted into a sequence of successive overlapping frames.
%% Cell type:code id: tags:
``` python
# Define the parameters of the short-term analysis
window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS
window_length_samples = int(round(sr * window_length_secs))
hop_length_samples = int(round(sr * hop_length_secs))
num_samples = x.shape[0]
num_frames = 1 + int(np.floor((num_samples - window_length_samples) /
hop_length_samples))
# Create an array of shape (window_length_samples, num_frames) where each column
# contains a frame of the original audio signal
shape = (num_frames, window_length_samples)
strides = (x.strides[0] * hop_length_samples,) + x.strides
X_frames = np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides).T
print(X_frames.shape)
```
%% Cell type:markdown id: tags:
#### Windowing
%% Cell type:markdown id: tags:
Each frame is multiplied with a smooth analysis window.
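In symbols, the "periodic" Hann window used below is $w[n] = 0.5 - 0.5\cos\left(\frac{2\pi n}{N}\right)$ for $n = 0, \dots, N-1$, where $N$ is the frame length in samples.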
%% Cell type:code id: tags:
``` python
window = .5 - (0.5 * np.cos(2 * np.pi / window_length_samples *
np.arange(window_length_samples))) # "periodic" Hann
X_windowed_frames = X_frames * window[:,np.newaxis]
plt.figure()
plt.plot(window)
print(X_windowed_frames.shape)
plt.title('analysis window')
plt.xlabel('samples')
```
%% Cell type:markdown id: tags:
#### Discrete Fourier transform
%% Cell type:markdown id: tags:
The short-term Fourier transform (STFT) is computed by applying the discrete Fourier transform (DFT) on each windowed frame. The magnitude spectrogram is obtained by taking the modulus of the STFT matrix.
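In symbols, with window length $N$ and hop size $H$ (both in samples), DFT length $N_{\mathrm{FFT}} \geq N$ (frames are zero-padded), frame index $m$ and frequency bin $k = 0, \dots, N_{\mathrm{FFT}}/2$,
$$X[k, m] = \sum_{n=0}^{N-1} w[n]\, x[n + mH]\, e^{-j 2 \pi k n / N_{\mathrm{FFT}}},$$
and the magnitude spectrogram is $|X[k, m]|$.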
%% Cell type:code id: tags:
``` python
import librosa.display
fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
X_stft = np.fft.rfft(X_windowed_frames, int(fft_length), axis=0)
X_spec = np.abs(X_stft)
plt.figure(figsize=(14, 7))
librosa.display.specshow(librosa.amplitude_to_db(X_spec), sr=sr,
hop_length=hop_length_samples, x_axis='time',
y_axis='hz')
# This is basically equivalent to:
# librosa.display.specshow(20*np.log10(X_spec), sr=sr,
# hop_length=hop_length_samples, x_axis='time',
# y_axis='hz')
# plt.clim(-60,25)
plt.colorbar()
plt.title('dB-scaled spectrogram')
plt.xlabel('time (s)')
plt.ylabel('frequency (Hz)')
```
%% Cell type:markdown id: tags:
#### Mel filterbank
%% Cell type:markdown id: tags:
A filterbank matrix is created to map DFT-frequency bins into Mel-frequency bins.
%% Cell type:code id: tags:
``` python
import mel_features
lower_edge_hertz = vggish_params.MEL_MIN_HZ
upper_edge_hertz = vggish_params.MEL_MAX_HZ
num_mel_bins = vggish_params.NUM_MEL_BINS
spec_to_mel_mat = mel_features.spectrogram_to_mel_matrix(num_mel_bins=num_mel_bins,
num_spectrogram_bins=X_spec.shape[0],
audio_sample_rate=sr,
lower_edge_hertz=lower_edge_hertz,
upper_edge_hertz=upper_edge_hertz)
print(spec_to_mel_mat.T.shape)
plt.figure(figsize=(14, 7))
plt.imshow(spec_to_mel_mat.T, origin='lower')
plt.colorbar(orientation='horizontal')
plt.set_cmap('magma')
plt.title('Mel filterbank matrix')
plt.xlabel('DFT-frequency bins')
plt.ylabel('Mel-frequency bins')
```
%% Cell type:markdown id: tags:
#### Mel spectrogram
%% Cell type:markdown id: tags:
---
### Question
How do you obtain the Mel spectrogram from the filterbank matrix and the spectrogram?
---
%% Cell type:code id: tags:
``` python
X_mel_spec = # TODO
plt.figure(figsize=(14, 7))
librosa.display.specshow(librosa.amplitude_to_db(X_mel_spec), sr=sr,
hop_length=hop_length_samples, x_axis='time')
plt.set_cmap('magma')
plt.colorbar()
plt.title('dB-scaled Mel-spectrogram')
plt.xlabel('time (s)')
plt.yticks(np.arange(0,num_mel_bins,10))
plt.ylabel('Mel-frequency bins')
```
%% Cell type:markdown id: tags:
---
### Questions
1. What is the Mel scale?
2. What is the effect of the Mel filterbank matrix on the time-frequency representation? What happens to the low and high frequencies?
---
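(Hint for question 1: one common definition, the HTK convention, maps a frequency $f$ in hertz to $m = 2595 \log_{10}\left(1 + \frac{f}{700}\right)$ mels; other conventions differ slightly in the constants.)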
......
%% Cell type:markdown id: tags:
# Extract log-Mel spectrograms
%% Cell type:markdown id: tags:
## Environment setup
### Mount Google Drive
%% Cell type:code id: tags:
``` python
from google.colab import drive
drive.mount('/content/drive')
```
%% Cell type:markdown id: tags:
### Get some utilities
%% Cell type:code id: tags:
``` python
import os
if not os.path.exists('mel_features.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/mel_features.py
if not os.path.exists('utils.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/utils.py
if not os.path.exists('vggish_params.py'):
    !wget https://gitlab-research.centralesupelec.fr/sleglaive/embedded-ust-students/raw/master/vggish_params.py
```
%% Cell type:markdown id: tags:
### Define important paths
%% Cell type:code id: tags:
``` python
ust_data_dir = './drive/My Drive/data/ust-data'
dataset_dir = os.path.join(ust_data_dir, 'sonyc-ust')
annotation_file = os.path.join(dataset_dir, 'annotations.csv')
taxonomy_file = os.path.join(dataset_dir, 'dcase-ust-taxonomy.yaml')
log_mel_spec_dir = os.path.join(ust_data_dir, 'log-mel-spectrograms')
output_training_dir = os.path.join(ust_data_dir, 'output_training')
output_prediction_dir = os.path.join(ust_data_dir, 'output_prediction')
```
%% Cell type:markdown id: tags:
### Install missing packages
%% Cell type:code id: tags:
``` python
!pip install oyaml
```
%% Cell type:markdown id: tags:
## Main processing
%% Cell type:markdown id: tags:
We use the [VGGish](https://github.com/tensorflow/models/tree/master/research/audioset/vggish) recipe to compute log-Mel spectrograms:
* All audio is resampled to 16 kHz mono.
* A spectrogram is computed using magnitudes of the Short-Time Fourier Transform
with a window size of 25 ms, a window hop of 10 ms, and a periodic Hann
window.
* A mel spectrogram is computed by mapping the spectrogram to 64 mel bins
covering the range 125-7500 Hz.
* A stabilized log mel spectrogram is computed by applying
log(mel-spectrum + 0.01) where the offset is used to avoid taking a logarithm
of zero.
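%% Cell type:markdown id: tags:
Two bits of arithmetic in this recipe are worth spelling out before running the full processing loop: the number of frames obtained from a 10-second clip, and the stabilized log. A minimal standalone sketch (the random `mel_spec` below is only a placeholder, not real data):
%% Cell type:code id: tags:
``` python
import numpy as np

# Frame count for a 10 s clip at 16 kHz with 25 ms windows and 10 ms hops
sr = 16000
num_samples = 10 * sr                           # 160000 samples
window_length_samples = int(round(0.025 * sr))  # 400 samples
hop_length_samples = int(round(0.010 * sr))     # 160 samples
num_frames = 1 + (num_samples - window_length_samples) // hop_length_samples
print(num_frames)  # 998

# Stabilized log-Mel spectrogram: element-wise log with a small offset
mel_spec = np.abs(np.random.randn(num_frames, 64))  # placeholder Mel spectrogram
log_mel_spec = np.log(mel_spec + 0.01)              # the offset avoids log(0)
print(log_mel_spec.shape)
```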
%% Cell type:code id: tags:
``` python
import librosa
import os
import numpy as np
import pandas as pd
import mel_features
import vggish_params
from IPython.display import clear_output
if not os.path.isdir(log_mel_spec_dir):
    os.makedirs(log_mel_spec_dir)
# Some parameters
sr = vggish_params.SAMPLE_RATE
window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS
window_length_samples = int(round(sr * window_length_secs))
hop_length_samples = int(round(sr * hop_length_secs))
num_samples = 10*sr # 10-second audio clips
num_frames = 1 + int(np.floor((num_samples - window_length_samples) /
hop_length_samples))
# How do we save the features? We have two options:
# --> 'individual': we create a small .npy file containing the log-mel
#     spectrogram (numpy array) of each audio file in the dataset, so we have
#     as many .npy files as audio examples in the dataset.
# --> 'global': we create one large .npy file containing the log-mel
#     spectrograms (numpy array) of all the audio files in the dataset.
how_to_save = 'global' # 'global' or 'individual'
# Create a Pandas DataFrame from the annotation CSV file
annotation_data = pd.read_csv(annotation_file).sort_values('audio_filename')
# Create a new frame which only corresponds to the list of audio files
df_audio_files = annotation_data[['split', 'audio_filename']].drop_duplicates()
# List of all audio files
file_list = annotation_data['audio_filename'].unique().tolist()
# Create a dictionary mapping each split to its audio directory
split2dir = {'train': 'audio-dev/train',
             'validate': 'audio-dev/validate',
             'test': 'audio-eval'}
counter = 0
# Iterate over DataFrame rows as (index, row) pairs, where 'index' is the index
# of the row and 'row' contains the data of the row as a pandas Series
log_mel_spec_list = []
for index, row in df_audio_files.iterrows():
    clear_output(wait=True)
    filename = row['audio_filename']
    print('({}/{}) {}'.format(counter+1, len(df_audio_files), filename))
    partition = row['split']
    audio_path = os.path.join(dataset_dir, split2dir[partition], filename)
    x, sr = librosa.load(audio_path, mono=True, sr=None)
    x = x.T
    log_mel_spec = mel_features.waveform_to_log_mel_spectrogram(x, sr)
    if log_mel_spec.shape[0] < num_frames:
        # add zeros so that the final number of frames is 998
        padding_len = num_frames - log_mel_spec.shape[0]
        zero_pad = np.zeros((padding_len, log_mel_spec.shape[1]))
        log_mel_spec = np.vstack((log_mel_spec, zero_pad))
    elif log_mel_spec.shape[0] > num_frames:
        # remove frames so that the final number of frames is 998
        log_mel_spec = log_mel_spec[:num_frames,:]
    if how_to_save == 'individual':
        data_path = os.path.join(log_mel_spec_dir,
                                 os.path.splitext(filename)[0] + '.npy')
        np.save(data_path, log_mel_spec)
    elif how_to_save == 'global':
        log_mel_spec_list.append(log_mel_spec)
    counter += 1
if how_to_save ==