# create_submission.py
# coding: utf-8
"""
This is a dummy test with a nonsense model, illustrating how one could apply a
model to produce predictions in the format expected by the submission.
The main algorithmic difficulties illustrated by this script are:
- continuously iterating over the test dataset without shuffling will
  continuously iterate over the volume (latitude, longitude, depth, time)
- when iterating over minibatches, to subsample every time series by steps of
  10 days, we show how to identify where to sample the minibatches of predictions
"""
# Standard imports
import sys
import logging
import datetime
# External imports
import tqdm
import torch
import torch.nn as nn
import argparse
# Local imports
import dataloader
def dummy_model(X, *, first_feature=4, scale=26):
    """Return a placeholder prediction for every time step of every series.

    The "model" averages the trailing feature columns of ``X`` and divides the
    result by a constant. It has no predictive value; it only produces an
    output of the shape the submission code expects.

    Args:
        X: tensor indexed as (B, T, N); the last axis holds the features.
        first_feature: index of the first column included in the average.
            The default of 4 skips the four leading columns, as the original
            hard-coded slice did.
        scale: divisor applied to the average (the original magic number 26).

    Returns:
        A (B, T) tensor of predictions.
    """
    return X[:, :, first_feature:].mean(dim=2) / scale
def _subsampled_indices(n_predictions, chunk_size, t_offset, num_days_test, step_days):
    """Locate, inside one linearized minibatch, the predictions to keep.

    Args:
        n_predictions: number of predictions actually present in the minibatch
            (the last minibatch may be shorter than ``chunk_size``).
        chunk_size: nominal number of predictions per minibatch.
        t_offset: number of time steps of the current series that were already
            consumed by the previous minibatches.
        num_days_test: length of one time series (time steps per location).
        step_days: sampling step along the time axis.

    Returns:
        A tuple ``(indices, next_t_offset)`` where ``indices`` are the
        positions to sample in this minibatch, in increasing order, and
        ``next_t_offset`` is the ``t_offset`` to use for the next minibatch.
    """
    # Starts of the time series, expressed in a frame shifted by t_offset so
    # that the series straddling the previous minibatch starts at 0.
    series_starts = range(0, chunk_size + t_offset, num_days_test)
    upper_bound = min(chunk_size, n_predictions)
    # Shift back to minibatch coordinates and drop what falls outside of it:
    # negative positions belong to the previous minibatch, positions beyond
    # upper_bound belong to the next one.
    indices = [
        start + day - t_offset
        for start in series_starts
        for day in range(0, num_days_test, step_days)
        if 0 <= start + day - t_offset < upper_bound
    ]
    # Time steps of the last (possibly incomplete) series seen in this chunk.
    next_t_offset = chunk_size - (series_starts[-1] - t_offset)
    return indices, next_t_offset


def create_submission(model, transform, device, rootDir, logdir):
    """Apply ``model`` to the test set and write ``submission.csv`` in ``logdir``.

    The test loader is iterated without shuffling; the original author's notes
    state that minibatches then hold consecutive days of one location followed
    by consecutive days of the next one. Each time series is subsampled every
    ``step_days`` days and the kept predictions are written as ``Id,Predicted``
    rows, with ``Id`` being a running counter starting at 0.

    Args:
        model: callable taking a (B, T, N) tensor and returning predictions
            that flatten to B * T values.
        transform: forwarded as ``transform`` to
            ``dataloader.get_test_dataloader``.
        device: device every minibatch is moved to before calling ``model``.
        rootDir: not used; the data is read from
            ``dataloader._DEFAULT_TEST_FILEPATH``. Kept so that the signature
            stays compatible with existing callers.
        logdir: directory in which ``submission.csv`` is written (with or
            without a trailing path separator).
    """
    import os  # local import: only needed to build the output path

    step_days = 10
    batch_size = 1024
    # We make chunks of num_days consecutive samples. As the dummy predictor
    # is not using the temporal context, the value is arbitrary. However, per
    # the original note, it must be a divisor of the number of days of the
    # 2017 year: either 1, 5, 73 or 365.
    num_days = 73
    num_workers = 7
    use_cuda = torch.cuda.is_available()

    logging.info("Building the dataloader")
    test_loader = dataloader.get_test_dataloader(
        dataloader._DEFAULT_TEST_FILEPATH,
        num_days,
        batch_size,
        num_workers,
        use_cuda,
        overwrite_index=True,
        transform=transform,
        target_transform=None,
    )
    num_days_test = test_loader.dataset.ntimes

    logging.info("= Filling in the submission file")
    # Nominal number of predictions in a minibatch once flattened to (B * T).
    chunk_size = batch_size * num_days
    # os.path.join copes with logdir given with or without a trailing
    # separator, unlike plain string concatenation.
    submission_path = os.path.join(logdir, "submission.csv")

    with open(submission_path, "w") as fh_submission:
        fh_submission.write("Id,Predicted\n")
        submission_offset = 0  # Id of the next row to write
        t_offset = 0  # days of the current series seen in earlier batches
        with torch.no_grad():
            for X in tqdm.tqdm(test_loader):
                X = X.to(device)
                #############################################
                # This is where you inject your knowledge about your model.
                # The rest of the code is generic as soon as you have a
                # model working on time series.
                # X is (B, T, N); predictions flatten to (B * T).
                # reshape (rather than view) also accepts non-contiguous
                # model outputs.
                predictions = model(X).reshape(-1)
                #############################################

                subdays_indices, t_offset = _subsampled_indices(
                    predictions.shape[0],
                    chunk_size,
                    t_offset,
                    num_days_test,
                    step_days,
                )
                predictions_list = predictions[subdays_indices].tolist()
                if not predictions_list:
                    # Nothing to sample in this minibatch: do not emit a
                    # blank line in the CSV.
                    continue

                # Dump the predictions to the submission file.
                fh_submission.writelines(
                    f"{row_id},{pred}\n"
                    for row_id, pred in enumerate(
                        predictions_list, start=submission_offset
                    )
                )
                submission_offset += len(predictions_list)
if __name__ == "__main__":
    # Script entry point: rebuild the network, load its trained weights and
    # write the submission file.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The submission is written next to the checkpoint it was produced from.
    # The trailing separator is kept on purpose so that the directory can be
    # used either with string concatenation or with os.path.join.
    log_dir = "logs/LinearRegression_1/"
    model_path = log_dir + "best_model.pt"

    # 14 inputs -> 7 hidden layers of 35 units -> 1 output, a ReLU after every
    # linear layer. The layers are laid out in the same order as the original
    # explicit listing so that the state_dict keys ("0.weight", "2.weight",
    # ...) still match the checkpoint.
    hidden_units = 35
    layers = [nn.Linear(14, hidden_units, True), nn.ReLU()]
    for _ in range(6):
        layers += [nn.Linear(hidden_units, hidden_units, True), nn.ReLU()]
    layers += [nn.Linear(hidden_units, 1, True), nn.ReLU()]
    model = nn.Sequential(*layers).to(device)

    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    create_submission(
        model,
        dataloader.transform_remove_space_time(),
        device,
        None,  # rootDir: create_submission does not read it
        log_dir,
    )