# coding: utf-8
"""
This a dummy test with a non sense model in order to illustrate how on could apply a model to produce predictions as expected by the submission
The main algorithmic difficulties illustrated by this script are :
- continuously iterating over the test_dataset without shuffling will
continuously iteration over the volmue (latitude, longitude, depth, time)
- when iterating over minibatches, to subsample every time series by step of
10 days, we show how to identify where to sample the minibatches of predictions
"""
# Standard imports
import sys
import logging
import datetime

# External imports
import tqdm
import torch
import torch.nn as nn

# Local imports
import dataloader
def dummy_model(X):
    # X is a (B, T, N) tensor
    # As a dummy model, we simply average all the environmental measures
    # and divide by a magic number
    return X[:, :, 4:].mean(dim=2) / 26  # This is (B, T)
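
# For example (an illustrative shape, assuming the first 4 of the 18 input
# columns are the space-time coordinates, as in the commented check further
# below):
#   dummy_model(torch.rand(2, 3, 18)).shape == torch.Size([2, 3])
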
def create_submission(model, transform, device, rootDir="./"):
    step_days = 10
    batch_size = 1024
    # We make chunks of num_days consecutive samples; as our dummy predictor
    # is not using the temporal context, this value is arbitrarily chosen.
    # However, note that it must be a divisor of the total number of days
    # in the year 2017, i.e. either 1, 5, 73 or 365
    num_days = 365  # assumed value, any divisor of 365 works here
    num_workers = 4  # assumed value
    use_cuda = device.type == "cuda"

    # Build the dataloaders
    logging.info("Building the dataloader")
    test_loader = dataloader.get_test_dataloader(
        dataloader._DEFAULT_TEST_FILEPATH,
        num_days,
        batch_size,
        num_workers,
        use_cuda,
        overwrite_index=True,
        transform=transform,
        target_transform=None,
    )
    num_days_test = test_loader.dataset.ntimes

    logging.info("= Filling in the submission file")
    with open(rootDir + "submission.csv", "w") as fh_submission:
        fh_submission.write("Id,Predicted\n")
        submission_offset = 0

        # Iterate on the test dataloader
        t_offset = 0
        # Every minibatch will contain batch_size * num_days entries.
        # As we do not shuffle the data, these correspond to consecutive
        # days of the same location, followed by the consecutive days of
        # the next location, and so on
        chunk_size = batch_size * num_days
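
        # For instance (illustrative numbers), with batch_size = 2 and
        # num_days = 365, one linearized chunk of 730 predictions covers the
        # 365 days of one (latitude, longitude, depth) point followed by the
        # 365 days of the next one; in general a chunk can also start in the
        # middle of a year, hence the bookkeeping with t_offset below.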
        with torch.no_grad():
            for X in tqdm.tqdm(test_loader):
                # Move the minibatch to the same device as the model
                X = X.to(device)

                #############################################
                # This is where you inject your knowledge
                # about your model
                # The rest of the code is generic as long as you have a
                # model working on time series
                # X is (B, T, N)
                # predictions are (B, T)
                predictions = model(X)
                #############################################

                # We reshape it to (B * T)
                # and keep only the time instants we need
                predictions = predictions.view(-1)
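                # Note: for a contiguous (B, T) tensor, view(-1) flattens in
                # row-major order, so the entry at linear index b * T + t is
                # the prediction for day t of the b-th series of the batch;
                # the index arithmetic below relies on this ordering.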
                # We need to slice the times by steps of step_days days
                # within chunks of num_days_test days (2017 had 365 days)
                yearcut_indices = list(range(0, chunk_size + t_offset, num_days_test))
                # The yearcut_indices are the indices in the linearized minibatch
                # corresponding to the 01/01/2017 of some (latitude, longitude, depth).
                # From these yearcut_indices, we can locate where to sample
                # the vector of predictions
                subdays_indices = [
                    y + k
                    for y in yearcut_indices
                    for k in range(0, num_days_test, step_days)
                ]
                subdays_indices = list(map(lambda i: i - t_offset, subdays_indices))
                # Remove the negative indices, if any.
                # These negative indices happen because of the offset:
                # they correspond to the 01/01/2017 of locations handled in the
                # previous minibatch
                subdays_indices = [
                    k
                    for k in subdays_indices
                    if 0 <= k < min(chunk_size, predictions.shape[0])
                ]
                t_offset = chunk_size - (yearcut_indices[-1] - t_offset)
                predictions_list = predictions[subdays_indices].tolist()
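
                # A worked toy example of the bookkeeping above (illustrative
                # numbers only): with num_days_test = 5, step_days = 2 and
                # chunk_size = 6, the second minibatch starts with t_offset = 1,
                # yearcut_indices = [0, 5], the raw subdays_indices are
                # [0, 2, 4, 5, 7, 9], shifting by t_offset gives
                # [-1, 1, 3, 4, 6, 8], and the filtering keeps [1, 3, 4]:
                # days 2 and 4 of the current location plus day 0 of the next one.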
                # Check
                # X = X.view(-1, 18)
                # subX = X[yearcut_indices, :]
                # # subX = X
                # timestamps = subX[:, 3].tolist()
                # print(
                #     "\n".join(
                #         [f"{datetime.datetime.fromtimestamp(x)}" for x in timestamps]
                #     )
                # )
                # print("\n\n")
                # sys.exit(-1)
                # Dump the predictions to the submission file
                submission_part = "\n".join(
                    [
                        f"{i + submission_offset},{pred}"
                        for i, pred in enumerate(predictions_list)
                    ]
                )
                fh_submission.write(submission_part + "\n")
                submission_offset += len(predictions_list)
if __name__ == "__main__":
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
use_cuda = torch.cuda.is_available()
if use_cuda :
device = torch.device('cuda')
else :
device = toch.device('cpu')
model_path = "logs/LinearRegression_1/best_model.pt"
nn.ReLU(),
nn.Linear(35,35,True),
nn.ReLU(),
nn.Linear(35,35,True),
nn.ReLU(),
nn.Linear(35,35,True),
nn.ReLU(),
nn.Linear(35,35,True),
nn.ReLU(),
nn.Linear(35,1, True),
nn.ReLU()
)
model = model.to(device)
model.load_state_dict(torch.load(model_path))
create_submission(model, dataloader.transform_remove_space_time(), device)
#create_submission(model, None, device)