# coding: utf-8

"""
Dummy submission generator: applies a (nonsense) model to the test set and
writes predictions in the format expected by the submission server.

The main algorithmic difficulties illustrated by this script are:

    - continuously iterating over the test_dataset without shuffling amounts
      to a continuous iteration over the volume (latitude, longitude, depth,
      time)
    - when iterating over minibatches, to subsample every time series by steps
      of 10 days, we show how to identify where to sample the minibatches of
      predictions
"""

# Standard imports
import sys
import logging
import datetime  # used by the commented-out timestamp sanity check below

# External imports
import tqdm
import torch

# Local imports
import bindataset as dataset


def create_submission(model, transform=None):
    """Run `model` over the whole test set and write ``submission.csv``.

    Parameters
    ----------
    model : a callable mapping a (B, T, N) input tensor to (B, T) predictions.
            It must already live on the device selected below (CUDA if
            available, CPU otherwise).
    transform : optional input transform forwarded to the test dataloader.
                Defaults to None (no transform).  NOTE(review): the original
                script referenced an undefined global ``transform`` here,
                which raised NameError; it is now an explicit parameter.

    Side effects
    ------------
    Creates/overwrites ``submission.csv`` in the current directory with an
    ``Id,Predicted`` header followed by one ``id,prediction`` row per
    subsampled time step.
    """
    step_days = 10
    batch_size = 1024
    # We make chunks of num_days consecutive samples. As our dummy predictor
    # is not using the temporal context, this is here arbitrarily chosen.
    # However, note that it must be a divisor of the total number of days
    # in the 2017 year, i.e. 1, 5, 73 or 365.
    num_days = 365
    num_workers = 7

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if use_cuda else torch.device("cpu")

    # Build the dataloaders
    logging.info("Building the dataloader")

    test_loader = dataset.get_test_dataloader(
        dataset._DEFAULT_TEST_FILEPATH,
        num_days,
        batch_size,
        num_workers,
        use_cuda,
        overwrite_index=True,
        transform=transform,
        target_transform=None,
    )

    num_days_test = test_loader.dataset.ntimes

    logging.info("= Filling in the submission file")
    with open("submission.csv", "w") as fh_submission:
        fh_submission.write("Id,Predicted\n")
        submission_offset = 0

        # Iterate on the test dataloader
        t_offset = 0
        # Every minibatch will contain batch_size * num_days samples.
        # As we do not shuffle the data, these correspond to consecutive
        # days of the same location, then followed by consecutive days of
        # the next location, and so on.
        chunk_size = batch_size * num_days

        with torch.no_grad():
            for X in tqdm.tqdm(test_loader):
                # BUGFIX: Tensor.to() is NOT in-place; the original discarded
                # the result, leaving X on the CPU regardless of `device`.
                X = X.to(device)

                #############################################
                # This is where you inject your knowledge about your model.
                # The rest of the code is generic as soon as you have a
                # model working on time series:
                #   X is (B, T, N); predictions are (B, T)
                predictions = model(X)
                #############################################

                # Reshape to (B * T) and keep only the time instants we need.
                predictions = predictions.view(-1)

                # We need to slice the times by steps of `step_days` days
                # within chunks of num_days_test days (2017 had 365 days).
                # The yearcut_indices are the indices in the linearized
                # minibatch corresponding to 01/01/2017 for some
                # (latitude, longitude, depth).
                yearcut_indices = list(
                    range(0, chunk_size + t_offset, num_days_test)
                )
                # From each year start, sample every `step_days`-th day.
                subdays_indices = [
                    y + k
                    for y in yearcut_indices
                    for k in range(0, num_days_test, step_days)
                ]
                subdays_indices = [i - t_offset for i in subdays_indices]

                # Remove the negative indices if any.  These happen because of
                # the offset and correspond to locations of 01/01/2017 that
                # fell into the previous minibatch.  Also clip to the actual
                # (possibly short, last) minibatch length.
                subdays_indices = [
                    k
                    for k in subdays_indices
                    if 0 <= k < min(chunk_size, predictions.shape[0])
                ]
                # Carry the phase of the last year boundary into the next
                # minibatch.
                t_offset = chunk_size - (yearcut_indices[-1] - t_offset)

                predictions_list = predictions[subdays_indices].tolist()

                # Sanity check kept from the original author: decode the
                # timestamp column of X at the year boundaries.
                # X = X.view(-1, 18)
                # subX = X[yearcut_indices, :]
                # timestamps = subX[:, 3].tolist()
                # print(
                #     "\n".join(
                #         [f"{datetime.datetime.fromtimestamp(x)}" for x in timestamps]
                #     )
                # )
                # sys.exit(-1)

                # Dump the predictions to the submission file.
                submission_part = "\n".join(
                    [
                        f"{i+submission_offset},{pred}"
                        for i, pred in enumerate(predictions_list)
                    ]
                )
                fh_submission.write(submission_part + "\n")
                submission_offset += len(predictions_list)
        # NOTE: the redundant fh_submission.close() was removed; the `with`
        # statement already closes the file.


if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
    # BUGFIX: the original called an undefined ``test()`` (NameError).
    # Build your model and call create_submission(model, transform=...)
    # explicitly, e.g. from a driver script or a notebook.
    raise SystemExit(
        "No model defined: import this module and call "
        "create_submission(model, transform=...)"
    )