diff --git a/dataloader.py b/dataloader.py
index 3119e6ab804c48584f54e99f7e77c45eb30c188a..b84e3455b074bb286233bf69eeeb850a5435bb8f 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -143,6 +143,7 @@ def get_dataloaders(
     return train_loader, valid_loader

 def get_stats_train_dataset(
+    cfg,
     filepath,
     num_days,
     batch_size,
@@ -174,6 +175,7 @@ def get_stats_train_dataset(

     logging.info("  - Subset dataset")
     train_dataset = Dataset(
+        cfg,
         filepath,
         subset_file=train_subset_file,
         transform=train_transform,
@@ -222,6 +224,7 @@ def get_stats_train_dataset(
     return mean, std, maxi, mini

 def get_test_dataloader(
+    cfg,
     filepath,
     num_days,
     batch_size,
@@ -235,6 +238,7 @@ def get_test_dataloader(
     # Load the base dataset
     logging.info("  - Dataset creation")
     test_dataset = Dataset(
+        cfg,
         filepath,
         train=False,
         transform=transform,
diff --git a/job.py b/job.py
index 10c9d7d7f1aed63098cc95ca40705baefcef081c..33808297168ad51d8348169e5f14d60f49f4cebd 100644
--- a/job.py
+++ b/job.py
@@ -4,15 +4,15 @@ import os
 import subprocess
 import argparse

-def makejob(commit_id, model, nruns, time_wall):
+def makejob(commit_id, model, nruns, time_wall, experiment_name):
     return f"""#!/bin/bash

#SBATCH --job-name={model}
#SBATCH --nodes=1
#SBATCH --partition=gpu_prod_night
#SBATCH --time={time_wall}
-#SBATCH --output=logslurms/slurm-{model}%A_%a.out
-#SBATCH --error=logslurms/slurm-{model}%A_%a.err
+#SBATCH --output=logslurms/slurm-{model}{experiment_name}%A_%a.out
+#SBATCH --error=logslurms/slurm-{model}{experiment_name}%A_%a.err
#SBATCH --array=0-{nruns}
@@ -43,7 +43,8 @@
 wandb login 1a58b15c3c3ebcce186aa7185746efd5c2401a6c

 echo "Running main.py"
-python3 main.py --rootDir /usr/users/sdi1/sdi1_3/Projet_DL/Kaggle_Phytoplankton/logs/
+python3 main.py --rootDir /usr/users/sdi1/sdi1_3/Projet_DL/Kaggle_Phytoplankton/logs/ --experimentName {experiment_name} train
+python3 main.py --PATHTOCHECKPOINT /usr/users/sdi1/sdi1_3/Projet_DL/Kaggle_Phytoplankton/logs/{model}{experiment_name}_0 test

 if [[ $? != 0 ]]; then
     exit -1
@@ -75,7 +76,7 @@ parser.add_argument("--time_wall",
                     help="Time wall. Choose in [no_limit, hour, half, quarter]")

 parser.add_argument("--model_name",
-                    default ="Bi-LSTM",
+                    default ="BidirectionalLSTM",
                     help="Name of the model to train")

 parser.add_argument("--experimentName",
@@ -87,7 +88,7 @@
 os.system("mkdir -p logslurms")

 args = parser.parse_args()

-time_wall = {"no_limit": "06:00:00","hour" : "1:00:00", "half" : "0:00:00", "quarter" : "0:00:15"}
+time_wall = {"no_limit": "12:00:00","hour" : "1:00:00", "half" : "0:00:00", "quarter" : "0:00:15"}

 # Launch the batch jobs
-submit_job(makejob(commit_id, args.model_name, 0, time_wall[args.time_wall]))
\ No newline at end of file
+submit_job(makejob(commit_id, args.model_name, 0, time_wall[args.time_wall], args.experimentName))
\ No newline at end of file
diff --git a/logs/main_unit_test.log b/logs/main_unit_test.log
index 6ddb9bffc3ac8cd6903aecafdeabc3a840416940..e7d7b07230094973b2aaa6d9f993174d10e2f8f1 100644
--- a/logs/main_unit_test.log
+++ b/logs/main_unit_test.log
@@ -2736,3 +2736,33 @@ INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and
 INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
 INFO:root:  - The train fold has 541621 samples
 INFO:root:  - The valid fold has 135539 samples
+INFO:root:= Dataloaders
+INFO:root:  - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:  - Loaded a dataset with 677160 samples
+INFO:root:  - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root:  - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:  - The train fold has 542304 samples
+INFO:root:  - The valid fold has 134856 samples
+INFO:root:= Dataloaders
+INFO:root:  - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:  - Loaded a dataset with 677160 samples
+INFO:root:  - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root:  - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:  - The train fold has 542221 samples
+INFO:root:  - The valid fold has 134939 samples
diff --git a/main.py b/main.py
index eb34ef81fa21c7e90f72b4df2ed32968c0c16027..1f6b8ca208ddde497d86dd491176a6dbd01b4062 100644
--- a/main.py
+++ b/main.py
@@ -53,7 +53,7 @@ def train(args, cfg):
         MAX = eval(cfg["ApproximativeMaxi"])
         MIN = eval(cfg["ApproximativeMini"])
     else :
-        MEAN, STD, MAX, MIN = dataloader.get_stats_train_dataset(trainpath,
+        MEAN, STD, MAX, MIN = dataloader.get_stats_train_dataset(cfg, trainpath,
                                                                  num_days,
                                                                  batch_size,
                                                                  num_workers,
@@ -112,17 +112,14 @@ def train(args, cfg):
     if args.detect_anomaly:
         torch.autograd.set_detect_anomaly(True)

-    best_val_loss = None
     for t in range(cfg["Training"]["Epochs"]):
         print(f"Epoch {t+1}")
         my_train.train(args, network, train_loader, f_loss, optimizer, device, log_interval)

         val_loss = my_test.test(network, valid_loader, f_loss, device)
-        if best_val_loss != None:
-            if val_loss < best_val_loss :
-                network_checkpoint.update(val_loss)
-                best_val_loss = val_loss
+
+        network_checkpoint.update(val_loss)

         scheduler.step(val_loss)
@@ -136,33 +133,59 @@ def train(args, cfg):
     logging.info(f"Best model saved in folder {logdir}")

+    return logdir
+

-def test(args):
+def test(args, cfg):

     dataset_transform = cfg["Dataset"]["Transform"]

-    rootDir = args.rootDir if args.rootDir != None else cfg["LogDir"]
+    #rootDir = args.rootDir if args.rootDir != None else cfg["LogDir"]
+
     use_cuda = torch.cuda.is_available()
     if use_cuda :
         device = torch.device('cuda')
     else :
         device = torch.device('cpu')

-    logdir, raw_run_name = utils.create_unique_logpath(rootDir, cfg["Model"]["Name"])
+    #logdir, raw_run_name = utils.create_unique_logpath(rootDir, cfg["Model"]["Name"])
+
+    model_path = f"{args.PATHTOCHECKPOINT}best_model.pt"
+
+    # Re-compute the statistics or use the stored ones
+    approx_stats = cfg["ApproximativeStats"]

-    model_path = args.PATHTOCHECKPOINT
+    if approx_stats:
+        MEAN = eval(cfg["ApproximativeMean"])
+        STD = eval(cfg["ApproximativeSTD"])
+        MAX = eval(cfg["ApproximativeMaxi"])
+        MIN = eval(cfg["ApproximativeMini"])
+    else :
+        MEAN, STD, MAX, MIN = dataloader.get_stats_train_dataset(cfg, trainpath,
+                                                                 num_days,
+                                                                 batch_size,
+                                                                 num_workers,
+                                                                 use_cuda,
+                                                                 valid_ratio,
+                                                                 overwrite_index=True,
+                                                                 max_num_samples=max_num_samples,
+                                                                 train_transform=None,
+                                                                 valid_transform=None
+                                                                 )

     dataset_transform = cfg["Dataset"]["Transform"]
+
+    input_size = 14 if "space_time" in dataset_transform else (17 if "time" in dataset_transform else 18)

     network = model.build_model(cfg, input_size)
-    network = model.to(device)
+    network = network.to(device)

     network.load_state_dict(torch.load(model_path))

-    utils.create_submission(args, network, eval(dataset_transform), device, rootDir, logdir)
+    utils.create_submission(args, cfg, network, eval(dataset_transform), device, args.PATHTOCHECKPOINT)

-    logging.info(f"The submission csv file has been created in the folder : {logdir}")
+    logging.info(f"The submission csv file has been created in the folder : {args.PATHTOCHECKPOINT}")


 if __name__ == "__main__":
@@ -206,8 +229,8 @@

     parser.add_argument(
         "--PATHTOCHECKPOINT",
-        default="./logs/BestBidirectionalLSTM/best_model.pt",
-        help="Path of the model to load"
+        default="./logs/BestBidirectionalLSTM/",
+        help="Path of the directory containing the model to load (with the final /)"
     )

     parser.add_argument(
diff --git a/model.py b/model.py
index 313f58032016c1ef2ca8f6c0d1e471329d1c2ec1..8a54984fb9da5a7fec143e0327226374d74047dd 100644
--- a/model.py
+++ b/model.py
@@ -203,7 +203,7 @@ class ModelCheckpoint:

     def update(self, loss):
         if (self.min_loss is None) or (loss < self.min_loss):
-            print("Saving a better model")
+            print(f"Saving a better model in {self.filepath}")
             torch.save(self.model.state_dict(), self.filepath)
             self.min_loss = loss
diff --git a/sub_2CMEMS-MEDSEA-2017-testing.nc.bin_index.idx b/sub_2CMEMS-MEDSEA-2017-testing.nc.bin_index.idx
index 83480535195d577964a385ab97911ca9e66d6da9..071712f9e2a9b462b43be767974f612cb72c69a2 100644
Binary files a/sub_2CMEMS-MEDSEA-2017-testing.nc.bin_index.idx and b/sub_2CMEMS-MEDSEA-2017-testing.nc.bin_index.idx differ
diff --git a/train_indices.subset b/train_indices.subset
index 775627b199394f7334d13628767253fc70ee8f4a..9295db2161f04f8868a1507f9e72b61c5f1ad086 100644
Binary files a/train_indices.subset and b/train_indices.subset differ
diff --git a/utils.py b/utils.py
index 3a6d5a966c1942fae2d05f6dad07997d6cfd3f87..f24cdd031bc556aa9822454804e91f53e73b2f85 100644
--- a/utils.py
+++ b/utils.py
@@ -11,6 +11,9 @@ import torch.nn as nn
 import argparse
 import yaml

+# Internal imports
+import dataloader
+
 def generate_unique_logpath(logdir, raw_run_name):
     i = 0
     while(True):
@@ -58,8 +61,7 @@ def write_summary(logdir, model, optimizer, val_loss):
     summary_file.write(summary_text)
     summary_file.close()

-def create_submission(args, model, transform, device, rootDir, logdir):
-    cfg = yaml.load(config_file, Loader=yaml.FullLoader)
+def create_submission(args, cfg, model, transform, device, logdir):
     step_days = 10
     batch_size = 1024
     # We make chunks of num_days consecutive samples; As our dummy predictor
@@ -74,7 +76,8 @@ def create_submission(args, cfg, model, transform, device, logdir):

     logging.info("Building the dataloader")
     if args.PATHTOTESTSET != None:
-        test_loader = dataloader.get_test_dataloader(
+        test_loader = dataloader.get_test_dataloader(
+            cfg,
             args.PATHTOTESTSET,
             num_days,
             batch_size,
@@ -86,7 +89,8 @@ def create_submission(args, cfg, model, transform, device, logdir):
         )
     else :
         test_loader = dataloader.get_test_dataloader(
-            dataloader._DEFAULT_TEST_FILEPATH,
+            cfg,
+            cfg["Dataset"]["_DEFAULT_TEST_FILEPATH"],
             num_days,
             batch_size,
             num_workers,
diff --git a/valid_indices.subset b/valid_indices.subset
index 5dade12f6242af965e01d887b87463504aea3f78..22f74983a95ca03330b4dd78afc3635e8cdc6b35 100644
Binary files a/valid_indices.subset and b/valid_indices.subset differ
diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log
index 7b6c88ad31abd2aa0412a30e76943f006c595653..0d8a09f6f2ba4ed5b3cd9d881d4e0c2fbb163158 120000
--- a/wandb/debug-internal.log
+++ b/wandb/debug-internal.log
@@ -1 +1 @@
-run-20230211_212625-o8jq0eya/logs/debug-internal.log
\ No newline at end of file
+run-20230211_220151-s3lkmhrt/logs/debug-internal.log
\ No newline at end of file
diff --git a/wandb/debug.log b/wandb/debug.log
index 14b2e91e2bef1da8a8ae3da25fb60f268d7a151c..a57b6d30a3106d867d7194144b82d0ac8355d0e4 120000
--- a/wandb/debug.log
+++ b/wandb/debug.log
@@ -1 +1 @@
-run-20230211_212625-o8jq0eya/logs/debug.log
\ No newline at end of file
+run-20230211_220151-s3lkmhrt/logs/debug.log
\ No newline at end of file
diff --git a/wandb/latest-run b/wandb/latest-run
index 921df60f4e49ce33f24f5b4449ce8089a4288094..688ad02edb8028199fe2bed029cdb745139fae2e 120000
--- a/wandb/latest-run
+++ b/wandb/latest-run
@@ -1 +1 @@
-run-20230211_212625-o8jq0eya
\ No newline at end of file
+run-20230211_220151-s3lkmhrt
\ No newline at end of file
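
Note on the checkpointing change in main.py: the external best_val_loss bookkeeping could be dropped because ModelCheckpoint.update (model.py) already tracks the best loss internally and only saves on improvement, so calling network_checkpoint.update(val_loss) unconditionally every epoch is safe. A minimal sketch of that pattern follows; the constructor is assumed from how update() uses self.filepath and self.model, since only update() appears in this diff:

    import torch

    class ModelCheckpoint:
        # Constructor assumed for illustration; only update() is shown in the diff.
        def __init__(self, model, filepath):
            self.model = model
            self.filepath = filepath
            self.min_loss = None

        def update(self, loss):
            # The improvement check lives here, so callers need no
            # external best-loss tracking.
            if (self.min_loss is None) or (loss < self.min_loss):
                print(f"Saving a better model in {self.filepath}")
                torch.save(self.model.state_dict(), self.filepath)
                self.min_loss = loss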
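
The trailing slash required by the new --PATHTOCHECKPOINT convention (see the updated help text) matters because test() builds the weights path by plain string concatenation, f"{args.PATHTOCHECKPOINT}best_model.pt". A separator-agnostic alternative, shown only as a sketch and not what the patch does:

    import os

    # os.path.join tolerates a missing trailing slash in the directory argument.
    model_path = os.path.join(args.PATHTOCHECKPOINT, "best_model.pt")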