diff --git a/dataloader.py b/dataloader.py
index 54a3e52290f6dbdc52d8d3212a58d5cf706f166b..3119e6ab804c48584f54e99f7e77c45eb30c188a 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -23,6 +23,7 @@ import torch.utils.data as data
 from dataset import Dataset
 
 def train_valid_split(
+    cfg,
     dataset,
     valid_ratio,
     train_subset_filepath,
@@ -36,6 +37,8 @@ def train_valid_split(
     For smalls sets, we could use indices in main memory with
     torch.utils.data.Subset
     """
+    _ENCODING_ENDIAN = cfg["Dataset"]["_ENCODING_ENDIAN"]
+    _ENCODING_LINEAR = cfg["Dataset"]["_ENCODING_LINEAR"]
     N = len(dataset)
 
     if max_num_samples is not None:
@@ -67,6 +70,7 @@ def train_valid_split(
 
 
 def get_dataloaders(
+    cfg,
     filepath,
     num_days,
     batch_size,
@@ -84,7 +88,7 @@ def get_dataloaders(
     # Load the base dataset
     logging.info(" - Dataset creation")
     dataset = Dataset(
-        filepath, train=True, overwrite_index=overwrite_index, num_days=num_days
+        cfg, filepath, train=True, overwrite_index=overwrite_index, num_days=num_days
     )
     logging.info(f" - Loaded a dataset with {len(dataset)} samples")
 
@@ -93,11 +97,12 @@ def get_dataloaders(
     train_subset_file = "train_indices.subset"
     valid_subset_file = "valid_indices.subset"
     train_valid_split(
-        dataset, valid_ratio, train_subset_file, valid_subset_file, max_num_samples
+        cfg, dataset, valid_ratio, train_subset_file, valid_subset_file, max_num_samples
     )
 
     logging.info(" - Subset dataset")
     train_dataset = Dataset(
+        cfg,
         filepath,
         subset_file=train_subset_file,
         transform=train_transform,
@@ -105,6 +110,7 @@ def get_dataloaders(
         num_days=num_days,
     )
     valid_dataset = Dataset(
+        cfg,
         filepath,
         subset_file=valid_subset_file,
         transform=valid_transform,
@@ -284,6 +290,7 @@ if __name__ == "__main__":
            1.9490e-01, 9.2847e-03, 2.2575e+00, 8.5310e-02, 7.8280e-02, 8.6237e-02])
 
     train_loader, valid_loader = get_dataloaders(
+        cfg,
         filepath = trainpath,
         num_days = num_days,
         batch_size = batch_size,
diff --git a/dataset.py b/dataset.py
index 63c9f512ea0ea2b4f6fd288a4ecf3391d73f5da2..a8046a2f56d0b82ef419673eeb2d531f9fd22505 100644
--- a/dataset.py
+++ b/dataset.py
@@ -3,7 +3,7 @@ import os
 import sys
 import logging
 import pathlib
-from typing import Union
+from typing import Union, Dict
 import struct
 from datetime import datetime
 
@@ -19,15 +19,7 @@ import tqdm
 import torch
 import torch.utils.data as data
 
-_DEFAULT_TRAIN_FILEPATH = "/mounts/Datasets3/2022-ChallengePlankton/sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin"
-_DEFAULT_TEST_FILEPATH = (
-    "/mounts/Datasets3/2022-ChallengePlankton/sub_2CMEMS-MEDSEA-2017-testing.nc.bin"
-)
-_ENCODING_LINEAR = "I"
-_ENCODING_INDEX = "I" # h(short) with 2 bytes should be sufficient
-_ENCODING_OFFSET_FORMAT = ""
-_ENCODING_ENDIAN = "<"
 
 
 class Dataset(data.Dataset):
@@ -44,6 +36,7 @@ class Dataset(data.Dataset):
 
     def __init__(
         self,
+        cfg : Dict,
         filepath: Union[pathlib.Path, str],
         overwrite_index: bool = False,
         train=True,
@@ -65,6 +58,14 @@ class Dataset(data.Dataset):
             target_transform: a transform to apply to the phyc output tensor
         """
         super().__init__()
+
+        self._ENCODING_ENDIAN = cfg["Dataset"]["_ENCODING_ENDIAN"]
+        self._ENCODING_LINEAR = cfg["Dataset"]["_ENCODING_LINEAR"]
+        self._ENCODING_INDEX = cfg["Dataset"]["_ENCODING_INDEX"]
+        self._ENCODING_OFFSET_FORMAT = cfg["Dataset"]["_ENCODING_OFFSET_FORMAT"]
+        self._DEFAULT_TRAIN_FILEPATH = cfg["Dataset"]["_DEFAULT_TRAIN_FILEPATH"]
+        self._DEFAULT_TEST_FILEPATH = cfg["Dataset"]["_DEFAULT_TEST_FILEPATH"]
+
         if isinstance(filepath, str):
             filepath = pathlib.Path(filepath)
         self.filepath = filepath
@@ -104,18 +105,18 @@ class Dataset(data.Dataset):
         self.row_format = (
             "?" + "f" * len(self.in_variables) + ("f" if self.train else "")
         )
-        self.row_size = struct.calcsize(_ENCODING_ENDIAN + self.row_format)
+        self.row_size = struct.calcsize(self._ENCODING_ENDIAN + self.row_format)
 
         # Load the header of the file for the dimension variables
         # Local utilitary function to parse the header
         def _read_dim(fp, offset, base_format, lock):
-            fmt = _ENCODING_ENDIAN + "i"
+            fmt = self._ENCODING_ENDIAN + "i"
             (dim, nbytes_dim) = read_bin_data(fp, offset, os.SEEK_SET, fmt, lock)
             dim = dim[0] # the returned values is a tuple
             offset += nbytes_dim
-            fmt = _ENCODING_ENDIAN + (base_format * dim)
+            fmt = self._ENCODING_ENDIAN + (base_format * dim)
             (values, nbytes_values) = read_bin_data(fp, offset, os.SEEK_SET, fmt, lock)
             return dim, np.array(values), nbytes_dim + nbytes_values
@@ -213,9 +214,9 @@ class Dataset(data.Dataset):
                 fileoffset = t0_offset + dt * self.row_size
                 fmt = (
-                    _ENCODING_ENDIAN
-                    + (_ENCODING_LINEAR * 2)
-                    + (_ENCODING_INDEX * 4)
+                    self._ENCODING_ENDIAN
+                    + (self._ENCODING_LINEAR * 2)
+                    + (self._ENCODING_INDEX * 4)
                 )
                 write_bin_data(
                     fhindex,
@@ -255,7 +256,7 @@ class Dataset(data.Dataset):
             self._subset_map = open(subsetpath, "rb")
 
     def _get_fileoffset(self, idx):
-        fmt = _ENCODING_ENDIAN + (_ENCODING_LINEAR * 2) + (_ENCODING_INDEX * 4)
+        fmt = self._ENCODING_ENDIAN + (self._ENCODING_LINEAR * 2) + (self._ENCODING_INDEX * 4)
 
         whence = 0 if idx >= 0 else 2
         offset = idx * struct.calcsize(fmt)
@@ -276,7 +277,7 @@ class Dataset(data.Dataset):
         # If we are processing a subet, convert the "idx" to the
         # original dataset index
         if self._subset_map is not None:
-            fmt = _ENCODING_ENDIAN + _ENCODING_LINEAR
+            fmt = self._ENCODING_ENDIAN + self._ENCODING_LINEAR
             offset = idx * struct.calcsize(fmt)
             (values, _) = read_bin_data(
                 self._subset_map, offset, os.SEEK_SET, fmt, self._subset_lock
@@ -291,7 +292,7 @@ class Dataset(data.Dataset):
             self.fp,
             file_offset,
             os.SEEK_SET,
-            _ENCODING_ENDIAN + (self.row_format * self.num_days),
+            self._ENCODING_ENDIAN + (self.row_format * self.num_days),
             self.fp_lock,
         )
@@ -341,7 +342,7 @@ class Dataset(data.Dataset):
         # is the size of the subset
         if self._subset_map is not None:
             subset_size = os.path.getsize(self._subset_map.name)
-            return subset_size // struct.calcsize(_ENCODING_ENDIAN + _ENCODING_LINEAR)
+            return subset_size // struct.calcsize(self._ENCODING_ENDIAN + self._ENCODING_LINEAR)
         else:
             # Access the last of the index file
             # and get its linear index. This linear index is also
diff --git a/logs/main_unit_test.log b/logs/main_unit_test.log
index 8f016a55f0006b877b3e43fb14e63892d1404617..f72c143f1f4ad14af28205025b9ef3478c3c4567 100644
--- a/logs/main_unit_test.log
+++ b/logs/main_unit_test.log
@@ -2648,3 +2648,76 @@ INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and
 INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
 INFO:root: - The train fold has 541940 samples
 INFO:root: - The valid fold has 135220 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 542057 samples
+INFO:root: - The valid fold has 135103 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 542046 samples
+INFO:root: - The valid fold has 135114 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 541609 samples
+INFO:root: - The valid fold has 135551 samples
diff --git a/main.py b/main.py
index fe6efdf5863f5b9a0f8c61cf73c33143cb945765..eb34ef81fa21c7e90f72b4df2ed32968c0c16027 100644
--- a/main.py
+++ b/main.py
@@ -40,7 +40,7 @@ def train(args, cfg):
     dataset_transform = cfg["Dataset"]["Transform"]
     input_size = 14 if "space_time" in dataset_transform else (17 if "time" in dataset_transform else 18)
 
-    if not args.no_wandb:
+    if not args.no_log:
         wandb.init(entity = "wherephytoplankton", project = "Kaggle phytoplancton", config = {"batch_size": batch_size, "epochs": epochs})
 
     # Re-compute the statistics or use the stored ones
@@ -66,6 +66,7 @@ def train(args, cfg):
     )
 
     train_loader, valid_loader = dataloader.get_dataloaders(
+        cfg,
         trainpath,
         num_days,
         batch_size,
@@ -101,10 +102,10 @@ def train(args, cfg):
         factor = 0.5
     )
 
-    logdir, raw_run_name = utils.create_unique_logpath(rootDir, cfg["Model"]["Name"])
-    network_checkpoint = model.ModelCheckpoint(logdir + "/best_model.pt", network)
-
-    if not args.no_wandb:
+    experiment_name = args.experimentName
+    if not(args.no_log):
+        logdir, raw_run_name = utils.create_unique_logpath(rootDir, cfg["Model"]["Name"] + experiment_name)
+        network_checkpoint = model.ModelCheckpoint(logdir + "/best_model.pt", network)
         wandb.run.name = raw_run_name
         wandb.watch(network, log_freq = log_freq)
@@ -127,10 +128,11 @@ def train(args, cfg):
 
         print("Validation : Loss : {:.4f}".format(val_loss))
 
-        if not args.no_wandb:
+        if not args.no_log:
             wandb.log({"val_loss": val_loss})
 
-    utils.write_summary(logdir, network, optimizer, val_loss)
+    if not(args.no_log):
+        utils.write_summary(logdir, network, optimizer, val_loss)
 
     logging.info(f"Best model saved in folder {logdir}")
@@ -167,15 +169,15 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
-        "--no_wandb",
+        "--detect_anomaly",
         action="store_true",
-        help="If specified, no log will be sent to wandb. Especially useful when running batch jobs.",
+        help="If specified, torch.autograd.set_detect_anomaly(True) will be activated",
     )
 
     parser.add_argument(
-        "--detect_anomaly",
+        "--no_log",
         action="store_true",
-        help="If specified, torch.autograd.set_detect_anomaly(True) will be activated",
+        help="If specified, no folder will be created while training the model and no log will be sent to wandb.",
     )
 
     parser.add_argument(
@@ -184,6 +186,12 @@ if __name__ == "__main__":
         help="Directory in which the log files will be stored"
     )
 
+    parser.add_argument(
+        "--experimentName",
+        default="",
+        help="Name of the experiment, will affect the name of the run on wandb and the name of the created folder"
+    )
+
     parser.add_argument(
         "--PATHTOTESTSET",
         default=None,
diff --git a/my_train.py b/my_train.py
index 778b6c7e356fa2152a9edc82e92b40946d401639..3b987eb7100190977b0bdcb580f9c4ce04823d10 100644
--- a/my_train.py
+++ b/my_train.py
@@ -44,7 +44,7 @@ def train(args, model, loader, f_loss, optimizer, device, log_interval = 100):
 
         Y = list(model.parameters())[0].grad.cpu().tolist()
 
-        if not args.no_wandb:
+        if not args.no_log:
             if batch_idx % log_interval == 0:
                 wandb.log({"train_loss" : loss})
         optimizer.step()
diff --git a/train_indices.subset b/train_indices.subset
index 7c4d7fd4d5a7a65aaa34f8a80ddc63f15a8ec829..801ef942d91e6e4dde955d1555b6be9e784a8e5f 100644
Binary files a/train_indices.subset and b/train_indices.subset differ
diff --git a/utils.py b/utils.py
index 4361c3206fdf2ec52daffd7bb618782fbdc75933..3a6d5a966c1942fae2d05f6dad07997d6cfd3f87 100644
--- a/utils.py
+++ b/utils.py
@@ -58,8 +58,6 @@ def write_summary(logdir, model, optimizer, val_loss):
     summary_file.write(summary_text)
     summary_file.close()
 
-
-
 def create_submission(args, model, transform, device, rootDir, logdir):
     cfg = yaml.load(config_file, Loader=yaml.FullLoader)
     step_days = 10
diff --git a/valid_indices.subset b/valid_indices.subset
index 1bd7deacda9183635879207df14248df38fbffa1..16b6fc468d117d472925c9975058cda1453fb00c 100644
Binary files a/valid_indices.subset and b/valid_indices.subset differ
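
Note (not part of the patch): after this change, the YAML config is the first argument of every dataset/dataloader entry point, and dataset.Dataset reads its encoding constants and default paths from cfg["Dataset"] instead of module-level globals. The sketch below shows the minimal cfg structure those lookups imply; the key names are the ones read in this diff, and the values mirror the constants removed from dataset.py, so treat it as an illustrative assumption about config.yaml rather than the authoritative file.

    # Minimal sketch of the cfg dict now expected by Dataset / get_dataloaders.
    # Values are copied from the module constants removed from dataset.py.
    cfg = {
        "Dataset": {
            "_ENCODING_ENDIAN": "<",
            "_ENCODING_LINEAR": "I",
            "_ENCODING_INDEX": "I",
            "_ENCODING_OFFSET_FORMAT": "",
            "_DEFAULT_TRAIN_FILEPATH": "/mounts/Datasets3/2022-ChallengePlankton/sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin",
            "_DEFAULT_TEST_FILEPATH": "/mounts/Datasets3/2022-ChallengePlankton/sub_2CMEMS-MEDSEA-2017-testing.nc.bin",
        },
    }

    # cfg is now the first positional argument of the helpers touched here:
    #   dataset.Dataset(cfg, filepath, train=True, num_days=num_days)
    #   dataloader.get_dataloaders(cfg, trainpath, num_days, batch_size, ...)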
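
Note (not part of the patch): the _ENCODING_* codes moved into the config are struct format characters, so they fix the binary layout of the index and subset files. A minimal sketch of the size arithmetic, assuming the default codes shown above ("<", "I", "I"); the helper name fileoffset_of is illustrative, not from the codebase:

    import struct

    # Format of one index record, as assembled in _get_fileoffset and when the
    # index is written: endian marker + 2 linear values + 4 grid indices.
    _ENCODING_ENDIAN = "<"
    _ENCODING_LINEAR = "I"
    _ENCODING_INDEX = "I"

    fmt = _ENCODING_ENDIAN + (_ENCODING_LINEAR * 2) + (_ENCODING_INDEX * 4)
    record_size = struct.calcsize(fmt)  # "<IIIIII" -> 24 bytes per record

    def fileoffset_of(idx: int) -> int:
        # Mirrors Dataset._get_fileoffset: record idx starts at idx * record_size
        return idx * record_size

    print(fmt, record_size, fileoffset_of(10))  # <IIIIII 24 240

The same codes drive __len__ when a subset file is mapped: the subset size in bytes is divided by struct.calcsize("<" + "I"), i.e. 4 bytes per stored linear index.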
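
Note (not part of the patch): on the CLI side, the old --no_wandb switch is renamed to --no_log and now also skips creating the run folder and writing the summary file, while the new --experimentName option appends a suffix to cfg["Model"]["Name"] when the log directory and wandb run name are created. For example, a batch job that used to pass --no_wandb would now pass --no_log, and adding --experimentName _baseline (an illustrative value) tags both the created folder and the wandb run.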