diff --git a/bindataset.py b/bindataset.py
index 31b055e220bb8970b77697830fd295a5397f15a7..8b4126f24a964bc6137e51b02432d0c16c14a7ac 100644
--- a/bindataset.py
+++ b/bindataset.py
@@ -565,7 +565,16 @@ def test_getitem():
     logging.info(
         f"The idx {idx} corresponds to : \n\tlinear index={lin_index}\n\tfile offset={file_offset}\n\ttab indices={tab_indices}"
     )
-
+    train_loader, valid_loader = get_dataloaders(
+        trainpath,
+        num_days,
+        batch_size,
+        num_workers,
+        use_cuda,
+        valid_ratio,
+        overwrite_index=True,
+        max_num_samples=max_num_samples,
+    )
 
 def test_dataloader():
     logging.info("====> Test dataloader")
diff --git a/config.yml b/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fc17b56963a1cfcc95a77f86b7c7f7ec93f39973
--- /dev/null
+++ b/config.yml
@@ -0,0 +1,22 @@
+# Dataset Configuration
+Dataset:
+  num_days: 1 # Test with a sequence of 1 day
+  batch_size: 128
+  num_workers: 7
+  valid_ratio: 0.2
+  max_num_samples: null # set to an integer (e.g. 1000) to cap the number of samples
+  _DEFAULT_TRAIN_FILEPATH: "/mounts/Datasets3/2022-ChallengePlankton/sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin"
+  _DEFAULT_TEST_FILEPATH: "/mounts/Datasets3/2022-ChallengePlankton/sub_2CMEMS-MEDSEA-2017-testing.nc.bin"
+  _ENCODING_LINEAR: "I"
+  _ENCODING_INDEX: "I" # "h" (short, 2 bytes) should also be sufficient
+  _ENCODING_OFFSET_FORMAT: ""
+  _ENCODING_ENDIAN: "<"
+
+# Model selection
+Model:
+  Name: LinearRegression
+
+# Model parameters selection
+LinearRegression:
+  # Bias in {True, False}
+  Bias: True
diff --git a/dataloader.py b/dataloader.py
index 6cd159e44c0e6eb68dae0c91740dae3e2ecc2ee2..355a91d18a2db9f349572d54fa464698b52cf7d7 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -180,3 +180,30 @@ def get_test_dataloader(
     )
 
     return test_loader
+
+if __name__ == "__main__":
+    logging.basicConfig(filename='logs/dataloader_unit_test.log', level=logging.INFO)
+    logging.info("====> Test dataloader")
+    use_cuda = torch.cuda.is_available()
+    trainpath = _DEFAULT_TRAIN_FILEPATH
+    num_days = 1  # Test with a sequence of 1 day
+    batch_size = 128
+    num_workers = 7
+    valid_ratio = 0.2
+    # max_num_samples = 1000
+    max_num_samples = None
+
+    train_loader, valid_loader = get_dataloaders(
+        trainpath,
+        num_days,
+        batch_size,
+        num_workers,
+        use_cuda,
+        valid_ratio,
+        overwrite_index=True,
+        max_num_samples=max_num_samples,
+    )
+
+    it = iter(train_loader)
+    X, Y = next(it)
+    logging.info(f"Got a minibatch of size {X.shape} -> {Y.shape}")
\ No newline at end of file
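The max_num_samples change in config.yml above relies on how PyYAML maps scalars: YAML null becomes Python None, whereas a bare None is read back as the string "None". A quick check (hypothetical snippet, not part of the patch):

    import yaml

    assert yaml.safe_load("max_num_samples: null")["max_num_samples"] is None
    # A bare None is not a YAML null token; it round-trips as a plain string
    assert yaml.safe_load("max_num_samples: None")["max_num_samples"] == "None"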
diff --git a/debug.py b/debug.py
index 57961a826add76ca582946c5d7ae7ebde9a7e5fb..0ecb5f4a954ac10770479ea6d83e402229bacf66 100644
--- a/debug.py
+++ b/debug.py
@@ -1,5 +1,5 @@
 from dataset import Dataset
-
+import bindataset
 
 _DEFAULT_TRAIN_FILEPATH = "/mounts/Datasets3/2022-ChallengePlankton/sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin"
 _DEFAULT_TEST_FILEPATH = (
@@ -8,10 +8,32 @@ _DEFAULT_TEST_FILEPATH = (
     "/mounts/Datasets3/2022-ChallengePlankton/sub_2CMEMS-MEDSEA-2017-testing.nc.bin"
 )
 
 idx ="sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx"
 
-data = Dataset(_DEFAULT_TRAIN_FILEPATH, False, True, idx, 1, None, None)
+data = Dataset(_DEFAULT_TRAIN_FILEPATH, overwrite_index=False, train=False, subset_file=idx, num_days=20, transform=None, target_transform=None)
+
+
+"""
+Builds a point dataset, generating the index if necessary or requested
+
+Arguments:
+    filepath: the full path to the nc file to load
+    overwrite_index: if True, ignores the index and regenerates it
+    train: if True, accessing an element also returns the phyc
+    subset_file: a filename which holds a list of indices this dataset must use
+    num_days: the number of days that each sample considers
+    transform: a transform to apply to the input tensor
+    target_transform: a transform to apply to the phyc output tensor
+"""
+
+
+print("Length of the whole dataset:")
+print(len(data))
+print()
+
+print("Shape of data[0]:")
+print(data[0].shape)
 
-print(len(data[0]))
+print(data.in_variables)
+print(len(data.in_variables))
-print(data[0][0].shape)
-print(data[0][1])
\ No newline at end of file
+bindataset.test_time_dataset()
\ No newline at end of file
diff --git a/logs/dataloader_unit_test.log b/logs/dataloader_unit_test.log
new file mode 100644
index 0000000000000000000000000000000000000000..a16be9d991b853dafd079359f936ca4db0ccb9ca
--- /dev/null
+++ b/logs/dataloader_unit_test.log
@@ -0,0 +1,53 @@
+INFO:root:====> Test dataloader
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 50154984 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 50154984 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 40123681 samples
+INFO:root: - The valid fold has 10031303 samples
+INFO:root:Got a minibatch of size torch.Size([128, 1, 18]) -> torch.Size([128, 1])
+INFO:root:====> Test dataloader
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 50154984 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 50154984 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 40125966 samples
+INFO:root: - The valid fold has 10029018 samples
+INFO:root:Got a minibatch of size torch.Size([128, 1, 18]) -> torch.Size([128, 1])
+INFO:root:====> Test dataloader
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:====> Test dataloader
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:====> Test dataloader
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 50154984 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 50154984 samples
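The unit-test log above pins down the tensor shapes the rest of the pipeline assumes: minibatches of torch.Size([128, 1, 18]) -> torch.Size([128, 1]), i.e. (batch, num_days, num_features) inputs against what appears to be (batch, num_days) phyc targets. main.py below infers input_size from exactly that last dimension; a minimal illustration (shapes taken from the log, names hypothetical):

    import torch

    X = torch.zeros(128, 1, 18)  # (batch, num_days, num_features), as logged
    Y = torch.zeros(128, 1)      # (batch, num_days) phyc target, as logged
    input_size = X.shape[-1]     # 18 input features per day, fed to the regressor
    assert input_size == 18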
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6c2bf609a4c49a3036d8a88b8f87fb4b4440031
--- /dev/null
+++ b/main.py
@@ -0,0 +1,44 @@
+import os
+
+import torch
+import yaml
+
+import dataloader
+import model
+import test
+import train
+from utils import generate_unique_logpath
+
+if __name__ == "__main__":
+    with open("config.yml", "r") as config_file:
+        cfg = yaml.safe_load(config_file)
+
+    use_cuda = torch.cuda.is_available()
+    trainpath = cfg["Dataset"]["_DEFAULT_TRAIN_FILEPATH"]
+    num_days = cfg["Dataset"]["num_days"]
+    batch_size = cfg["Dataset"]["batch_size"]
+    num_workers = cfg["Dataset"]["num_workers"]
+    valid_ratio = cfg["Dataset"]["valid_ratio"]
+    max_num_samples = cfg["Dataset"]["max_num_samples"]
+
+    train_loader, valid_loader = dataloader.get_dataloaders(
+        trainpath,
+        num_days,
+        batch_size,
+        num_workers,
+        use_cuda,
+        valid_ratio,
+        overwrite_index=True,
+        max_num_samples=max_num_samples,
+    )
+
+    # The minibatches are (batch, num_days, num_features); the regressor acts on the last dimension
+    input_size = next(iter(train_loader))[0].shape[-1]
+    net = model.build_model(cfg, input_size)
+
+    top_logdir = "./logs"
+    logdir = generate_unique_logpath(top_logdir, "linear")
+    print("Logging to {}".format(logdir))
+    # -> Prints out: Logging to ./logs/linear_0
+    if not os.path.exists(logdir):
+        os.mkdir(logdir)
\ No newline at end of file
diff --git a/model.py b/model.py
index a376ef84c27aba6bda2179b717c1393631d6285d..7a393b4e8c3503c75ee50f3f998ef4ed1be3b4c6 100644
--- a/model.py
+++ b/model.py
@@ -3,4 +3,15 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd import Function
+import models.linear
+
+def build_model(cfg, input_size):
+    # Look the model class up by name in models/linear.py instead of eval-ing a config string
+    return getattr(models.linear, cfg["Model"]["Name"])(cfg, input_size)
+
+if __name__ == "__main__":
+    import yaml
+    with open("config.yml", "r") as config_file:
+        cfg = yaml.safe_load(config_file)
+    print(cfg["Model"]["Name"])
\ No newline at end of file
diff --git a/models/linear.py b/models/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..0627fad3d6efbc74cf192cc1e6389a0a9fd5b8dc
--- /dev/null
+++ b/models/linear.py
@@ -0,0 +1,11 @@
+import torch.nn as nn
+
+class LinearRegression(nn.Module):
+    def __init__(self, cfg, input_size):
+        super(LinearRegression, self).__init__()
+        self.input_size = input_size
+        self.bias = cfg["LinearRegression"]["Bias"]
+        self.regressor = nn.Linear(input_size, 1, bias=self.bias)
+
+    def forward(self, x):
+        return self.regressor(x)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..986dc9f9b72feee2c5129c1870bba62b0d7873bf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+torch
+pyyaml
\ No newline at end of file
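A quick smoke test of the model-building path added above, with a config dict mirroring config.yml (values hypothetical; input_size=18 taken from the dataloader log):

    import torch
    import model

    cfg = {
        "Model": {"Name": "LinearRegression"},
        "LinearRegression": {"Bias": True},
    }
    net = model.build_model(cfg, input_size=18)
    out = net(torch.zeros(128, 1, 18))  # nn.Linear acts on the last dimension
    print(out.shape)                    # torch.Size([128, 1, 1])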
diff --git a/test.py b/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..80d9e454853e34adead85e949683cb3c92cc056c
--- /dev/null
+++ b/test.py
@@ -0,0 +1,54 @@
+import torch
+
+def test(model, loader, f_loss, device):
+    """
+    Test a model by iterating over the loader
+
+    Arguments :
+
+        model  -- A torch.nn.Module object
+        loader -- A torch.utils.data.DataLoader
+        f_loss -- The loss function, i.e. a loss Module
+        device -- The device to use for computation
+
+    Returns :
+
+        A tuple with the mean loss and mean accuracy
+
+    """
+    # We disable gradient computation, which speeds up the computation
+    # and reduces the memory usage
+    with torch.no_grad():
+        model.eval()
+        N = 0
+        tot_loss, correct = 0.0, 0.0
+        for _, (inputs, targets) in enumerate(loader):
+
+            # We got a minibatch from the loader within inputs and targets
+            # With a minibatch size of 128, we have the following shapes:
+            # inputs is of shape (128, 1, 18)
+            # targets is of shape (128, 1)
+
+            # We need to copy the data on the GPU if we use one
+            inputs, targets = inputs.to(device), targets.to(device)
+
+            # Compute the forward pass, i.e. the scores for each input sample
+            outputs = model(inputs)
+
+            # We accumulate the exact number of processed samples
+            N += inputs.shape[0]
+
+            # We accumulate the loss
+            # The multiplication by inputs.shape[0] is due to the fact
+            # that our loss criterion is averaging over its samples
+            tot_loss += inputs.shape[0] * f_loss(outputs, targets).item()
+
+            # For the accuracy, we compute the labels for each input sample
+            # Be careful, the model is outputting scores and not probabilities
+            # But given the softmax is not altering the rank of its input scores,
+            # we can compute the label by argmaxing the scores directly
+            # Note: this accuracy is only meaningful for classification outputs;
+            # for the regression target here, the mean loss is the metric to watch
+            predicted_targets = outputs.argmax(dim=1)
+            correct += (predicted_targets == targets).sum().item()
+    return tot_loss / N, correct / N
\ No newline at end of file
diff --git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce6f54c9315a80af78a1d46ba765235fb81fc4f6
--- /dev/null
+++ b/train.py
@@ -0,0 +1,33 @@
+def train(model, loader, f_loss, optimizer, device):
+    """
+    Train a model for one epoch, iterating over the loader
+    using the f_loss to compute the loss and the optimizer
+    to update the parameters of the model.
+
+    Arguments :
+
+        model     -- A torch.nn.Module object
+        loader    -- A torch.utils.data.DataLoader
+        f_loss    -- The loss function, i.e. a loss Module
+        optimizer -- A torch.optim.Optimizer object
+        device    -- A torch.device specifying the device
+                     used for computation
+
+    Returns :
+
+        Nothing, the model parameters are updated in place
+    """
+
+    model.train()
+
+    for _, (inputs, targets) in enumerate(loader):
+        inputs, targets = inputs.to(device), targets.to(device)
+
+        # Compute the forward pass through the network up to the loss
+        outputs = model(inputs)
+        loss = f_loss(outputs, targets)
+
+        # Backward and optimize
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
\ No newline at end of file
diff --git a/train_indices.subset b/train_indices.subset
index 20e13c16236888ceacd42847424c1c598cbf0fc6..52f50d5e0619ed0d361ba53becfb35495ae718be 100644
Binary files a/train_indices.subset and b/train_indices.subset differ
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6b3e52f630f8b2af68e562cffc74fa31058cbfc
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,11 @@
+import os
+
+def generate_unique_logpath(logdir, raw_run_name):
+    # Return the first logdir/raw_run_name_i (i = 0, 1, ...) that does not exist yet
+    i = 0
+    while True:
+        run_name = raw_run_name + "_" + str(i)
+        log_path = os.path.join(logdir, run_name)
+        if not os.path.isdir(log_path):
+            return log_path
+        i = i + 1
diff --git a/valid_indices.subset b/valid_indices.subset
index 457f1d117f15460c2c1cc0aa881eae34b069767f..8733e1ea294c6b7e278286602c31a9ec5689b66a 100644
Binary files a/valid_indices.subset and b/valid_indices.subset differ
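main.py stops after creating the log directory; the new train/test helpers are not wired in yet. A minimal sketch of the missing epoch loop, continuing inside main.py's __main__ block and assuming MSE loss with Adam (neither the loss nor the optimizer is fixed anywhere in this patch):

    import torch.nn as nn

    device = torch.device("cuda" if use_cuda else "cpu")
    net = net.to(device)
    f_loss = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters())

    for epoch in range(10):
        train.train(net, train_loader, f_loss, optimizer, device)
        # test.test returns (mean loss, accuracy); only the loss is meaningful here
        val_loss, _ = test.test(net, valid_loader, f_loss, device)
        print(f"Epoch {epoch}: validation loss {val_loss:.5f}")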