diff --git a/config.yml b/config.yml
index db94f2477a3ba9c04f9e44551398d661ee12d68d..31d1d30ef1b40ce6c9b4a6627919af2269346f7c 100644
--- a/config.yml
+++ b/config.yml
@@ -11,6 +11,12 @@ Dataset:
   _ENCODING_INDEX: "I" # h(short) with 2 bytes should be sufficient
   _ENCODING_OFFSET_FORMAT: ""
   _ENCODING_ENDIAN: "<"
+  Transform: dataloader.composite_transform(dataloader.transform_remove_space_time(), dataloader.transform_normalize_with_train_statistics(MEAN, STD))
+  # Available transforms:
+  #   dataloader.transform_remove_space_time()
+  #   dataloader.transform_normalize_with_train_statistics(MEAN, STD)
+  #   dataloader.transform_min_max_scaling(MIN, MAX)
+  #   dataloader.composite_transform(dataloader.transform_remove_space_time(), dataloader.transform_min_max_scaling(MIN, MAX))
 
 # Data Transformation
 ApproximativeStats: True
@@ -31,6 +37,7 @@ ApproximativeMini: "torch.tensor([ 4.1479e+01,  6.0000e+00,  1.0182e+00,  1.2623
          7.9218e+00,  1.0000e-11,  3.7171e+01,  2.5584e+00,  1.2075e+01,
         -1.2436e+00, -9.9256e-01, -8.8131e-01])"
+
 #Optimizer selection
 Optimizer: Adam # in {Adam}
@@ -40,7 +47,7 @@ Training:
 
 #Model selection
 Model:
-  Name: CNN1D
+  Name: BidirectionalLSTM
   #choose in {LinearRegression, BidirectionalLSTM, RNN}
 
 #Model parameters selection
@@ -51,9 +58,10 @@ LinearRegression:
   Initialization: init_he
 
 BidirectionalLSTM:
-  HiddenSize: 16
-  NumLayers: 1
-  Dropout: 0.2
+  HiddenSize: 32
+  NumLayers: 4
+  LSTMDropout: 0
+  FFNDropout: 0.2
   NumFFN: 8
   Initialization: None
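The Transform entry is a raw Python expression that main.py now evaluates (see the main.py hunk further down), so the preprocessing pipeline can be swapped from the config alone. Below is a minimal sketch of the chaining semantics this relies on; the real dataloader factories may differ, and the 0.0/10.0 extrema are placeholders rather than the train-set statistics.

```python
import torch

# Sketch only: plausible behaviour of the composed transforms named above.
def composite_transform(*transforms):
    """Chain transforms: apply each one to the sample in order."""
    def apply(sample):
        for t in transforms:
            sample = t(sample)
        return sample
    return apply

def transform_min_max_scaling(minimum, maximum):
    """Rescale features into [0, 1] using the given extrema."""
    def apply(sample):
        return (sample - minimum) / (maximum - minimum)
    return apply

transform = composite_transform(transform_min_max_scaling(0.0, 10.0))
print(transform(torch.tensor([2.0, 5.0])))  # tensor([0.2000, 0.5000])
```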
diff --git a/logs/main_unit_test.log b/logs/main_unit_test.log
index b1a1cf012abbfb66418dfe32cdd53ffdd8c03251..f9c0a53ef7ff80542a153d83a7eef4fb9be74ffd 100644
--- a/logs/main_unit_test.log
+++ b/logs/main_unit_test.log
@@ -2339,3 +2339,116 @@ INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and
 INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
 INFO:root: - The train fold has 541585 samples
 INFO:root: - The valid fold has 135575 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 542010 samples
+INFO:root: - The valid fold has 135150 samples
+INFO:root:Building the dataloader
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 365 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2017-testing.nc.bin_index.idx
+INFO:root:I loaded 112860 values in the test set
+INFO:root:= Filling in the submission file
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 541712 samples
+INFO:root: - The valid fold has 135448 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 541776 samples
+INFO:root: - The valid fold has 135384 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 541703 samples
+INFO:root: - The valid fold has 135457 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 541380 samples
+INFO:root: - The valid fold has 135780 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 542076 samples
+INFO:root: - The valid fold has 135084 samples
+INFO:root:= Dataloaders
+INFO:root: - Dataset creation
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Generating the index
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - Loaded a dataset with 677160 samples
+INFO:root: - Splitting the data in training and validation sets
+INFO:root:Generating the subset files from 677160 samples
+INFO:root: - Subset dataset
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root:The loaded dataset contains 25 latitudes, 37 longitudes, 28 depths and 2222 time points
+INFO:root:Loading the index from sub_2CMEMS-MEDSEA-2010-2016-training.nc.bin_index.idx
+INFO:root: - The train fold has 541214 samples
+INFO:root: - The valid fold has 135946 samples
diff --git a/main.py b/main.py
index e5edb4ad7abe53794cf3328b6ffe084b6f5afa23..07caa23043011ec203c9a082222b94a11d81d1d7 100644
--- a/main.py
+++ b/main.py
@@ -59,6 +59,8 @@ if __name__ == "__main__":
     log_freq = int(cfg["Wandb"]["log_freq"])
     log_interval = int(cfg["Wandb"]["log_interval"])
 
+    dataset_transform = cfg["Dataset"]["Transform"]
+
     if not args.no_wandb:
         wandb.init(entity = "wherephytoplankton", project = "Kaggle phytoplancton", config = {"batch_size": batch_size, "epochs": epochs})
@@ -93,8 +95,8 @@ if __name__ == "__main__":
         valid_ratio,
         overwrite_index = True,
         max_num_samples=max_num_samples,
-        train_transform=dataloader.composite_transform(dataloader.transform_remove_space_time(), dataloader.transform_min_max_scaling(MIN, MAX)),
-        valid_transform=dataloader.composite_transform(dataloader.transform_remove_space_time(), dataloader.transform_min_max_scaling(MIN, MAX))
+        train_transform=eval(dataset_transform),
+        valid_transform=eval(dataset_transform)
     )
 
     if use_cuda :
@@ -133,4 +135,4 @@ if __name__ == "__main__":
 
     utils.write_summary(logdir, network, optimizer, val_loss)
 
-    create_submission.create_submission(network, dataloader.composite_transform(dataloader.transform_remove_space_time(), dataloader.transform_min_max_scaling(MIN, MAX)), device, rootDir, logdir)
+    create_submission.create_submission(network, eval(dataset_transform), device, rootDir, logdir)
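Reading the expression once into dataset_transform and eval-uating it at each call site keeps the three pipelines (train, valid, submission) in sync. A self-contained illustration of the mechanism follows; the stub dataloader namespace is an assumption standing in for the project module, not its real API.

```python
import types
import yaml

# Stub standing in for the project's dataloader module; the real factory
# returns a callable applied to each sample (behaviour assumed here).
dataloader = types.SimpleNamespace(
    transform_remove_space_time=lambda: (lambda sample: sample)
)

cfg = yaml.safe_load(
    "Dataset:\n"
    "  Transform: dataloader.transform_remove_space_time()\n"
)
dataset_transform = cfg["Dataset"]["Transform"]
transform = eval(dataset_transform)  # needs dataloader in scope, as in main.py
print(transform("unchanged"))  # the stub acts as the identity transform
```

Since eval executes whatever the config contains, the config file has to be trusted; a dict mapping names to transform factories is a common stricter alternative.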
diff --git a/model.py b/model.py
index 58bb42c9b2a78e93619957a05a2f04d6e6f0485c..e0357c26028787515848e2d5fe8540c527da772a 100644
--- a/model.py
+++ b/model.py
@@ -76,10 +76,11 @@ class BidirectionalLSTM(nn.Module):
         super(BidirectionalLSTM, self).__init__()
         self.hidden_size = cfg["BidirectionalLSTM"]["HiddenSize"]
         self.num_layers = cfg["BidirectionalLSTM"]["NumLayers"]
-        self.dropout = cfg["BidirectionalLSTM"]["Dropout"]
+        self.LSTM_dropout = cfg["BidirectionalLSTM"]["LSTMDropout"]
+        self.FFN_dropout = cfg["BidirectionalLSTM"]["FFNDropout"]
         self.num_ffn = cfg["BidirectionalLSTM"]["NumFFN"]
 
-        self.lstm = nn.LSTM(input_size, self.hidden_size, self.num_layers, batch_first = True, bidirectional =True, dropout = 0.2)
+        self.lstm = nn.LSTM(input_size, self.hidden_size, self.num_layers, batch_first=True, bidirectional=True, dropout=self.LSTM_dropout)
         self.fc = nn.Sequential()
 
         for layer in range(self.num_ffn):
@@ -92,7 +93,7 @@ class BidirectionalLSTM(nn.Module):
             )
             self.fc.add_module(
                 f"dropout_{layer}",
-                nn.Dropout(p=self.dropout)
+                nn.Dropout(p=self.FFN_dropout)
             )
         self.fc.add_module(
             "last_linear",
@@ -120,16 +121,15 @@ class CNN1D(torch.nn.Module):
     def __init__(self, cfg, num_inputs):
         super(CNN1D, self).__init__()
         self.model = torch.nn.Sequential(
-            *conv_block(num_inputs, 6, 0.01),
-            *conv_block(64, 7, 0.01),
-            *conv_block(128,8,0.01)
+            *conv_block(num_inputs, 32),
+            *conv_block(32, 128)
         )
         self.avg_pool = torch.nn.AdaptiveAvgPool1d(1)
 
-        self.fc = nn.Sequential(
-            nn.Linear(256, 32),
+        self.ffn = nn.Sequential(
+            nn.Linear(128, 128),
             nn.ReLU(),
-            nn.Linear(32,cfg["Dataset"]["num_days"])
+            nn.Linear(128, cfg["Dataset"]["num_days"])
         )
 
     def forward(self, x):
@@ -137,25 +137,21 @@ class CNN1D(torch.nn.Module):
 
         out = self.model(x)
+        print(f"This is after CNN : {out}")
         out = self.avg_pool(out)
         out = out.view([out.shape[0], -1])
-        out = self.fc(out)
+        #print(f"{out.shape} this is out.shape")
+        out = self.ffn(out)
         out = out.view([out.shape[0], out.shape[1], 1])
 
         return out
 
-def conv_block(in_channels, power, dropout_p):
+def conv_block(in_channels, out_channels):
     return [
-        torch.nn.Conv1d(in_channels, 2**power, 16),
-        #torch.nn.BatchNorm1d(2**power),
-        torch.nn.LeakyReLU(),
-        torch.nn.Dropout(p=dropout_p),
-        torch.nn.Conv1d(2**power, 2**power, 8),
-        torch.nn.BatchNorm1d(2**power),
-        torch.nn.LeakyReLU(),
-        #torch.nn.Dropout(p=dropout_p),
-        torch.nn.MaxPool1d(2, stride = 1)
+        torch.nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
+        torch.nn.ReLU(),
+        torch.nn.BatchNorm1d(out_channels),
     ]
 
 # Initialization
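Two shape facts the new model code relies on: a bidirectional LSTM feeds its FFN head 2 * HiddenSize features, and the rewritten conv_block (kernel 3, stride 1, padding 1) preserves the temporal length, unlike the old kernel-16/kernel-8 stack. Note also that PyTorch applies the LSTM dropout argument only between stacked layers, so a separate LSTMDropout becomes meaningful now that NumLayers is 4. A quick check under assumed sizes (the 14-feature input is a placeholder, not the dataset's actual width):

```python
import torch
import torch.nn as nn

# Placeholder sizes: batch 2, 10 time steps, 14 features, HiddenSize 32,
# NumLayers 4.
B, T, F, H, L = 2, 10, 14, 32, 4

lstm = nn.LSTM(F, H, L, batch_first=True, bidirectional=True, dropout=0.0)
out, _ = lstm(torch.randn(B, T, F))
print(out.shape)  # torch.Size([2, 10, 64]): last dim is 2 * HiddenSize

# kernel_size=3, stride=1, padding=1 keeps the temporal length intact.
block = nn.Sequential(
    nn.Conv1d(F, 32, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.BatchNorm1d(32),
)
print(block(torch.randn(B, F, T)).shape)  # torch.Size([2, 32, 10])
```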
diff --git a/train_indices.subset b/train_indices.subset
index 82ff06c8175de13333850a1293f875a45886aa20..4be28c3fddee04d316099595035195d190335a28 100644
Binary files a/train_indices.subset and b/train_indices.subset differ
diff --git a/valid_indices.subset b/valid_indices.subset
index 2879dd700471194d299571d05e917450856b3b49..71e2f411ff24dec16c1ec6873ea24d6a85512515 100644
Binary files a/valid_indices.subset and b/valid_indices.subset differ
diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log
index 3245817b8c808dd5a25d2e3f8f4b128923c4a702..86e3c6db85a338ab8d678250d88aa44012db1885 120000
--- a/wandb/debug-internal.log
+++ b/wandb/debug-internal.log
@@ -1 +1 @@
-run-20230204_013952-5w9xw0aw/logs/debug-internal.log
\ No newline at end of file
+run-20230204_131220-udsbqzli/logs/debug-internal.log
\ No newline at end of file
diff --git a/wandb/debug.log b/wandb/debug.log
index ed90b7c5344b414587e2d31a200dbf31684c1568..f159925d26b61b22c8fb7b0fc268b4daba398c32 120000
--- a/wandb/debug.log
+++ b/wandb/debug.log
@@ -1 +1 @@
-run-20230204_013952-5w9xw0aw/logs/debug.log
\ No newline at end of file
+run-20230204_131220-udsbqzli/logs/debug.log
\ No newline at end of file
diff --git a/wandb/latest-run b/wandb/latest-run
index 9fa788a3b33b1984e1c4241a02fb397cf5ef142f..c44486eefedb3139505e9e4dd55a4a1c8354a55b 120000
--- a/wandb/latest-run
+++ b/wandb/latest-run
@@ -1 +1 @@
-run-20230204_013952-5w9xw0aw
\ No newline at end of file
+run-20230204_131220-udsbqzli
\ No newline at end of file