diff --git a/src/bert_training.py b/src/bert_training.py index 1e9812d..e227de3 100644 --- a/src/bert_training.py +++ b/src/bert_training.py @@ -269,13 +269,13 @@ def test(start_time, epoch, batches_per_epoch, batch_size, model, optimizer, cre ######################################### Training ################################################################ -h_dim = 32 +h_dim = 64 category_feature_dim = 8 layers_num = 6 num_heads = 2 class_num = 1 -dataset_dropout_rate = 0.4 -classifier_dropout_date = 0.4 +features_dropout_rate = 0.4 +model_dropout_date = 0.4 epochs = 500 batch_size = 30000 datasets_per_epoch = 1 @@ -285,7 +285,7 @@ comment = sys.argv[1] logs_dir = f'runs/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/' writer = SummaryWriter(logs_dir) сheсkpoints_dir = f'checkpoints/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/' -script_snapshot_path = Path(logs_dir + "bert_training_ddp.py") +script_snapshot_path = Path(logs_dir + Path(sys.argv[0]).name) Path(сheсkpoints_dir).mkdir(parents=True, exist_ok=True) print("Logs dir:", logs_dir) print("Chekpoints dir:", сheсkpoints_dir) @@ -298,7 +298,7 @@ credit_train_dataset = CreditProductsDataset( targets_path="/wd/data/train_target.csv", train_uniq_client_ids_path=f"/wd/fold3_train_ids.csv", test_uniq_client_ids_path=f"/wd/fold3_test_ids.csv", - dropout_rate=dataset_dropout_rate + dropout_rate=features_dropout_rate ) print(f"Dataset preparation time: {datetime.now() - start_prep_time}") @@ -316,7 +316,7 @@ classifier = BertClassifier( h_dim=h_dim, class_num=class_num, max_seq_len=credit_train_dataset.max_user_history, - dropout_rate = classifier_dropout_date + dropout_rate = model_dropout_date ) model = Model(encoder=encoder, classifier=classifier).to("cuda") @@ -424,4 +424,4 @@ finally: rocauc=rocauc, сheсkpoints_dir=сheсkpoints_dir ) - writer.close() \ No newline at end of file + writer.close() diff --git a/src/bert_training_ddp.py b/src/bert_training_ddp.py index 70700d9..f6e460d 100644 --- a/src/bert_training_ddp.py +++ b/src/bert_training_ddp.py @@ -329,7 +329,7 @@ if __name__ == "__main__": logs_dir = f'runs/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/' writer = SummaryWriter(logs_dir) сheсkpoints_dir = f'checkpoints/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/' - script_snapshot_path = Path(logs_dir + "bert_training_ddp.py") + script_snapshot_path = Path(logs_dir + Path(sys.argv[0]).name) Path(сheсkpoints_dir).mkdir(parents=True, exist_ok=True) print("Logs dir:", logs_dir) print("Chekpoints dir:", сheсkpoints_dir) @@ -449,4 +449,4 @@ if __name__ == "__main__": if rank == 0: writer.close() torch.distributed.destroy_process_group() - \ No newline at end of file + diff --git a/src/bert_training_dp.py b/src/bert_training_dp.py index 1f18504..71daa3c 100644 --- a/src/bert_training_dp.py +++ b/src/bert_training_dp.py @@ -24,7 +24,7 @@ Path(сheсkpoints_dir).mkdir(parents=True, exist_ok=True) print("Logs dir:", logs_dir) print("Chekpoints dir:", сheсkpoints_dir) writer = SummaryWriter(logs_dir) -script_snapshot_path = Path(logs_dir + "bert_training.py") +script_snapshot_path = Path(logs_dir + Path(sys.argv[0]).name) script_snapshot_path.write_bytes(Path(sys.argv[0]).read_bytes()) # copy this version of script script_snapshot_path.chmod(0o400) # with read-only permission @@ -366,4 +366,4 @@ finally: credit_dataset=credit_train_dataset, encoder = encoder, model=model, optimizer=optimizer, epoch=epoch+1, loss=loss.item(), rocauc=test_auroc.compute().item(), сheсkpoints_dir=сheсkpoints_dir) - writer.close() \ No newline at end of file + writer.close()