added padding mask

main
Vladimir 2 weeks ago
parent 3986a4d5c7
commit 6d7e1b0095

@@ -11,6 +11,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python src/bert_training_dp.py fold3_18l_dy
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --standalone --nproc-per-node=8 src/bert_training_ddp.py fold3_18l_dyt_04_04_3750
```
-Logging is done with TensorBoard to the `./runs/` directory. The current version of the script is copied into the log directory on startup. Model checkpoints are saved to the `./checkpoints/` directory.
+Logging is done with TensorBoard to the `./logs/` directory. The current version of the script is copied into the log directory on startup. Model checkpoints are saved to the `./checkpoints/` directory.
The train/test split is performed by the `train_test_split.py` script.

@@ -2,6 +2,7 @@ import os
import sys
# DEVICE_IDX = sys.argv[1]
# os.environ['CUDA_VISIBLE_DEVICES'] = f"{DEVICE_IDX}"
+comment = sys.argv[1]
from torch import nn
import torch
@@ -14,9 +15,10 @@ from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import schedulefree
+import einops
from einops import rearrange, repeat
import torch.nn.functional as F
+from typing import Optional
import functools
def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, rocauc, checkpoints_dir):
@@ -48,8 +50,7 @@ def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, roca
class CreditProductsDataset:
def __init__(self,
features_path, targets_path, train_test_split_ratio=0.9,
-train_uniq_client_ids_path=None, test_uniq_client_ids_path=None,
-dropout_rate=0.0
+train_uniq_client_ids_path=None, test_uniq_client_ids_path=None
):
self.__dict__.update({k:v for k,v in locals().items() if k != 'self'})
if Path(self.train_uniq_client_ids_path).exists():
@@ -97,6 +98,8 @@ class CreditProductsDataset:
self.cat_features = pad_sequence(torch.split(self.cat_features, self.user_seq_lengths.tolist()), batch_first=True) # implicit max seq
self.num_features = torch.tensor(self.features_df[self.num_columns].values, dtype=torch.float32)
self.num_features = pad_sequence(torch.split(self.num_features, self.user_seq_lengths.tolist()), batch_first=True)
+self.padding_mask = torch.ones(len(self.features_df), dtype=torch.bool)
+self.padding_mask = pad_sequence(torch.split(self.padding_mask, self.user_seq_lengths.tolist()), batch_first=True)
self.targets_df = self.targets_df.set_index('id')
self.targets_df = self.targets_df.sort_index()
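For reference, a minimal standalone sketch (toy sequence lengths, not real data) of what the two added lines produce: `pad_sequence` pads the per-client boolean vectors with `False`, so every padded position is marked invalid.

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# hypothetical per-client sequence lengths
user_seq_lengths = [3, 1, 2]

# flat mask over all events, split per client, padded to the longest sequence;
# pad_sequence fills missing positions with 0, i.e. False for a bool tensor
flat_mask = torch.ones(sum(user_seq_lengths), dtype=torch.bool)
padding_mask = pad_sequence(torch.split(flat_mask, user_seq_lengths), batch_first=True)

print(padding_mask)
# tensor([[ True,  True,  True],
#         [ True, False, False],
#         [ True,  True, False]])
```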
@@ -104,20 +107,22 @@
def get_train_batch(self, batch_size=4):
sampled_ids = np.random.choice(self.train_uniq_client_ids, batch_size, replace=False) # think about replace=True
-cat_features_batch = self.cat_features[sampled_ids]
-num_features_batch = self.num_features[sampled_ids]
-cat_features_batch *= torch.empty_like(cat_features_batch).bernoulli_(1-self.dropout_rate) # arg is keep_probability
-num_features_batch *= torch.empty_like(num_features_batch).bernoulli_(1-self.dropout_rate)
-targets_batch = self.targets[sampled_ids]
-return cat_features_batch, num_features_batch, targets_batch
+return (
+self.cat_features[sampled_ids],
+self.num_features[sampled_ids],
+self.padding_mask[sampled_ids],
+self.targets[sampled_ids]
+)
def get_test_batch_iterator(self, batch_size=4):
for i in range(0, len(self.test_uniq_client_ids), batch_size):
-ids = self.test_uniq_client_ids[i:i+batch_size]
-cat_features_batch = self.cat_features[ids]
-num_features_batch = self.num_features[ids]
-targets_batch = self.targets[ids]
-yield cat_features_batch, num_features_batch, targets_batch
+sampled_ids = self.test_uniq_client_ids[i:i+batch_size]
+yield (
+self.cat_features[sampled_ids],
+self.num_features[sampled_ids],
+self.padding_mask[sampled_ids],
+self.targets[sampled_ids]
+)
# for parallel data selection
class WrapperDataset(Dataset):
@@ -132,15 +137,15 @@ class WrapperDataset(Dataset):
return self.num_batches
def __getitem__(self, idx):
-cat_inputs, num_inputs, targets = self.credit_dataset.get_train_batch(batch_size=self.batch_size)
-return cat_inputs, num_inputs, targets
+cat_inputs, num_inputs, padding_mask, targets = self.credit_dataset.get_train_batch(batch_size=self.batch_size)
+return cat_inputs, num_inputs, padding_mask, targets
##################################### Model ###########################################################################################
class Encoder(nn.Module):
-def __init__(self, cat_columns, num_columns, cat_features_max_id, category_feature_dim=4, out_dim=64):
+def __init__(self, cat_columns, num_columns, cat_features_max_id, category_feature_dim=4, out_dim=64, features_dropout_rate=0.0):
super().__init__()
-self.__dict__.update({k:v for k,v in locals().items() if k != 'self'})
+self.__dict__.update({k:v for k,v in locals().items() if k != 'self'}) # all args are added as object variables
self.total_h_dim = len(self.cat_columns) * category_feature_dim + len(self.num_columns)
self.cat_embeds = nn.Embedding(cat_features_max_id + 1, self.category_feature_dim, padding_idx=0)
self.num_scales = nn.Parameter(torch.randn(1, len(self.num_columns)))
@@ -148,10 +153,11 @@ class Encoder(nn.Module):
self.proj = nn.Linear(self.total_h_dim, self.out_dim, bias=False)
def forward(self, cat_features_batch, num_features_batch):
-cat_embed_tensor = self.cat_embeds(cat_features_batch.type(torch.int32))
-cat_embed_tensor = cat_embed_tensor.reshape(cat_features_batch.shape[0], cat_features_batch.shape[1], -1)
-num_embed_tensor = self.num_scales * num_features_batch + self.num_shifts
-embed_tensor = torch.concat([cat_embed_tensor, num_embed_tensor], dim=-1)
+cat_embed_tensor = self.cat_embeds(cat_features_batch.data.type(torch.int32))
+cat_embed_tensor = cat_embed_tensor.reshape(cat_features_batch.data.shape[0], cat_features_batch.data.shape[1], -1)
+num_embed_tensor = self.num_scales * num_features_batch.data + self.num_shifts
+embed_tensor = torch.concat([cat_embed_tensor.data, num_embed_tensor.data], dim=-1)
+embed_tensor = F.dropout(embed_tensor, self.features_dropout_rate)
inputs = self.proj(embed_tensor)
return inputs
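As a side note, a minimal self-contained sketch of the encoder path after this change, with made-up dimensions: categorical codes go through an embedding with `padding_idx=0`, numeric features get an affine scale/shift, and the feature dropout is now applied once on the concatenated embedding instead of on the raw feature tensors in the dataset.

```python
import torch
from torch import nn
import torch.nn.functional as F

# toy dimensions (hypothetical, not the training config)
B, T, n_cat, n_num, cat_dim, out_dim = 2, 5, 3, 4, 4, 16

cat_embeds = nn.Embedding(10 + 1, cat_dim, padding_idx=0)   # id 0 embeds to zeros
num_scales = torch.randn(1, n_num)
num_shifts = torch.randn(1, n_num)
proj = nn.Linear(n_cat * cat_dim + n_num, out_dim, bias=False)

cat_features = torch.randint(0, 10 + 1, (B, T, n_cat))      # integer category codes
num_features = torch.randn(B, T, n_num)

cat_embed = cat_embeds(cat_features).reshape(B, T, -1)       # (B, T, n_cat * cat_dim)
num_embed = num_scales * num_features + num_shifts           # (B, T, n_num)
embed = torch.cat([cat_embed, num_embed], dim=-1)            # (B, T, n_cat * cat_dim + n_num)
embed = F.dropout(embed, p=0.1)                              # feature dropout, now inside the encoder
out = proj(embed)
print(out.shape)  # torch.Size([2, 5, 16])
```

One thing to keep in mind with this form: `F.dropout` defaults to `training=True`, so it stays active at eval time unless `training=self.training` (or an `nn.Dropout` module) is used.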
@@ -204,24 +210,32 @@ class TransformerLayer(nn.Module):
self.rope = RoPE(dim=h_dim//self.num_heads, max_seq_len=max_seq_len)
def split_to_heads(self, x, B, T, H):
-return rearrange(x, 'b t (n h) -> (b n) t h', b=B, t=T, n=self.num_heads) if self.num_heads > 1 else x
+if self.num_heads <= 1: return x
+return rearrange(x, 'b t (n h) -> (b n) t h', b=B, t=T, n=self.num_heads)
def gather_heads(self, x, B, T, H):
-return rearrange(x, '(b n) t h -> b t (n h)', b=B, t=T, n=self.num_heads) if self.num_heads > 1 else x
+if self.num_heads <= 1: return x
+return rearrange(x, '(b n) t h -> b t (n h)', b=B, t=T, n=self.num_heads)
-def attention(self, x):
+def attention(self, x, padding_mask):
+padding_mask = padding_mask.unsqueeze(-1).expand(*padding_mask.shape+(self.num_heads,))
+padding_mask = self.split_to_heads(padding_mask, *padding_mask.shape)
q = self.rope(self.split_to_heads(self.q_proj(x), *x.shape))
k = self.rope(self.split_to_heads(self.k_proj(x), *x.shape))
v = self.split_to_heads(self.v_proj(x), *x.shape)
scores = (q @ k.transpose(1, 2)) * (self.h_dim ** -0.5)
+scores = scores.masked_fill(~padding_mask, -1e9)
attention = nn.functional.softmax(scores, dim=2)
return self.o_proj(self.gather_heads(attention @ v, *x.shape))
-def forward(self, x):
-x = x + F.dropout1d(self.attention(self.ln1(x)), p=self.dropout_rate)
+def forward(self, x, padding_mask):
+x = x + F.dropout1d(self.attention(self.ln1(x), padding_mask), p=self.dropout_rate)
x = x + F.dropout1d(self.ff2(F.gelu(self.ff1(self.ln2(x)))), p=self.dropout_rate)
return x
+def prepend(element, tensor):
+return torch.cat([element.expand([tensor.shape[0], element.shape[1], tensor.shape[2]]), tensor], dim=1)
class BertClassifier(nn.Module):
def __init__(self, layers_num=1, h_dim=64, class_num=2, max_seq_len=128, num_heads=4, dropout_rate = 0.1):
super().__init__()
@@ -232,11 +246,12 @@ class BertClassifier(nn.Module):
self.classifier_head = nn.Sequential(nn.Linear(h_dim, h_dim), nn.GELU(), nn.Linear(h_dim, class_num))
self.pos_embeds = nn.Parameter(torch.randn(1, self.max_seq_len, h_dim))
-def forward(self, x):
-x = torch.concat([self.cls_token.expand([x.shape[0], self.cls_token.shape[1], self.cls_token.shape[2]]), x], dim=1)
+def forward(self, x, padding_mask):
+x = prepend(self.cls_token, x)
+padding_mask = torch.cat([torch.ones(x.shape[0], 1, dtype=torch.bool, device=x.device), padding_mask], dim=1)
x = x + self.pos_embeds[:, :x.shape[1], :]
for l in self.layers:
-x = l(x)
+x = l(x, padding_mask)
x = self.classifier_head(x[:,0,:])
return x[:,:] if self.class_num > 1 else x[:,0]
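A minimal standalone illustration (toy numbers) of the masking pattern these layers rely on: positions marked `False` are pushed to a large negative value before the softmax, and the prepended CLS token contributes an always-`True` entry. The exact broadcasting in `TransformerLayer.attention` differs, since the mask there is expanded per attention head.

```python
import torch
import torch.nn.functional as F

# toy attention scores for one head: (batch=1, queries=4, keys=4)
scores = torch.randn(1, 4, 4)

# mask for 3 real tokens (last one is padding); forward() prepends an
# always-True entry for the CLS token
token_mask = torch.tensor([[True, True, False]])
padding_mask = torch.cat([torch.ones(1, 1, dtype=torch.bool), token_mask], dim=1)  # (1, 4)

# broadcast over the query dimension, silence padded keys, renormalize
masked = scores.masked_fill(~padding_mask.unsqueeze(1), -1e9)
attention = F.softmax(masked, dim=2)
print(attention[0, 0])  # last weight is ~0: the padded key gets no attention
```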
@@ -246,20 +261,21 @@ class Model(nn.Module):
self.encoder = encoder
self.classifier = classifier
-def forward(self, cat_inputs, num_inputs):
+def forward(self, cat_inputs, num_inputs, padding_mask):
inputs = self.encoder(cat_inputs, num_inputs)
-return self.classifier(inputs)
+return self.classifier(inputs, padding_mask)
def test(start_time, epoch, batches_per_epoch, batch_size, model, optimizer, credit_dataset, test_auroc, writer):
model.eval()
optimizer.eval()
with torch.no_grad():
test_iterator = credit_dataset.get_test_batch_iterator(batch_size=batch_size)
-for test_batch_id, (test_cat_inputs, test_num_inputs, test_targets) in enumerate(test_iterator):
+for test_batch_id, (test_cat_inputs, test_num_inputs, test_padding_mask, test_targets) in enumerate(test_iterator):
test_cat_inputs = test_cat_inputs.to("cuda", non_blocking=True)
test_num_inputs = test_num_inputs.to("cuda", non_blocking=True)
+test_padding_mask = test_padding_mask.to("cuda", non_blocking=True)
test_targets = test_targets.to("cuda", non_blocking=True)
-outputs = model(test_cat_inputs, test_num_inputs)
+outputs = model(test_cat_inputs, test_num_inputs, test_padding_mask)
test_auroc.update(outputs, test_targets.long())
print(f"\r {test_batch_id}/{len(credit_dataset.test_uniq_client_ids)//batch_size} {test_auroc.compute().item():.5f}", end = " "*20)
if not writer is None:
@@ -281,8 +297,8 @@ batch_size = 30000
datasets_per_epoch = 1
num_workers = 10
-comment = sys.argv[1]
-logs_dir = f'runs/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
+logs_dir = f'logs/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
writer = SummaryWriter(logs_dir)
checkpoints_dir = f'checkpoints/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
script_snapshot_path = Path(logs_dir + Path(sys.argv[0]).name)
@@ -297,8 +313,7 @@ credit_train_dataset = CreditProductsDataset(
features_path="/wd/data/train_data/",
targets_path="/wd/data/train_target.csv",
train_uniq_client_ids_path=f"/wd/fold3_train_ids.csv",
-test_uniq_client_ids_path=f"/wd/fold3_test_ids.csv",
-dropout_rate=features_dropout_rate
+test_uniq_client_ids_path=f"/wd/fold3_test_ids.csv"
)
print(f"Dataset preparation time: {datetime.now() - start_prep_time}")
@@ -307,7 +322,8 @@ encoder = Encoder(
num_columns=credit_train_dataset.num_columns,
cat_features_max_id=credit_train_dataset.cat_features.max(),
category_feature_dim=category_feature_dim,
-out_dim=h_dim
+out_dim=h_dim,
+features_dropout_rate=features_dropout_rate
)
classifier = BertClassifier(
@@ -358,13 +374,14 @@ try:
test_auroc=test_auroc,
writer=writer
)
-for batch_id, (cat_inputs, num_inputs, targets) in enumerate(dataloader):
+for batch_id, (cat_inputs, num_inputs, padding_mask, targets) in enumerate(dataloader):
model.train()
optimizer.train()
optimizer.zero_grad()
outputs = model(
cat_inputs[0].to("cuda"),
-num_inputs[0].to("cuda")
+num_inputs[0].to("cuda"),
+padding_mask[0].to("cuda")
)
loss = criterion(outputs, targets[0].to("cuda"))
loss.backward()
