|
|
import os
|
|
|
import sys
|
|
|
#os.environ['CUDA_VISIBLE_DEVICES'] = f"[0,1,2,3,4,5,6,7]" # f"{sys.argv[1]}"
|
|
|
|
|
|
from torch import nn
|
|
|
import torch
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
from matplotlib import pyplot as plt
|
|
|
from torch.utils.tensorboard import SummaryWriter
|
|
|
from datetime import datetime, timedelta
|
|
|
from torchmetrics import AUROC
|
|
|
from torch.nn.utils.rnn import pad_sequence
|
|
|
from torch.utils.data import Dataset, DataLoader
|
|
|
from pathlib import Path
|
|
|
import schedulefree
|
|
|
from einops import rearrange, repeat
|
|
|
import torch.nn.functional as F
|
|
|
from torch import autograd
|
|
|
import optical_matrix_multiplication as omm
|
|
|
|
|
|
# NOTE(review): `step` appears unused in the rest of this file — possibly leftover.
step = 1

# Physical pixel pitch (meters) of the optical modulator/sensor; feeds the
# omm.Config geometry in BertClassifier below.
pixel_size: float = 3.6e-6

# Prefer GPU when available; used for all modules and batches below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Current device - ', device)
|
|
|
|
|
|
def new_formula(sim, tensor_1, tensor_2):
    """Compute a signed (batched) matrix product through `sim`, a product
    operator that only accepts non-negative inputs scaled to [0, 1]
    (e.g. an optical matrix-multiplication simulator).

    A·B is decomposed into four non-negative partial products
    A⁺B⁺ − A⁺B⁻ − A⁻B⁺ + A⁻B⁻ where X⁺ = max(X, 0) and X⁻ = max(−X, 0);
    each factor is normalized by its global max before calling `sim`, and the
    result is rescaled back. Partial products whose factors are entirely zero
    are skipped and replaced by zeros.

    Args:
        sim: callable taking two 4-D tensors (batch, ch, rows, cols) and
            returning their matrix product over the last two dims.
        tensor_1, tensor_2: 3-D or 4-D tensors; 3-D inputs receive a leading
            batch dimension of 1.

    Returns:
        The product with the leading batch dimension stripped.
        NOTE(review): for genuinely 4-D inputs with batch > 1, only the first
        batch element is returned — confirm callers only pass 3-D tensors
        (TransformerLayer does).
    """
    tensor_1 = tensor_1[None, :, :, :] if len(tensor_1.shape) < 4 else tensor_1
    tensor_2 = tensor_2[None, :, :, :] if len(tensor_2.shape) < 4 else tensor_2
    device = tensor_1.device

    # Signed decomposition: A = A⁺ − A⁻, B = B⁺ − B⁻.
    A_pos = torch.clamp(tensor_1, min=0)   # A⁺ = max(A, 0)
    A_neg = torch.clamp(-tensor_1, min=0)  # A⁻ = max(-A, 0)
    B_pos = torch.clamp(tensor_2, min=0)   # B⁺ = max(B, 0)
    B_neg = torch.clamp(-tensor_2, min=0)  # B⁻ = max(-B, 0)

    # Per-part maxima; zero when the corresponding part has no values.
    max_A_pos = torch.max(A_pos)
    max_A_neg = torch.max(A_neg)
    max_B_pos = torch.max(B_pos)
    max_B_neg = torch.max(B_neg)

    # Result-shaped zeros, built once directly on the right device
    # (previously: zeros_like(empty(...)) on CPU + a clone().to(device) per
    # branch). The tensor is only read below, so sharing one instance is safe.
    zeros = torch.zeros(
        tensor_1.shape[0], tensor_1.shape[1], tensor_1.shape[2], tensor_2.shape[3],
        device=device, dtype=tensor_1.dtype)

    if max_A_pos > 0 and max_B_pos > 0:
        t1 = sim(A_pos / max_A_pos, B_pos / max_B_pos) * max_A_pos * max_B_pos
    else:
        t1 = zeros

    if max_A_pos > 0 and max_B_neg > 0:
        t2 = sim(A_pos / max_A_pos, B_neg / max_B_neg) * max_A_pos * max_B_neg
    else:
        t2 = zeros

    if max_A_neg > 0 and max_B_pos > 0:
        t3 = sim(A_neg / max_A_neg, B_pos / max_B_pos) * max_A_neg * max_B_pos
    else:
        t3 = zeros

    if max_A_neg > 0 and max_B_neg > 0:
        t4 = sim(A_neg / max_A_neg, B_neg / max_B_neg) * max_A_neg * max_B_neg
    else:
        t4 = zeros

    # A·B = A⁺B⁺ − A⁺B⁻ − A⁻B⁺ + A⁻B⁻; drop the synthetic batch dim.
    return (t1 - t2 - t3 + t4)[0, :, :, :]
|
|
|
|
|
|
# Run tag used in log/checkpoint directory names (was sys.argv[2]).
comment = Path(__file__).stem # sys.argv[2]
# Set to a .pth path to resume encoder/model weights (see load_checkpoint).
checkpoint_file = None
# Timestamped per-run directories for tensorboard logs and checkpoints.
logs_dir = f'/wd/finbert_results/runs/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
# NOTE(review): `сhekpoints_dir` starts with a CYRILLIC 'с' (and "chekpoints"
# is misspelled) — keep the exact spelling when referencing this name.
сhekpoints_dir = f'/wd/finbert_results/сhekpoints/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
Path(сhekpoints_dir).mkdir(parents=True, exist_ok=True)
print("Logs dir:", logs_dir)
print("Chekpoints dir:", сhekpoints_dir)
writer = SummaryWriter(logs_dir)
# Snapshot the running script into the log dir for reproducibility.
Path(logs_dir + "bert_training.py").write_bytes(Path(sys.argv[0]).read_bytes()) # copy this version of script
|
|
|
|
|
|
def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, rocauc, сhekpoints_dir="checkpoints/"):
    """Serialize full training state to `<сhekpoints_dir>/epoch_<epoch>_<loss>.pth`.

    Stores encoder/model weights together with their public (non-underscore)
    attributes, the optimizer state, the loss/ROC-AUC at save time, and the
    paths of the train/test client-id split files.
    """
    def public_attrs(module):
        # Public configuration attributes of an nn.Module, excluding the
        # `training` flag and all underscore-prefixed internals.
        return {name: value for name, value in module.__dict__.items()
                if not name.startswith('_') and name != 'training'}

    checkpoint = {
        'encoder': dict(state_dict=encoder.state_dict(), **public_attrs(encoder)),
        'model': dict(state_dict=model.state_dict(), **public_attrs(model)),
        'epoch': epoch,
        'optimizer': {'state_dict': optimizer.state_dict()},
        'loss': loss,
        'rocauc': rocauc,
        'train_uniq_client_ids_path': credit_dataset.train_uniq_client_ids_path,
        'test_uniq_client_ids_path': credit_dataset.test_uniq_client_ids_path,
    }

    path = сhekpoints_dir + f"epoch_{epoch}_{loss:.5f}.pth"
    torch.save(checkpoint, path)
    print(f"\nCheckpoint saved to {path}")
|
|
|
|
|
|
def load_checkpoint(checkpoint_file):
    """Restore encoder/model weights from `checkpoint_file`.

    Loads into the module-level `encoder` and `model` instances. Best-effort:
    a missing file is reported instead of raising, so a fresh run proceeds
    from random initialization.
    """
    if os.path.exists(checkpoint_file):
        # map_location lets CPU-only hosts load checkpoints saved from GPU.
        checkpoint = torch.load(checkpoint_file, map_location=device)
        # NOTE(review): optimizer state restore is commented out — presumably
        # deliberate (schedule-free optimizer restarts cleanly); confirm.
        #optimizer.load_state_dict(checkpoint['optimizer'])
        encoder.load_state_dict(checkpoint['encoder']['state_dict'])
        model.load_state_dict(checkpoint['model']['state_dict'])
        print(f"Checkpoint loaded from {checkpoint_file}")
    else:
        # Previously this silently did nothing; make the skip visible.
        print(f"Checkpoint file not found, training from scratch: {checkpoint_file}")
|
|
|
|
|
|
class CreditProductsDataset:
    """Per-client credit-history dataset kept fully in memory.

    Reads parquet features (one row per client per product, ordered by `rn`)
    and a CSV of binary targets, remaps all categorical columns into a single
    shared embedding index space (index 0 reserved for padding), and packs
    each client's history into padded tensors:

      cat_features: (n_clients, max_history, n_cat_cols) int16
      num_features: (n_clients, max_history, n_num_cols) float32
      targets:      (n_clients,) float32

    The train/test split is read from two precomputed client-id CSV files.
    """
    def __init__(self,
        features_path, targets_path, train_test_split_ratio=0.9,
        train_uniq_client_ids_path=None, test_uniq_client_ids_path=None
    ):
        # NOTE(review): train_test_split_ratio is currently unused — the
        # split comes solely from the two id files below; confirm intended.
        self.train_uniq_client_ids_path = train_uniq_client_ids_path
        self.test_uniq_client_ids_path = test_uniq_client_ids_path
        # Both split files are required; fail fast if either is missing.
        if Path(self.train_uniq_client_ids_path).exists():
            self.train_uniq_client_ids = pd.read_csv(self.train_uniq_client_ids_path).iloc[:,0].values
            print("Loaded", self.train_uniq_client_ids_path)
        else:
            raise Exception(f"No {self.train_uniq_client_ids_path}")
        if Path(self.test_uniq_client_ids_path).exists():
            self.test_uniq_client_ids = pd.read_csv(self.test_uniq_client_ids_path).iloc[:,0].values
            print("Loaded", self.test_uniq_client_ids_path)
        else:
            raise Exception(f"No {self.test_uniq_client_ids_path}")
        self.features_df = pd.read_parquet(features_path)
        self.targets_df = pd.read_csv(targets_path)
        self.uniq_client_ids = self.features_df.id.unique()
        # Longest per-client sequence; becomes the model's max_seq_len.
        self.max_user_history = self.features_df.rn.max()
        self.id_columns = ['id', 'rn']
        self.cat_columns = [
            'pre_since_opened', 'pre_since_confirmed', 'pre_pterm', 'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
            'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue',
            'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060',
            'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit',
            'is_zero_maxover2limit', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2', 'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
            'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12', 'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
            'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20', 'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24',
            'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag',
            'fclose_flag'
        ]
        # Everything that is neither an id nor categorical is numeric.
        self.num_columns = list(set(self.features_df.columns).difference(set(self.id_columns)).difference(self.cat_columns))
        # make unified category index for embeddings for all columns. zero index embedding for padding will be zeroed during training
        # Per-column category counts, assuming 0-based category values.
        self.cat_cardinalities = self.features_df.max(axis=0)[self.cat_columns] + 1
        self.cat_cardinalities_integral = self.cat_cardinalities.cumsum()
        # NOTE(review): each column i is shifted by the *inclusive* cumsum
        # (its own cardinality included), while column 0 is not shifted at
        # all. Adjacent columns' index ranges can therefore overlap whenever
        # card(col_i) > card(col_{i+1}), i.e. categories of different columns
        # may share an embedding row. Confirm whether exclusive prefix sums
        # were intended.
        self.features_df[self.cat_columns[1:]] = self.features_df[self.cat_columns[1:]] + self.cat_cardinalities_integral[1:]
        self.features_df[self.cat_columns] = self.features_df[self.cat_columns] + 1 # zero embedding is for padding

        # Order rows by (client id, event number) so each client's history is
        # a contiguous, chronologically ordered run of rows.
        self.features_df = self.features_df.sort_values(self.id_columns, ascending=[True, True])
        self.features_df = self.features_df.set_index('id')
        self.targets_df = self.targets_df.set_index('id')
        self.targets_df = self.targets_df.sort_index()

        # Events per client, in the same sorted-id order as the rows above.
        self.user_seq_lengths = self.features_df.index.value_counts().sort_index()

        # NOTE(review): int16 assumes the unified category index stays
        # below 32768 — confirm against the actual cardinality sum.
        self.cat_features = torch.tensor(self.features_df[self.cat_columns].values, dtype=torch.int16)
        self.cat_features = pad_sequence(torch.split(self.cat_features, self.user_seq_lengths.tolist()), batch_first=True) # implicit max seq
        self.num_features = torch.tensor(self.features_df[self.num_columns].values, dtype=torch.float32)
        self.num_features = pad_sequence(torch.split(self.num_features, self.user_seq_lengths.tolist()), batch_first=True)
        self.targets = torch.tensor(self.targets_df.flag.values).type(torch.float32)

    def get_batch(self, batch_size=4):
        """Sample a random training batch with multiplicative feature dropout
        (each feature value is independently zeroed with probability 0.1).

        NOTE(review): tensors are indexed directly by client id, which
        assumes ids form the contiguous range 0..n_clients-1 matching the
        sorted row order above — confirm.
        """
        sampled_ids = np.random.choice(self.train_uniq_client_ids, batch_size, replace=False) # think about replace=True
        cat_features_batch = self.cat_features[sampled_ids] * torch.empty_like(self.cat_features[sampled_ids]).bernoulli_(0.9) # dropout features with prob 0.1
        num_features_batch = self.num_features[sampled_ids] * torch.empty_like(self.num_features[sampled_ids]).bernoulli_(0.9) # dropout features with prob 0.1
        targets_batch = self.targets[sampled_ids]
        return cat_features_batch, num_features_batch, targets_batch

    def get_test_batch_iterator(self, batch_size=4):
        """Yield sequential (cat, num, target) batches over the test ids —
        deterministic order, no feature dropout."""
        for i in range(0, len(self.test_uniq_client_ids), batch_size):
            ids = self.test_uniq_client_ids[i:i+batch_size]
            cat_features_batch = self.cat_features[ids]
            num_features_batch = self.num_features[ids]
            targets_batch = self.targets[ids]
            yield cat_features_batch, num_features_batch, targets_batch
|
|
|
|
|
|
class Encoder(nn.Module):
    """Turn raw per-event features into fixed-width token embeddings.

    Every categorical id is looked up in one shared embedding table (index 0
    is padding and stays zero); numeric features get a learned per-column
    affine transform. The concatenation is linearly projected to `out_dim`.
    """

    def __init__(self, cat_columns, num_columns, cat_features_max_id, category_feature_dim=4, out_dim=64):
        super().__init__()
        self.cat_columns = cat_columns
        self.num_columns = num_columns
        self.cat_features_max_id = cat_features_max_id
        self.category_feature_dim = category_feature_dim
        self.out_dim = out_dim
        # Width before projection: one embedding per categorical column plus
        # all numeric columns.
        self.total_h_dim = category_feature_dim * len(cat_columns) + len(num_columns)
        # Single unified table; index 0 reserved for padding (kept at zero).
        self.cat_embeds = nn.Embedding(cat_features_max_id + 1, category_feature_dim, padding_idx=0)
        # Learned per-column scale/shift normalization for numeric features.
        self.num_scales = nn.Parameter(torch.randn(1, len(num_columns)))
        self.num_shifts = nn.Parameter(torch.randn(1, len(num_columns)))
        self.proj = nn.Linear(self.total_h_dim, out_dim, bias=False)

    @property
    def device(self):
        """Device of this module's parameters."""
        return next(self.parameters()).device

    def forward(self, cat_features_batch, num_features_batch, targets_batch):
        """Encode a batch.

        Returns:
            inputs: (batch, seq, out_dim) projected token embeddings.
            targets: targets_batch moved to the module's device.
        """
        cat_features_batch = cat_features_batch.to(self.device)
        num_features_batch = num_features_batch.to(self.device)

        batch, seq_len = cat_features_batch.shape[0], cat_features_batch.shape[1]
        # (B, T, n_cat, emb) -> (B, T, n_cat*emb); int32 cast because the
        # features are stored as int16, which nn.Embedding does not accept.
        cat_part = self.cat_embeds(cat_features_batch.type(torch.int32))
        cat_part = cat_part.reshape(batch, seq_len, -1)
        num_part = self.num_scales * num_features_batch + self.num_shifts

        inputs = self.proj(torch.concat([cat_part, num_part], dim=-1))
        targets = targets_batch.to(self.device)
        return inputs, targets
|
|
|
|
|
|
# RoFormer: Enhanced Transformer with Rotary Position Embedding https://arxiv.org/abs/2104.09864
|
|
|
class RoPE(nn.Module):
    """Rotary position embedding (RoFormer, https://arxiv.org/abs/2104.09864).

    Precomputes an angle table of shape (max_seq_len, dim) and rotates the
    input in feature-pair space: out = x*cos(θ) + rotate_half(x)*sin(θ).
    """

    def __init__(self, dim, max_seq_len=512):
        super().__init__()
        # Frequencies 10000^(-2i/dim) for i = 0 .. dim/2 - 1.
        exponents = torch.arange(0, dim, 2).float() / dim
        inv_freq = 1.0 / torch.pow(10000, exponents)
        positions = torch.arange(max_seq_len).type_as(inv_freq)
        # Outer product position x frequency, duplicated to cover both halves.
        angles = torch.outer(positions, inv_freq)
        self.register_buffer('emb', torch.cat([angles, angles], dim=-1))

    def rotate_half(self, x):
        """Map (x1, x2) -> (-x2, x1) over the two halves of the last dim."""
        first, second = x.chunk(2, dim=-1)
        return torch.cat((-second, first), dim=-1)

    def forward(self, x, offset=0):
        """Rotate `x` (batch, seq, dim) by the angles at positions
        offset .. offset+seq-1."""
        length = x.size(1)
        table = self.emb[offset:offset + length].view(1, length, -1)
        return (x * table.cos()) + (self.rotate_half(x) * table.sin())
|
|
|
|
|
|
# Transformers without Normalization https://jiachenzhu.github.io/DyT/
|
|
|
class DyT(nn.Module):
    """Dynamic Tanh — a normalization-free replacement for LayerNorm
    (Transformers without Normalization, https://jiachenzhu.github.io/DyT/).

    Computes weight * tanh(alpha * x) + bias with a learned scalar `alpha`
    and per-feature affine parameters.
    """

    def __init__(self, num_features, alpha_init_value=0.5):
        super().__init__()
        self.alpha = nn.Parameter(alpha_init_value * torch.ones(1))
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        squashed = torch.tanh(self.alpha * x)
        return squashed * self.weight + self.bias
|
|
|
|
|
|
# Attention Is All You Need https://arxiv.org/pdf/1706.03762v7
|
|
|
# NeoBERT: A Next-Generation BERT https://arxiv.org/html/2502.19587v1
|
|
|
class TransformerLayer(nn.Module):
    """Pre-norm transformer block whose attention matmuls are delegated to
    external similarity operators (optical matrix multiplication) through
    `new_formula`, which handles signed inputs for the non-negative optics.

    Args:
        h_dim: model width; must be divisible by num_heads.
        sim_scores: operator used for the Q·Kᵀ product.
        sim_output: operator used for the attention·V product.
        num_heads: attention heads; 1 disables head splitting entirely.
        dropout_rate: dropout probability on both residual branches.
        max_seq_len: maximum sequence length for the rotary embedding table.
    """

    def __init__(self, h_dim, sim_scores, sim_output, num_heads=4, dropout_rate = 0.1, max_seq_len = 128):
        super().__init__()
        # Stash all ctor args as attributes. NOTE(review): __dict__.update
        # bypasses nn.Module.__setattr__, so sim_scores/sim_output are NOT
        # registered as submodules here — presumably deliberate, since they
        # are shared across layers and owned by BertClassifier; confirm.
        self.__dict__.update({k:v for k,v in locals().items() if k != 'self'})
        self.q_proj = nn.Linear(h_dim, h_dim)
        self.k_proj = nn.Linear(h_dim, h_dim)
        self.v_proj = nn.Linear(h_dim, h_dim)
        self.o_proj = nn.Linear(h_dim, h_dim)
        self.ff1 = nn.Linear(h_dim, 4*h_dim)
        self.ff2 = nn.Linear(4*h_dim, h_dim)
        # Normalization-free pre-norm layers (DyT).
        self.ln1 = DyT(h_dim)
        self.ln2 = DyT(h_dim)
        self.rope = RoPE(dim=h_dim//self.num_heads, max_seq_len=max_seq_len)
        # Learned scalar gains compensating the scale of the optical outputs.
        self.k1 = nn.Parameter(torch.randn(1))
        self.k2 = nn.Parameter(torch.randn(1))

    def split_to_heads(self, x, B, T, H):
        """(B, T, H) -> (B*heads, T, H/heads); identity when single-headed."""
        if self.num_heads <= 1: return x
        return rearrange(x, 'b t (n h) -> (b n) t h', b=B, t=T, n=self.num_heads)

    def gather_heads(self, x, B, T, H):
        """Inverse of split_to_heads: (B*heads, T, H/heads) -> (B, T, H)."""
        if self.num_heads <= 1: return x
        return rearrange(x, '(b n) t h -> b t (n h)', b=B, t=T, n=self.num_heads)

    def attention(self, x):
        """Softmax attention with RoPE; both matmuls run via the optical ops."""
        q = self.rope(self.split_to_heads(self.q_proj(x), *x.shape))
        k = self.rope(self.split_to_heads(self.k_proj(x), *x.shape))
        v = self.split_to_heads(self.v_proj(x), *x.shape)
        # NOTE(review): scaling uses the full h_dim, not the per-head dim —
        # confirm this matches the intended 1/sqrt(d_k) scaling.
        scores = self.k1 * new_formula(self.sim_scores, q, k.transpose(1, 2)) * (self.h_dim ** -0.5)
        attention = nn.functional.softmax(scores, dim=2)
        output = self.k2 * new_formula(self.sim_output, attention, v)
        return self.o_proj(self.gather_heads(output, *x.shape))

    def forward(self, x):
        # FIX: F.dropout1d defaults to training=True, so dropout previously
        # stayed active even after model.eval(); gate it on self.training.
        x = x + F.dropout1d(self.attention(self.ln1(x)), p=self.dropout_rate, training=self.training)
        x = x + F.dropout1d(self.ff2(F.gelu(self.ff1(self.ln2(x)))), p=self.dropout_rate, training=self.training)
        return x
|
|
|
|
|
|
# Vision Transformers Need Registers https://arxiv.org/html/2309.16588v2
|
|
|
class BertClassifier(nn.Module):
    """BERT-style classifier over encoded credit-history sequences.

    Prepends a CLS token (plus optional register tokens, see
    "Vision Transformers Need Registers"), adds learned absolute position
    embeddings, runs `layers_num` TransformerLayers, and classifies from the
    CLS output. All layers share one pair of optical-multiplication
    operators: `sim_scores` for Q·Kᵀ and `sim_output` for attention·V.
    """

    def __init__(self, layers_num=1, h_dim=64, class_num=2, max_seq_len=128, num_heads=4, num_reg=0, dropout_rate = 0.1):
        super().__init__()
        # Stash ctor args as attributes (layers_num, h_dim, class_num, ...).
        self.__dict__.update({k:v for k,v in locals().items() if k != 'self'})
        # reg tokens can be added by second dim > 1
        self.cls_token = nn.Parameter(torch.randn(1,1+num_reg,h_dim))
        # Effective sequence length includes the CLS/register tokens.
        self.max_seq_len = max_seq_len + self.cls_token.shape[1]
        print(h_dim, self.max_seq_len)

        head_dim = h_dim // num_heads
        # Longer sequences need a larger propagation distance and an explicit
        # (bigger) lens; these are the only settings that differ between the
        # two regimes — everything else is shared via _make_optical_mul.
        if max_seq_len < 512:
            optics_overrides = dict(distance=0.01)
        else:
            optics_overrides = dict(distance=0.15, lens_size=8192 * 2)
        # Shared across all layers: scores computes Q @ K^T (right matrix is
        # head_dim x seq), output computes attn @ V (right matrix is
        # seq x head_dim).
        self.sim_scores = self._make_optical_mul(rows=head_dim, cols=self.max_seq_len, **optics_overrides)
        self.sim_output = self._make_optical_mul(rows=self.max_seq_len, cols=head_dim, **optics_overrides)

        self.layers = nn.ModuleList([TransformerLayer(h_dim=h_dim, sim_scores=self.sim_scores, sim_output=self.sim_output, num_heads=num_heads, dropout_rate = dropout_rate, max_seq_len=self.max_seq_len) for _ in range(layers_num)])
        self.classifier_head = nn.Sequential(nn.Linear(h_dim, h_dim), nn.GELU(), nn.Linear(h_dim, class_num))
        self.pos_embeds = nn.Parameter(torch.randn(1, self.max_seq_len, h_dim))

    def _make_optical_mul(self, rows, cols, **overrides):
        """Build one optical multiplier for a (rows x cols) right-hand matrix.

        `overrides` carries the sequence-length-dependent settings
        (distance and, for long sequences, lens_size). Previously this
        configuration was copy-pasted four times.
        """
        return omm.OpticalMul(
            omm.Config(
                right_matrix_count_columns=cols,
                right_matrix_count_rows=rows,
                right_matrix_width=pixel_size * cols,
                right_matrix_height=pixel_size * rows,
                min_height_gap=pixel_size,
                right_matrix_split_x=2,
                right_matrix_split_y=2,
                left_matrix_split_x=2,
                left_matrix_split_y=2,
                result_matrix_split=2,
                trainable_cylind_lens=False,
                **overrides,
            )
        )

    def forward(self, x):
        """Classify a batch of sequences.

        Returns (batch, class_num) logits, or (batch,) when class_num == 1.
        """
        # Prepend CLS (+ register) tokens, then add learned absolute positions.
        x = torch.concat([self.cls_token.expand([x.shape[0], self.cls_token.shape[1], self.cls_token.shape[2]]), x], dim=1)
        x = x + self.pos_embeds[:, :x.shape[1], :]
        for l in self.layers:
            x = l(x)
        # Classify from the CLS position only.
        x = self.classifier_head(x[:,0,:])
        return x[:,:] if self.class_num > 1 else x[:,0]
|
|
|
|
|
|
# Build the full in-memory dataset (parquet load + category remap + padding).
start_prep_time = datetime.now()
credit_train_dataset = CreditProductsDataset(
    features_path="/wd/finbert_data/train_data",
    # NOTE(review): the filename contains a space before ".csv" — confirm it
    # matches the actual file on disk.
    targets_path="/wd/finbert_data/train_target .csv",
    train_uniq_client_ids_path="/wd/finbert_data/train_uniq_client_ids.csv",
    test_uniq_client_ids_path="/wd/finbert_data/test_uniq_client_ids.csv",
)
print(f"Dataset preparation time: {datetime.now() - start_prep_time}")
|
|
|
|
|
|
# Model hyperparameters.
h_dim = 64                # transformer width
category_feature_dim = 8  # embedding size per categorical column
layers_num = 2            # number of transformer layers
num_heads = 1             # attention heads (1 = no head splitting)
class_num = 1             # single logit -> binary task via BCEWithLogitsLoss
dropout_rate = 0.1        # residual-branch dropout probability
|
|
|
|
|
|
# Encoder maps raw (categorical, numeric) features to h_dim token embeddings.
encoder = Encoder(
    cat_columns=credit_train_dataset.cat_columns,
    num_columns=credit_train_dataset.num_columns,
    cat_features_max_id=credit_train_dataset.cat_features.max(),
    category_feature_dim=category_feature_dim,
    out_dim=h_dim,
).to(device)

# Classifier consumes the encoder outputs; max_seq_len follows the data.
model = BertClassifier(
    layers_num=layers_num,
    num_heads=num_heads,
    h_dim=h_dim,
    class_num=class_num,
    max_seq_len=credit_train_dataset.max_user_history,
    dropout_rate = dropout_rate
).to(device)

print(f'Parameters model - {sum(p.numel() for p in model.parameters())}, parameters encoder - {sum(p.numel() for p in encoder.parameters())}')
model_description = str(model) + f'\nParameters count - {sum(p.numel() for p in model.parameters())}, parameters encoder - {sum(p.numel() for p in encoder.parameters())}'
writer.add_text('model', model_description, 0)
# Schedule-free AdamW: no LR schedule, but requires explicit
# optimizer.train()/optimizer.eval() calls around training/evaluation.
optimizer = schedulefree.AdamWScheduleFree(list(model.parameters()) + list(encoder.parameters()))
|
|
|
|
|
|
# Optionally resume encoder/model weights from a previous run.
if checkpoint_file is not None:
    load_checkpoint(checkpoint_file)

# Class balance over the training split, used to weight the positive class.
positive_counts = credit_train_dataset.targets_df.loc[credit_train_dataset.train_uniq_client_ids].values.sum()
negative_counts = len(credit_train_dataset.targets_df.loc[credit_train_dataset.train_uniq_client_ids]) - positive_counts
# Epsilon guards against a degenerate split with zero positives.
pos_weight = negative_counts / (positive_counts + 1e-15)
print(f"Class imbalance: {negative_counts} {positive_counts}. Pos weight: {pos_weight}")
# NOTE(review): pos_weight is a 0-dim CPU tensor; on CUDA this relies on
# scalar-tensor promotion — consider torch.tensor(pos_weight, device=device).
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight))

epochs = 50
batch_size = 300
# Optimizer steps per epoch: all clients divided by the batch size.
batches_per_epoch = len(credit_train_dataset.uniq_client_ids) // batch_size
|
|
|
|
|
|
# for parallel data selection
|
|
|
class WrapperDataset(Dataset):
    """Adapter that lets a DataLoader pre-sample whole training batches.

    Each __getitem__ returns one complete random batch drawn from
    `credit_dataset`, so a DataLoader with batch_size=1 and several workers
    overlaps batch construction with training.
    """

    def __init__(self, credit_dataset, encoder, batch_size):
        self.credit_dataset = credit_dataset
        # NOTE(review): encoder is stored but never used here — confirm
        # whether encoding was meant to happen inside the workers.
        self.encoder = encoder
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per epoch (all clients / batch size)."""
        return len(self.credit_dataset.uniq_client_ids) // self.batch_size

    def __getitem__(self, idx):
        # idx is ignored: every item is an independent random sample.
        # FIX: previously this read the module-level `credit_train_dataset`
        # global instead of the dataset this wrapper was constructed with.
        cat_inputs, num_inputs, targets = self.credit_dataset.get_batch(batch_size=self.batch_size)
        return cat_inputs, num_inputs, targets
|
|
|
|
|
|
training_data = WrapperDataset(credit_train_dataset, encoder, batch_size=batch_size)
# batch_size=1 because each dataset item already IS a full batch; workers
# pre-sample batches in parallel and pin them for faster H2D transfer.
dataloader = DataLoader(training_data, batch_size=1, shuffle=False, num_workers=8, pin_memory=True)

# NOTE(review): val_auroc is never used below — possibly leftover.
val_auroc = AUROC(task='binary')
test_auroc = AUROC(task='binary')
|
|
|
|
|
|
def test(epoch):
    """Evaluate over the whole held-out split and log ROC-AUC to tensorboard.

    Relies on module-level globals: model, encoder, optimizer (schedule-free),
    credit_train_dataset, test_auroc, writer, batch_size, start_time, epochs
    and batches_per_epoch.
    """
    model.eval()
    encoder.eval()
    # Schedule-free optimizers must be switched to eval mode as well.
    optimizer.eval()
    # FIX: reset the metric so each call reports this evaluation only —
    # previously predictions accumulated across every epoch's test pass,
    # making the logged ROC-AUC a running mixture of all past evaluations.
    test_auroc.reset()
    with torch.no_grad():
        test_iterator = credit_train_dataset.get_test_batch_iterator(batch_size=batch_size)
        for test_batch_id, (test_cat_inputs, test_num_inputs, test_targets) in enumerate(test_iterator):
            inputs, targets = encoder(test_cat_inputs, test_num_inputs, test_targets)
            outputs = model(inputs)
            test_auroc.update(outputs, targets.long())
            # Inline progress: running batch counter and running ROC-AUC.
            print(f"\r {test_batch_id}/{len(credit_train_dataset.test_uniq_client_ids)//batch_size} {test_auroc.compute().item():.4f}", end = " "*40)

    writer.add_scalar('test_roc_auc', test_auroc.compute().item(), epoch * batches_per_epoch)
    print(f"\r {datetime.now() - start_time} {epoch}/{epochs} Test rocauc: {test_auroc.compute().item():.4f}", end = " "*40)
    print()
|
|
|
|
|
|
start_time = datetime.now()
print("Started at:", start_time)
last_display_time = start_time     # throttles console/tensorboard logging to ~1/sec
last_checkpoint_time = start_time  # periodic checkpointing every 4 hours
for epoch in range(epochs):
    # Evaluate before each epoch (and once more after the loop finishes).
    test(epoch)
    for batch_id, (cat_inputs, num_inputs, targets) in enumerate(dataloader):
        # with autograd.detect_anomaly(True):
        model.train()
        encoder.train()
        optimizer.train()  # schedule-free optimizers track train/eval phase
        # DataLoader batch_size=1 wraps each pre-built batch in an extra
        # leading dim; [0] unwraps it back to (batch, seq, ...).
        inputs, targets = encoder(cat_inputs[0], num_inputs[0], targets[0])
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        current_time = datetime.now()
        if current_time - last_display_time > timedelta(seconds=1):
            # max_norm=1e6 effectively never clips — the call is used only to
            # measure and log the total gradient norm.
            writer.add_scalar('Gradient_Norm/Total', torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1e6).item(), epoch*batches_per_epoch+batch_id)
        optimizer.step()
        optimizer.zero_grad()
        if current_time - last_display_time > timedelta(seconds=1):
            # NOTE(review): modules flip to eval here and back to train at
            # the top of the next iteration — presumably intentional for the
            # schedule-free optimizer's averaged-weights logging; confirm.
            model.eval()
            encoder.eval()
            optimizer.eval()
            last_display_time = current_time
            writer.add_scalar('Loss', loss.item(), epoch*batches_per_epoch+batch_id)
            print(f"\r {current_time-start_time} {epoch+1}/{epochs} {batch_id}/{batches_per_epoch} loss: {loss.item()}", end = " "*40)
        # Periodic safety checkpoint on long runs.
        if current_time - last_checkpoint_time > timedelta(hours=4):
            last_checkpoint_time = current_time
            save_checkpoint(
                credit_dataset=credit_train_dataset,
                encoder = encoder, model=model, optimizer=optimizer, epoch=epoch,
                loss=loss.item(), rocauc=test_auroc.compute().item(), сhekpoints_dir=сhekpoints_dir)
|
|
|
|
|
|
# Final evaluation after the last training epoch.
test(epochs)
print()

# Persist the final model state alongside the last loss / test ROC-AUC.
save_checkpoint(
    credit_dataset=credit_train_dataset,
    encoder = encoder, model=model, optimizer=optimizer, epoch=epochs,
    loss=loss.item(), rocauc=test_auroc.compute().item(), сhekpoints_dir=сhekpoints_dir)

# Flush and close the tensorboard writer.
writer.close()