@@ -2,6 +2,7 @@ import os
 import sys
 # DEVICE_IDX = sys.argv[1]
 # os.environ['CUDA_VISIBLE_DEVICES'] = f"{DEVICE_IDX}"
+comment = sys.argv[1]
 from torch import nn
 import torch
@@ -14,9 +15,10 @@ from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import Dataset, DataLoader
 from pathlib import Path
 import schedulefree
-import einops
 from einops import rearrange, repeat
 import torch.nn.functional as F
+from typing import Optional
 import functools
 def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, rocauc, checkpoints_dir):
@@ -48,8 +50,7 @@ def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, rocauc, checkpoints_dir):
 class CreditProductsDataset:
     def __init__(self,
         features_path, targets_path, train_test_split_ratio=0.9,
-        train_uniq_client_ids_path=None, test_uniq_client_ids_path=None,
-        dropout_rate=0.0
+        train_uniq_client_ids_path=None, test_uniq_client_ids_path=None
     ):
         self.__dict__.update({k: v for k, v in locals().items() if k != 'self'})
         if Path(self.train_uniq_client_ids_path).exists():
@@ -97,6 +98,8 @@ class CreditProductsDataset:
         self.cat_features = pad_sequence(torch.split(self.cat_features, self.user_seq_lengths.tolist()), batch_first=True) # implicit max seq
         self.num_features = torch.tensor(self.features_df[self.num_columns].values, dtype=torch.float32)
         self.num_features = pad_sequence(torch.split(self.num_features, self.user_seq_lengths.tolist()), batch_first=True)
+        self.padding_mask = torch.ones(len(self.features_df), dtype=torch.bool)
+        self.padding_mask = pad_sequence(torch.split(self.padding_mask, self.user_seq_lengths.tolist()), batch_first=True)
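+        # True for real transactions; pad_sequence fills the padded tail of each client's sequence with False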
         self.targets_df = self.targets_df.set_index('id')
         self.targets_df = self.targets_df.sort_index()
@@ -104,20 +107,22 @@
     def get_train_batch(self, batch_size=4):
         sampled_ids = np.random.choice(self.train_uniq_client_ids, batch_size, replace=False) # think about replace=True
-        cat_features_batch = self.cat_features[sampled_ids]
-        num_features_batch = self.num_features[sampled_ids]
-        cat_features_batch *= torch.empty_like(cat_features_batch).bernoulli_(1 - self.dropout_rate) # arg is keep_probability
-        num_features_batch *= torch.empty_like(num_features_batch).bernoulli_(1 - self.dropout_rate)
-        targets_batch = self.targets[sampled_ids]
-        return cat_features_batch, num_features_batch, targets_batch
+        return (
+            self.cat_features[sampled_ids],
+            self.num_features[sampled_ids],
+            self.padding_mask[sampled_ids],
+            self.targets[sampled_ids]
+        )
     def get_test_batch_iterator(self, batch_size=4):
         for i in range(0, len(self.test_uniq_client_ids), batch_size):
-            ids = self.test_uniq_client_ids[i:i+batch_size]
-            cat_features_batch = self.cat_features[ids]
-            num_features_batch = self.num_features[ids]
-            targets_batch = self.targets[ids]
-            yield cat_features_batch, num_features_batch, targets_batch
+            sampled_ids = self.test_uniq_client_ids[i:i+batch_size]
+            yield (
+                self.cat_features[sampled_ids],
+                self.num_features[sampled_ids],
+                self.padding_mask[sampled_ids],
+                self.targets[sampled_ids]
+            )
 # for parallel data selection
 class WrapperDataset(Dataset):
@@ -132,15 +137,15 @@ class WrapperDataset(Dataset):
         return self.num_batches
     def __getitem__(self, idx):
-        cat_inputs, num_inputs, targets = self.credit_dataset.get_train_batch(batch_size=self.batch_size)
-        return cat_inputs, num_inputs, targets
+        cat_inputs, num_inputs, padding_mask, targets = self.credit_dataset.get_train_batch(batch_size=self.batch_size)
+        return cat_inputs, num_inputs, padding_mask, targets
 ##################################### Model ###########################################################################################
 class Encoder(nn.Module):
-    def __init__(self, cat_columns, num_columns, cat_features_max_id, category_feature_dim=4, out_dim=64):
+    def __init__(self, cat_columns, num_columns, cat_features_max_id, category_feature_dim=4, out_dim=64, features_dropout_rate=0.0):
         super().__init__()
-        self.__dict__.update({k: v for k, v in locals().items() if k != 'self'})
+        self.__dict__.update({k: v for k, v in locals().items() if k != 'self'}) # all args are added as object variables
         self.total_h_dim = len(self.cat_columns) * category_feature_dim + len(self.num_columns)
         self.cat_embeds = nn.Embedding(cat_features_max_id + 1, self.category_feature_dim, padding_idx=0)
         self.num_scales = nn.Parameter(torch.randn(1, len(self.num_columns)))
@@ -148,10 +153,11 @@ class Encoder(nn.Module):
         self.proj = nn.Linear(self.total_h_dim, self.out_dim, bias=False)
     def forward(self, cat_features_batch, num_features_batch):
-        cat_embed_tensor = self.cat_embeds(cat_features_batch.type(torch.int32))
-        cat_embed_tensor = cat_embed_tensor.reshape(cat_features_batch.shape[0], cat_features_batch.shape[1], -1)
-        num_embed_tensor = self.num_scales * num_features_batch + self.num_shifts
-        embed_tensor = torch.concat([cat_embed_tensor, num_embed_tensor], dim=-1)
+        cat_embed_tensor = self.cat_embeds(cat_features_batch.data.type(torch.int32))
+        cat_embed_tensor = cat_embed_tensor.reshape(cat_features_batch.data.shape[0], cat_features_batch.data.shape[1], -1)
+        num_embed_tensor = self.num_scales * num_features_batch.data + self.num_shifts
+        embed_tensor = torch.concat([cat_embed_tensor.data, num_embed_tensor.data], dim=-1)
+        embed_tensor = F.dropout(embed_tensor, self.features_dropout_rate)
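+        # dropout on the concatenated feature embeddings (replaces the feature dropout previously applied in the dataset)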
         inputs = self.proj(embed_tensor)
         return inputs
@@ -204,24 +210,32 @@ class TransformerLayer(nn.Module):
         self.rope = RoPE(dim=h_dim // self.num_heads, max_seq_len=max_seq_len)
     def split_to_heads(self, x, B, T, H):
-        return rearrange(x, 'b t (n h) -> (b n) t h', b=B, t=T, n=self.num_heads) if self.num_heads > 1 else x
+        if self.num_heads <= 1: return x
+        return rearrange(x, 'b t (n h) -> (b n) t h', b=B, t=T, n=self.num_heads)
     def gather_heads(self, x, B, T, H):
-        return rearrange(x, '(b n) t h -> b t (n h)', b=B, t=T, n=self.num_heads) if self.num_heads > 1 else x
+        if self.num_heads <= 1: return x
+        return rearrange(x, '(b n) t h -> b t (n h)', b=B, t=T, n=self.num_heads)
-    def attention(self, x):
+    def attention(self, x, padding_mask):
+        padding_mask = padding_mask.unsqueeze(-1).expand(*padding_mask.shape + (self.num_heads,))
+        padding_mask = self.split_to_heads(padding_mask, *padding_mask.shape)
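+        # expand the (batch, seq) mask across heads so it lines up with the per-head attention scores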
         q = self.rope(self.split_to_heads(self.q_proj(x), *x.shape))
         k = self.rope(self.split_to_heads(self.k_proj(x), *x.shape))
         v = self.split_to_heads(self.v_proj(x), *x.shape)
         scores = (q @ k.transpose(1, 2)) * (self.h_dim ** -0.5)
+        scores = scores.masked_fill(~padding_mask, -1e9)
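+        # padded positions receive a large negative score before the softmax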
         attention = nn.functional.softmax(scores, dim=2)
         return self.o_proj(self.gather_heads(attention @ v, *x.shape))
-    def forward(self, x):
-        x = x + F.dropout1d(self.attention(self.ln1(x)), p=self.dropout_rate)
+    def forward(self, x, padding_mask):
+        x = x + F.dropout1d(self.attention(self.ln1(x), padding_mask), p=self.dropout_rate)
         x = x + F.dropout1d(self.ff2(F.gelu(self.ff1(self.ln2(x)))), p=self.dropout_rate)
         return x
+def prepend(element, tensor):
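+    # broadcast element across the batch and attach it to the front of the sequence dimension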
+    return torch.cat([element.expand([tensor.shape[0], element.shape[1], tensor.shape[2]]), tensor], dim=1)
 class BertClassifier(nn.Module):
     def __init__(self, layers_num=1, h_dim=64, class_num=2, max_seq_len=128, num_heads=4, dropout_rate=0.1):
         super().__init__()
@@ -232,11 +246,12 @@ class BertClassifier(nn.Module):
         self.classifier_head = nn.Sequential(nn.Linear(h_dim, h_dim), nn.GELU(), nn.Linear(h_dim, class_num))
         self.pos_embeds = nn.Parameter(torch.randn(1, self.max_seq_len, h_dim))
-    def forward(self, x):
-        x = torch.concat([self.cls_token.expand([x.shape[0], self.cls_token.shape[1], self.cls_token.shape[2]]), x], dim=1)
+    def forward(self, x, padding_mask):
+        x = prepend(self.cls_token, x)
+        padding_mask = torch.cat([torch.ones(x.shape[0], 1, dtype=torch.bool, device=x.device), padding_mask], dim=1)
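+        # the prepended CLS position is always marked valid in the padding mask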
         x = x + self.pos_embeds[:, :x.shape[1], :]
         for l in self.layers:
-            x = l(x)
+            x = l(x, padding_mask)
         x = self.classifier_head(x[:, 0, :])
         return x[:, :] if self.class_num > 1 else x[:, 0]
@@ -246,20 +261,21 @@ class Model(nn.Module):
         self.encoder = encoder
         self.classifier = classifier
-    def forward(self, cat_inputs, num_inputs):
+    def forward(self, cat_inputs, num_inputs, padding_mask):
         inputs = self.encoder(cat_inputs, num_inputs)
-        return self.classifier(inputs)
+        return self.classifier(inputs, padding_mask)
 def test(start_time, epoch, batches_per_epoch, batch_size, model, optimizer, credit_dataset, test_auroc, writer):
     model.eval()
     optimizer.eval()
     with torch.no_grad():
         test_iterator = credit_dataset.get_test_batch_iterator(batch_size=batch_size)
-        for test_batch_id, (test_cat_inputs, test_num_inputs, test_targets) in enumerate(test_iterator):
+        for test_batch_id, (test_cat_inputs, test_num_inputs, test_padding_mask, test_targets) in enumerate(test_iterator):
             test_cat_inputs = test_cat_inputs.to("cuda", non_blocking=True)
             test_num_inputs = test_num_inputs.to("cuda", non_blocking=True)
+            test_padding_mask = test_padding_mask.to("cuda", non_blocking=True)
             test_targets = test_targets.to("cuda", non_blocking=True)
-            outputs = model(test_cat_inputs, test_num_inputs)
+            outputs = model(test_cat_inputs, test_num_inputs, test_padding_mask)
             test_auroc.update(outputs, test_targets.long())
             print(f"\r {test_batch_id}/{len(credit_dataset.test_uniq_client_ids)//batch_size} {test_auroc.compute().item():.5f}", end=" "*20)
     if not writer is None:
@@ -281,8 +297,8 @@ batch_size = 30000
 datasets_per_epoch = 1
 num_workers = 10
-comment = sys.argv[1]
-logs_dir = f'runs/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
+logs_dir = f'logs/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
 writer = SummaryWriter(logs_dir)
 checkpoints_dir = f'checkpoints/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
 script_snapshot_path = Path(logs_dir + Path(sys.argv[0]).name)
@@ -297,8 +313,7 @@ credit_train_dataset = CreditProductsDataset(
     features_path="/wd/data/train_data/",
     targets_path="/wd/data/train_target.csv",
     train_uniq_client_ids_path=f"/wd/fold3_train_ids.csv",
-    test_uniq_client_ids_path=f"/wd/fold3_test_ids.csv",
-    dropout_rate=features_dropout_rate
+    test_uniq_client_ids_path=f"/wd/fold3_test_ids.csv"
 )
 print(f"Dataset preparation time: {datetime.now() - start_prep_time}")
@@ -307,7 +322,8 @@ encoder = Encoder(
     num_columns=credit_train_dataset.num_columns,
     cat_features_max_id=credit_train_dataset.cat_features.max(),
     category_feature_dim=category_feature_dim,
-    out_dim=h_dim
+    out_dim=h_dim,
+    features_dropout_rate=features_dropout_rate
 )
 classifier = BertClassifier(
@@ -358,13 +374,14 @@ try:
             test_auroc=test_auroc,
             writer=writer
         )
-        for batch_id, (cat_inputs, num_inputs, targets) in enumerate(dataloader):
+        for batch_id, (cat_inputs, num_inputs, padding_mask, targets) in enumerate(dataloader):
             model.train()
             optimizer.train()
             optimizer.zero_grad()
             outputs = model(
                 cat_inputs[0].to("cuda"),
-                num_inputs[0].to("cuda")
+                num_inputs[0].to("cuda"),
+                padding_mask[0].to("cuda")
             )
             loss = criterion(outputs, targets[0].to("cuda"))
             loss.backward()