@@ -19,7 +19,7 @@ import torch.nn.functional as F
 import functools


-def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, rocauc, сheсkpoints_dir):
+def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, rocauc, checkpoints_dir):
     checkpoint = {
         'encoder': {
             'state_dict': encoder.state_dict(),
@@ -38,7 +38,7 @@ def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, roca
         'train_uniq_client_ids_path': credit_dataset.train_uniq_client_ids_path,
         'test_uniq_client_ids_path': credit_dataset.test_uniq_client_ids_path
     }
-    path = сheсkpoints_dir + f"epoch_{epoch}_{rocauc:.4f}.pth"
+    path = checkpoints_dir + f"epoch_{epoch}_{rocauc:.4f}.pth"
     # if torch.distributed.get_rank() == 0:
     torch.save(checkpoint, path)
     print(f"\nCheckpoint saved to {path}")
@@ -46,7 +46,7 @@ def save_checkpoint(credit_dataset, encoder, model, optimizer, epoch, loss, roca
 ######################################## Dataset #########################################################

 class CreditProductsDataset:
-    def __init__(self,
+    def __init__(self,
                  features_path, targets_path, train_test_split_ratio=0.9,
                  train_uniq_client_ids_path=None, test_uniq_client_ids_path=None,
                  dropout_rate=0.0
@@ -55,12 +55,12 @@ class CreditProductsDataset:
         if Path(self.train_uniq_client_ids_path).exists():
             self.train_uniq_client_ids = pd.read_csv(self.train_uniq_client_ids_path).iloc[:,0].values
             print("Loaded", self.train_uniq_client_ids_path)
-        else:
+        else:
             raise Exception(f"No {self.train_uniq_client_ids_path}")
         if Path(self.test_uniq_client_ids_path).exists():
             self.test_uniq_client_ids = pd.read_csv(self.test_uniq_client_ids_path).iloc[:,0].values
             print("Loaded", self.test_uniq_client_ids_path)
-        else:
+        else:
             raise Exception(f"No {self.test_uniq_client_ids_path}")
         assert(len(np.intersect1d(self.train_uniq_client_ids, self.test_uniq_client_ids)) == 0), "Train contains test examples."
         self.features_df = pd.read_parquet(features_path)
@@ -81,7 +81,7 @@ class CreditProductsDataset:
             'pre_loans5', 'pre_loans6090', 'pre_loans530', 'pre_loans90', 'pre_loans3060'
         ]
         self.num_columns = []

         # make unified category index for embeddings for all columns. zero index embedding for padding will be zeroed during training
         self.cat_cardinalities = self.features_df.max(axis=0)[self.cat_columns] + 1
         self.cat_cardinalities_integral = self.cat_cardinalities.cumsum()
@@ -104,8 +104,8 @@ class CreditProductsDataset:

     def get_train_batch(self, batch_size=4):
         sampled_ids = np.random.choice(self.train_uniq_client_ids, batch_size, replace=False) # think about replace=True
-        cat_features_batch = self.cat_features[sampled_ids]
-        num_features_batch = self.num_features[sampled_ids]
+        cat_features_batch = self.cat_features[sampled_ids]
+        num_features_batch = self.num_features[sampled_ids]
         cat_features_batch *= torch.empty_like(cat_features_batch).bernoulli_(1-self.dropout_rate) # arg is keep_probability
         num_features_batch *= torch.empty_like(num_features_batch).bernoulli_(1-self.dropout_rate)
         targets_batch = self.targets[sampled_ids]
@@ -114,12 +114,12 @@ class CreditProductsDataset:
     def get_test_batch_iterator(self, batch_size=4):
         for i in range(0, len(self.test_uniq_client_ids), batch_size):
             ids = self.test_uniq_client_ids[i:i+batch_size]
-            cat_features_batch = self.cat_features[ids]
-            num_features_batch = self.num_features[ids]
+            cat_features_batch = self.cat_features[ids]
+            num_features_batch = self.num_features[ids]
             targets_batch = self.targets[ids]
             yield cat_features_batch, num_features_batch, targets_batch

-# for parallel data selection
+# for parallel data selection
 class WrapperDataset(Dataset):
     def __init__(self, credit_dataset, batch_size, datasets_per_epoch=1):
         self.credit_dataset = credit_dataset
@@ -146,11 +146,11 @@ class Encoder(nn.Module):
         self.num_scales = nn.Parameter(torch.randn(1, len(self.num_columns)))
         self.num_shifts = nn.Parameter(torch.randn(1, len(self.num_columns)))
         self.proj = nn.Linear(self.total_h_dim, self.out_dim, bias=False)

     def forward(self, cat_features_batch, num_features_batch):
         cat_embed_tensor = self.cat_embeds(cat_features_batch.type(torch.int32))
         cat_embed_tensor = cat_embed_tensor.reshape(cat_features_batch.shape[0], cat_features_batch.shape[1], -1)
-        num_embed_tensor = self.num_scales * num_features_batch + self.num_shifts
+        num_embed_tensor = self.num_scales * num_features_batch + self.num_shifts
         embed_tensor = torch.concat([cat_embed_tensor, num_embed_tensor], dim=-1)
         inputs = self.proj(embed_tensor)
         return inputs
@@ -182,7 +182,7 @@ class DyT(nn.Module):
         self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value)
         self.weight = nn.Parameter(torch.ones(num_features))
         self.bias = nn.Parameter(torch.zeros(num_features))

     def forward(self, x):
         x = torch.tanh(self.alpha * x)
         return x * self.weight + self.bias
@@ -199,8 +199,8 @@ class TransformerLayer(nn.Module):
         self.o_proj = nn.Linear(h_dim, h_dim)
         self.ff1 = nn.Linear(h_dim, 4*h_dim)
         self.ff2 = nn.Linear(4*h_dim, h_dim)
-        self.ln1 = DyT(h_dim)
-        self.ln2 = DyT(h_dim)
+        self.ln1 = DyT(h_dim)
+        self.ln2 = DyT(h_dim)
         self.rope = RoPE(dim=h_dim//self.num_heads, max_seq_len=max_seq_len)

     def split_to_heads(self, x, B, T, H):
@@ -213,7 +213,7 @@ class TransformerLayer(nn.Module):
         q = self.rope(self.split_to_heads(self.q_proj(x), *x.shape))
         k = self.rope(self.split_to_heads(self.k_proj(x), *x.shape))
         v = self.split_to_heads(self.v_proj(x), *x.shape)
-        scores = (q @ k.transpose(1, 2)) * (self.h_dim ** -0.5)
+        scores = (q @ k.transpose(1, 2)) * (self.h_dim ** -0.5)
         attention = nn.functional.softmax(scores, dim=2)
         return self.o_proj(self.gather_heads(attention @ v, *x.shape))
@@ -226,7 +226,7 @@ class BertClassifier(nn.Module):
     def __init__(self, layers_num=1, h_dim=64, class_num=2, max_seq_len=128, num_heads=4, dropout_rate = 0.1):
         super().__init__()
         self.__dict__.update({k:v for k,v in locals().items() if k != 'self'})
-        self.cls_token = nn.Parameter(torch.randn(1,1,h_dim))
+        self.cls_token = nn.Parameter(torch.randn(1,1,h_dim))
         self.max_seq_len = max_seq_len + self.cls_token.shape[1]
         self.layers = nn.ModuleList([TransformerLayer(h_dim=h_dim, num_heads=num_heads, dropout_rate = dropout_rate, max_seq_len=self.max_seq_len) for _ in range(layers_num)])
         self.classifier_head = nn.Sequential(nn.Linear(h_dim, h_dim), nn.GELU(), nn.Linear(h_dim, class_num))
@@ -245,7 +245,7 @@ class Model(nn.Module):
         super().__init__()
         self.encoder = encoder
         self.classifier = classifier

     def forward(self, cat_inputs, num_inputs):
         inputs = self.encoder(cat_inputs, num_inputs)
         return self.classifier(inputs)
@@ -284,17 +284,17 @@ num_workers = 10
 comment = sys.argv[1]
 logs_dir = f'runs/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
 writer = SummaryWriter(logs_dir)
-сheсkpoints_dir = f'checkpoints/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
+checkpoints_dir = f'checkpoints/{datetime.now().date()}_{datetime.now().hour:02d}_{datetime.now().minute:02d}_{datetime.now().second:02d}_{comment}/'
 script_snapshot_path = Path(logs_dir + Path(sys.argv[0]).name)
-Path(сheсkpoints_dir).mkdir(parents=True, exist_ok=True)
+Path(checkpoints_dir).mkdir(parents=True, exist_ok=True)
 print("Logs dir:", logs_dir)
-print("Chekpoints dir:", сheсkpoints_dir)
+print("Chekpoints dir:", checkpoints_dir)
 script_snapshot_path.write_bytes(Path(sys.argv[0]).read_bytes()) # copy this version of script
 script_snapshot_path.chmod(0o400) # with read-only permission

 start_prep_time = datetime.now()
 credit_train_dataset = CreditProductsDataset(
-    features_path="/wd/data/train_data/",
+    features_path="/wd/data/train_data/",
     targets_path="/wd/data/train_target.csv",
     train_uniq_client_ids_path=f"/wd/fold3_train_ids.csv",
     test_uniq_client_ids_path=f"/wd/fold3_test_ids.csv",
@@ -304,17 +304,17 @@ print(f"Dataset preparation time: {datetime.now() - start_prep_time}")

 encoder = Encoder(
     cat_columns=credit_train_dataset.cat_columns,
-    num_columns=credit_train_dataset.num_columns,
+    num_columns=credit_train_dataset.num_columns,
     cat_features_max_id=credit_train_dataset.cat_features.max(),
-    category_feature_dim=category_feature_dim,
+    category_feature_dim=category_feature_dim,
     out_dim=h_dim
 )

 classifier = BertClassifier(
-    layers_num=layers_num,
+    layers_num=layers_num,
     num_heads=num_heads,
-    h_dim=h_dim,
-    class_num=class_num,
+    h_dim=h_dim,
+    class_num=class_num,
     max_seq_len=credit_train_dataset.max_user_history,
     dropout_rate = model_dropout_date
 )
@@ -328,7 +328,7 @@ optimizer = schedulefree.AdamWScheduleFree(model.parameters())
 # class weighting is important
 positive_counts = credit_train_dataset.targets_df.loc[credit_train_dataset.train_uniq_client_ids].values.sum()
 negative_counts = len(credit_train_dataset.targets_df.loc[credit_train_dataset.train_uniq_client_ids]) - positive_counts
-pos_weight = negative_counts / (positive_counts + 1e-15)
+pos_weight = negative_counts / (positive_counts + 1e-15)
 print(f"Class imbalance: {negative_counts} {positive_counts}. Pos weight: {pos_weight}")

 criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight))
@@ -348,14 +348,14 @@ last_checkpoint_time = start_time
 try:
     for epoch in range(epochs):
         test(
-            start_time=start_time,
-            epoch=epoch,
-            batches_per_epoch=batches_per_epoch,
-            batch_size=batch_size,
-            model=model,
-            optimizer=optimizer,
-            credit_dataset=credit_train_dataset,
-            test_auroc=test_auroc,
+            start_time=start_time,
+            epoch=epoch,
+            batches_per_epoch=batches_per_epoch,
+            batch_size=batch_size,
+            model=model,
+            optimizer=optimizer,
+            credit_dataset=credit_train_dataset,
+            test_auroc=test_auroc,
             writer=writer
         )
         for batch_id, (cat_inputs, num_inputs, targets) in enumerate(dataloader):
@@ -363,7 +363,7 @@ try:
             optimizer.train()
             optimizer.zero_grad()
             outputs = model(
-                cat_inputs[0].to("cuda"),
+                cat_inputs[0].to("cuda"),
                 num_inputs[0].to("cuda")
             )
             loss = criterion(outputs, targets[0].to("cuda"))
@@ -375,53 +375,53 @@ try:
                 last_display_time = current_time
                 writer.add_scalar(f'Loss', loss.item(), epoch*batches_per_epoch+batch_id)
                 print(f"\r {current_time-start_time} {epoch+1}/{epochs} {batch_id}/{batches_per_epoch} loss: {loss.item():.6f} {comment}", end = " "*2)
-            if current_time - last_checkpoint_time > timedelta(hours=8):
+            if current_time - last_checkpoint_time > timedelta(hours=8):
                 last_checkpoint_time = current_time
                 test(
-                    start_time=start_time,
-                    epoch=epoch,
-                    batches_per_epoch=batches_per_epoch,
-                    batch_size=batch_size,
-                    model=model,
-                    optimizer=optimizer,
-                    credit_dataset=credit_train_dataset,
-                    test_auroc=test_auroc,
+                    start_time=start_time,
+                    epoch=epoch,
+                    batches_per_epoch=batches_per_epoch,
+                    batch_size=batch_size,
+                    model=model,
+                    optimizer=optimizer,
+                    credit_dataset=credit_train_dataset,
+                    test_auroc=test_auroc,
                     writer=None
                 )
                 rocauc = test_auroc.compute().item()
                 save_checkpoint(
                     credit_dataset=credit_train_dataset,
-                    encoder = model.module.encoder,
-                    model=model.module.classifier,
-                    optimizer=optimizer,
-                    epoch=epoch,
-                    loss=loss.item(),
-                    rocauc=rocauc,
-                    сheсkpoints_dir=сheсkpoints_dir
+                    encoder = model.module.encoder,
+                    model=model.module.classifier,
+                    optimizer=optimizer,
+                    epoch=epoch,
+                    loss=loss.item(),
+                    rocauc=rocauc,
+                    checkpoints_dir=checkpoints_dir
                 )
 except KeyboardInterrupt:
     print()
-finally:
+finally:
     test(
         start_time=start_time,
-        epoch=epoch+1,
-        batches_per_epoch=batches_per_epoch,
-        batch_size=batch_size,
-        model=model,
-        optimizer=optimizer,
-        credit_dataset=credit_train_dataset,
-        test_auroc=test_auroc,
+        epoch=epoch+1,
+        batches_per_epoch=batches_per_epoch,
+        batch_size=batch_size,
+        model=model,
+        optimizer=optimizer,
+        credit_dataset=credit_train_dataset,
+        test_auroc=test_auroc,
         writer=writer
     )
     rocauc = test_auroc.compute().item()
     save_checkpoint(
         credit_dataset=credit_train_dataset,
-        encoder = model.encoder,
-        model=model.classifier,
-        optimizer=optimizer,
-        epoch=epoch,
-        loss=loss.item(),
-        rocauc=rocauc,
-        сheсkpoints_dir=сheсkpoints_dir
+        encoder = model.encoder,
+        model=model.classifier,
+        optimizer=optimizer,
+        epoch=epoch,
+        loss=loss.item(),
+        rocauc=rocauc,
+        checkpoints_dir=checkpoints_dir
     )
-    writer.close()
+    writer.close()