This post focuses mainly on optimizing the training framework:

  • End-to-end ML implementation (training, validation, prediction, evaluation)
  • Easy to adapt to your own dataset
  • Facilitates quick experiments with other BERT-based models (BERT, ALBERT, …)
  • Fast training with limited compute (mixed precision, gradient accumulation, …)
  • Multi-GPU execution
  • Threshold selection for the classification decision (not necessarily 0.5)
  • Freeze the BERT layers and update only the classifier weights, or update all weights
  • Seed setting for reproducible results

Pipeline

Imports

import torch
import torch.nn as nn
import os
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric

Dataset

class CustomDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model='albert-base-v2'):

        self.data = data  # pandas DataFrame
        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)

        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Retrieve sentence1 and sentence2 from the DataFrame by index
        sent1 = str(self.data.loc[index, 'sentence1'])
        sent2 = str(self.data.loc[index, 'sentence2'])

        # Tokenize the sentence pair to get input_ids, attention_mask and token_type_ids
        encoded_pair = self.tokenizer(sent1, sent2,
                                      padding='max_length',   # pad to max_length
                                      truncation=True,        # truncate to max_length
                                      max_length=self.maxlen,
                                      return_tensors='pt')    # return torch.Tensor

        token_ids = encoded_pair['input_ids'].squeeze(0)            # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)      # "0" for padded positions, "1" for other tokens
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # 0 for the first sentence, 1 for the second; all 0 if there is only one sentence

        if self.with_labels:  # True if the dataset has labels
            label = self.data.loc[index, 'label']
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids

Tip: run a quick sanity check first

sample = next(iter(DataLoader(tr_dataset, batch_size=2)))
sample
tr_model = SentencePairClassifier(freeze_bert=True)
tr_model(sample[0], sample[1], sample[2])

This just makes the final dimension handling easier: squeeze, flatten and view all work here, and even reshape is fine (a small sketch follows below).
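
A minimal, self-contained illustration of these operations on a dummy tensor (the shapes below are only for demonstration):

import torch

x = torch.randn(1, 2, 3)        # e.g. a leading batch dimension of size 1
print(x.squeeze(0).shape)       # torch.Size([2, 3])  -- drops the size-1 dimension
print(x.flatten().shape)        # torch.Size([6])     -- collapses everything to 1-D
print(x.view(2, 3).shape)       # torch.Size([2, 3])  -- requires a contiguous tensor
print(x.reshape(3, 2).shape)    # torch.Size([3, 2])  -- copies the data if it has to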

Model definition

class SentencePairClassifier(nn.Module):

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        # Initialize the pretrained BERT-style model
        self.bert_layer = AutoModel.from_pretrained(bert_model)

        # Hidden size of the encoder
        if bert_model == "albert-base-v2":       # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":    # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":   # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased":  # 110M parameters
            hidden_size = 768
        elif bert_model == "roberta-base":       # 125M parameters
            hidden_size = 768

        # Freeze the BERT layers and update only the classification head
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.dropout = nn.Dropout(p=0.1)
        # Classification head
        self.cls_layer = nn.Linear(hidden_size, 1)

    @autocast()  # mixed-precision training
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''

        # Feed the inputs to BERT to obtain contextualized representations
        # cont_reps, pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)
        outputs = self.bert_layer(input_ids, attn_masks, token_type_ids)
        # outputs contains last_hidden_state, pooler_output (and all_hidden_states for the 12 layers if enabled)
        # Here the pooled [CLS] representation is fed to the classifier layer. Alternatives:
        # - mean of last_hidden_state over tokens
        # - (weighted) average of the last four layers of all_hidden_states
        # - last_hidden_state + LSTM
        logits = self.cls_layer(self.dropout(outputs['pooler_output']))

        return logits
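
The comments above list a few alternatives to pooler_output. As a minimal sketch of the mean-pooling variant (my addition, not part of the original pipeline), the attention mask can be used to average only the non-padded token embeddings of last_hidden_state:

# Hypothetical mean-pooling head: averages the non-padded token embeddings
# of last_hidden_state instead of using pooler_output.
def mean_pool(last_hidden_state, attn_masks):
    mask = attn_masks.unsqueeze(-1).float()          # (batch, seq_len, 1)
    summed = (last_hidden_state * mask).sum(dim=1)   # sum over real tokens only
    counts = mask.sum(dim=1).clamp(min=1e-9)         # number of real tokens per sample
    return summed / counts                           # (batch, hidden_size)

# Inside forward() it would replace the pooler_output line, e.g.:
# pooled = mean_pool(outputs['last_hidden_state'], attn_masks)
# logits = self.cls_layer(self.dropout(pooled))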

Fixing the random seed

def set_seed(seed):
    """ Fix the random seeds to make results reproducible
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
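
If you later set num_workers > 0 in the DataLoader, each worker process gets its own random state. A common addition (not part of the original code) is to seed the workers and pass an explicitly seeded generator:

# Hypothetical helper for DataLoader reproducibility when num_workers > 0
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(1)

# e.g. DataLoader(train_set, batch_size=bs, num_workers=4,
#                 worker_init_fn=seed_worker, generator=g)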

Training and evaluation

!mkdir models 	# you can prepend an absolute path here
!mkdir results
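
The ! shell magic above only works in a notebook. A plain-Python equivalent (using the os module that is already imported) is:

os.makedirs('models', exist_ok=True)   # exist_ok avoids an error if the folder already exists
os.makedirs('results', exist_ok=True)
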
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):

    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print frequency
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):

            # Move the batch to CUDA tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

            # Mixed precision to speed up training
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)

                # Computing loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Scales loss. Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate according to the iteration count.
                lr_scheduler.step()
                # Clear the gradients
                opti.zero_grad()

            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print(f"Iteration {it+1}/{nb_iterations} of epoch {ep+1} complete. "
                      f"Loss : {running_loss / print_every}")

                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print(f"Epoch {ep+1} complete! Validation Loss : {val_loss}")

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # keep a copy of the best model
            best_loss = val_loss
            best_ep = ep + 1

    # Save the best model
    path_to_model = f'models/{bert_model}_lr_{lr}_val_loss_{round(best_loss, 5)}_ep_{best_ep}.pt'
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()  # free GPU memory


def evaluate_loss(net, device, criterion, dataloader):
    """
    Compute the mean loss on a dataloader
    """
    net.eval()

    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1

    return mean_loss / count

  1. Note the two techniques used to speed up training: autocast (mixed precision) and gradient accumulation; a distilled, self-contained sketch of the pattern follows below.

  2. During evaluation, pay attention to the tensor dimensions and the dtype of the labels.
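
For reference, here is the core pattern reduced to a self-contained toy example (the tiny linear model and random data are only for illustration; they stand in for the classifier and DataLoader above):

import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = nn.Linear(8, 1).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
scaler = GradScaler(enabled=torch.cuda.is_available())
accumulation_steps = 2

for step in range(8):
    x = torch.randn(4, 8, device=device)
    y = torch.randint(0, 2, (4,), device=device).float()

    with autocast(enabled=torch.cuda.is_available()):
        loss = criterion(model(x).squeeze(-1), y) / accumulation_steps  # scale down the per-step loss

    scaler.scale(loss).backward()           # accumulate scaled gradients

    if (step + 1) % accumulation_steps == 0:
        scaler.step(optimizer)              # unscales grads; skips the step on inf/NaN
        scaler.update()
        optimizer.zero_grad()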

Hyperparameters & starting training

bert_model = "albert-base-v2"  # 'albert-base-v2', 'albert-large-v2'
freeze_bert = False  # whether to freeze BERT
maxlen = 128  # maximum sequence length
bs = 16  # batch size
iters_to_accumulate = 2  # gradient accumulation steps
lr = 2e-5  # learning rate
epochs = 2  # number of training epochs

# Fix the random seed for reproducibility
set_seed(1)  # 2022

# Build the training and validation sets
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model=bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model=bert_model)
# Build the training and validation DataLoaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=0)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=0)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0  # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Takes gradient accumulation into account
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

  1. Note the multi-GPU setup: torch.cuda.device_count() > 1 together with net = nn.DataParallel(net). See the sketch below for loading a checkpoint saved this way.
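
One practical consequence worth knowing (my addition, not from the original post): a state_dict saved from an nn.DataParallel-wrapped model has its keys prefixed with "module.", so loading it into a plain single-GPU model requires stripping that prefix. A minimal sketch:

# Hypothetical helper: load a checkpoint saved from nn.DataParallel into a plain model
def load_checkpoint(model, path, device):
    state_dict = torch.load(path, map_location=device)
    # Strip the "module." prefix that nn.DataParallel adds to parameter names
    state_dict = {k[len("module."):] if k.startswith("module.") else k: v
                  for k, v in state_dict.items()}
    model.load_state_dict(state_dict)
    return model.to(device)

# Usage (path is a placeholder for a checkpoint saved by train_bert):
# model = load_checkpoint(SentencePairClassifier(bert_model), path_to_model, device)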

Testing

def get_probs_from_logits(logits):
    """
    Converts a tensor of logits into an array of probabilities by applying the sigmoid function
    """
    probs = torch.sigmoid(logits.unsqueeze(-1))
    return probs.detach().cpu().numpy()

def test_prediction(net, device, dataloader, with_labels=True, result_file="results/output.txt"):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file
    """
    net.eval()
    w = open(result_file, 'w')
    probs_all = []

    with torch.no_grad():
        if with_labels:
            for seq, attn_masks, token_type_ids, _ in tqdm(dataloader):  # training / validation set (with labels)
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
        else:
            for seq, attn_masks, token_type_ids in tqdm(dataloader):  # test set without labels
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

    w.writelines(str(prob) + '\n' for prob in probs_all)
    w.close()

path_to_model = './model'
# path_to_model = '/content/models/...'  # You can add here your trained model

path_to_output_file = './results/output.txt'

print("Reading test data...")
test_set = CustomDataset(df_test, maxlen, bert_model=bert_model)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=0)

model = SentencePairClassifier(bert_model)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

print()
print("Loading the weights of the model...")
model.load_state_dict(torch.load(path_to_model))
model.to(device)

print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set with_labels=False to get predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))

path_to_output_file = 'results/output.txt'  # file containing the predicted probabilities

labels_test = df_test['label']  # true labels

probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # predicted probabilities
threshold = 0.6  # you can adjust this threshold for your own dataset
preds_test = (probs_test >= threshold).astype('uint8')  # predicted labels using the above fixed threshold

# metric = load_metric("glue", "mrpc")
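
The commented-out load_metric line hints at how to finish the evaluation. A minimal sketch, assuming df_test has a 'label' column as above; the threshold sweep at the end is my own addition, and val_probs / val_labels are hypothetical placeholders for validation-set predictions:

metric = load_metric("glue", "mrpc")
# The GLUE MRPC metric reports accuracy and F1 for binary classification
print(metric.compute(predictions=preds_test, references=labels_test))

# Hypothetical threshold sweep: choose the threshold that maximizes F1 on the *validation* set
# (val_probs / val_labels would come from running test_prediction on the validation loader)
# best_t = max(
#     (t / 100 for t in range(30, 71)),
#     key=lambda t: metric.compute(predictions=(val_probs >= t).astype('uint8'),
#                                  references=val_labels)['f1'])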