D2L-75-BERT

Last updated Apr 30, 2022

# Bidirectional Encoder Representations from Transformers (BERT)

2022-04-30

Tags: #BERT #Transformer #DeepLearning

# Motivation

# Building a General-Purpose Language Model

# Combining the Strengths of Two Existing Architectures: ELMo & GPT

# GPT: task-agnostic

# ELMo: Bi-directional

# BERT: Combining the Best of Both Worlds

# Model - Overview

```
Base:
    #blocks = 12, hidden size = 768, #heads = 12, #parameters = 110M
Large:
    #blocks = 24, hidden size = 1024, #heads = 16, #parameters = 340M
```
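
As a quick sanity check on these sizes, the sketch below estimates the parameter count of the Base configuration. The vocabulary size (30522 WordPiece tokens), maximum position (512), segment vocabulary (2), and FFN width (4x the hidden size) are not stated in this post; they are the values from the original BERT paper.

```python
# Rough parameter count for BERT Base (vocab/positions/FFN width assumed
# from the original paper, not from this post).
vocab, hidden, layers, ffn = 30522, 768, 12, 4 * 768

embeddings = (vocab + 512 + 2) * hidden + 2 * hidden     # token/position/segment + LayerNorm
attention   = 4 * (hidden * hidden + hidden)             # Q, K, V and output projections
ffn_params  = hidden * ffn + ffn + ffn * hidden + hidden # two linear layers
layer_norms = 2 * 2 * hidden                             # two LayerNorms per block
per_block   = attention + ffn_params + layer_norms

total = embeddings + layers * per_block
print(f"{total / 1e6:.0f}M")  # ≈ 109M, matching the quoted 110M
```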

# Pre-training Tasks

Figure: BERT's two pre-training tasks (masked language modeling and next sentence prediction).

# Model - Detail

# The Masked Language Model Task

```python
import torch
from torch import nn


class MaskLM(nn.Module):
    """The masked language model task of BERT."""
    def __init__(self, vocab_size, num_hiddens, num_inputs=768, **kwargs):
        super(MaskLM, self).__init__(**kwargs)
        self.mlp = nn.Sequential(
            nn.Linear(num_inputs, num_hiddens),
            nn.ReLU(),
            nn.LayerNorm(num_hiddens),
            nn.Linear(num_hiddens, vocab_size))

    def forward(self, X, pred_positions):
        # Total number of positions to predict per example
        num_pred_positions = pred_positions.shape[1]
        # Flatten all prediction positions into one column
        pred_positions = pred_positions.reshape(-1)
        batch_size = X.shape[0]
        batch_idx = torch.arange(0, batch_size)  # index of each example in the batch
        # Suppose batch_size = 4 and num_pred_positions = 2;
        # then batch_idx becomes tensor([0, 0, 1, 1, 2, 2, 3, 3])
        batch_idx = torch.repeat_interleave(batch_idx, num_pred_positions)
        # Gather the positions of X that need to be predicted
        masked_X = X[batch_idx, pred_positions]
        masked_X = masked_X.reshape((batch_size, num_pred_positions, -1))
        # Predict a vocabulary distribution at each masked position
        mlm_Y_hat = self.mlp(masked_X)
        return mlm_Y_hat
```
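
A minimal shape check (my own sketch, not from the original notes): feed random encodings and a batch of prediction positions through `MaskLM` and confirm that it returns one vocabulary distribution per masked position. The gather via `batch_idx` and `pred_positions` is what lets a single head score only the masked positions instead of the whole sequence.

```python
# Hypothetical sizes, just to illustrate the shapes
mlm = MaskLM(vocab_size=10000, num_hiddens=768)
encoded_X = torch.randn(2, 8, 768)                    # (batch_size, seq_len, num_hiddens)
mlm_positions = torch.tensor([[1, 5, 2], [6, 1, 5]])  # 3 masked positions per example
mlm_Y_hat = mlm(encoded_X, mlm_positions)
print(mlm_Y_hat.shape)  # torch.Size([2, 3, 10000])
```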

# The Next Sentence Prediction Task

```python
class NextSentencePred(nn.Module):
    """The next sentence prediction task of BERT."""

    def __init__(self, num_inputs, num_hiddens, **kwargs):
        super(NextSentencePred, self).__init__(**kwargs)
        self.output = nn.Sequential(
            nn.Linear(num_inputs, num_hiddens),
            nn.Tanh(),
            nn.Linear(num_hiddens, 2))

    def forward(self, X):
        # The shape of X: (batch_size, num_hiddens)
        return self.output(X)
```
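
A quick usage sketch (again mine, with made-up sizes): the NSP head only ever sees one vector per example, namely the encoded "<cls>" representation (`encoded_X[:, 0, :]` in the full model below), and it outputs a pair of logits for is-next vs. not-next.

```python
nsp = NextSentencePred(num_inputs=768, num_hiddens=768)
cls_X = torch.randn(2, 768)   # stands in for encoded_X[:, 0, :]
nsp_Y_hat = nsp(cls_X)
print(nsp_Y_hat.shape)  # torch.Size([2, 2])
```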

# The Complete Model

```python
class BERTModel(nn.Module):
    """The BERT model."""

    def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input,
                 ffn_num_hiddens, num_heads, num_layers, dropout,
                 max_len=1000,
                 # The default of 768 here is a bit misleading:
                 # 768 is the hidden size of BERT base.
                 # These dimensions must match the hidden size used above.
                 key_size=768,
                 query_size=768,
                 value_size=768,
                 mlm_in_features=768,
                 nsp_in_features=768):
        super(BERTModel, self).__init__()
        self.encoder = BERTEncoder(vocab_size,
            num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
            num_heads, num_layers, dropout, max_len=max_len,
            key_size=key_size, query_size=query_size,
            value_size=value_size)
        # The MLM hidden size is 768 here, but it can be chosen differently
        self.mlm = MaskLM(vocab_size, num_hiddens, mlm_in_features)
        self.nsp = NextSentencePred(nsp_in_features, num_hiddens)

    def forward(self, tokens, segments,
                valid_lens=None, pred_positions=None):
        encoded_X = self.encoder(tokens, segments, valid_lens)
        if pred_positions is not None:
            mlm_Y_hat = self.mlm(encoded_X, pred_positions)
        else:
            mlm_Y_hat = None
        # Input to the MLP classifier for next sentence prediction;
        # 0 is the index of the "<cls>" token
        nsp_Y_hat = self.nsp(encoded_X[:, 0, :])
        return encoded_X, mlm_Y_hat, nsp_Y_hat
```
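
Putting the pieces together, here is a hedged end-to-end sketch. The hyperparameters are illustrative toy values (a 2-layer model, not Base or Large), `BERTEncoder` is assumed to be the one defined in the previous D2L note, and the loss is a simplified version of the D2L pre-training objective: the MLM and NSP cross-entropies are simply summed (the padding-mask weighting of the full training loop is omitted).

```python
import torch
from torch import nn

# Toy hyperparameters, chosen only to make the shapes easy to follow
vocab_size, num_hiddens, ffn_num_hiddens, num_heads = 10000, 768, 1024, 4
norm_shape, ffn_num_input, num_layers, dropout = [768], 768, 2, 0.2
net = BERTModel(vocab_size, num_hiddens, norm_shape, ffn_num_input,
                ffn_num_hiddens, num_heads, num_layers, dropout)

tokens = torch.randint(0, vocab_size, (2, 8))        # (batch_size, seq_len)
segments = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1],   # sentence A / sentence B ids
                         [0, 0, 0, 1, 1, 1, 1, 1]])
mlm_positions = torch.tensor([[1, 5, 2], [6, 1, 5]])
encoded_X, mlm_Y_hat, nsp_Y_hat = net(tokens, segments,
                                      pred_positions=mlm_positions)

loss = nn.CrossEntropyLoss()
mlm_labels = torch.randint(0, vocab_size, (2, 3))    # true tokens at masked positions
nsp_labels = torch.tensor([0, 1])                    # is-next / not-next
mlm_l = loss(mlm_Y_hat.reshape(-1, vocab_size), mlm_labels.reshape(-1))
nsp_l = loss(nsp_Y_hat, nsp_labels)
total_l = mlm_l + nsp_l   # the two pre-training losses are summed
```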

# Fine-tune

D2L-77-BERT - Fine-tune


  1. Agnostic: someone who does not know, or believes that it is impossible to know, if a god exists. ↩︎

  2. A Visual Guide to Using BERT for the First Time – Jay Alammar – Visualizing machine learning one concept at a time. ↩︎