编码注意力机制的梳理(自用-分享)

目前主流的大语言模型都是基于Transformer架构的，而Transformer架构的核心是Attention机制，所以了解Attention机制对于理解Transformer架构至关重要。

1.长序列建模中的问题
过去在处理文本等序列数据时，大多采用RNN等模型。虽然RNN模型在部分应用场景下效果不错，但面对长序列数据时存在遗忘问题，导致对长序列建模的效果不理想。因此，Transformer模型于2017年应运而生：它在长序列建模中通过引入Attention机制，解决了RNN模型中的遗忘问题，并大大提高了模型的效果。

2.过去编码器-解码器模型存在的问题
之前的编码器-解码器模型主要还是基于RNN模型：编码器生成整个文本的编码向量，再将这个编码向量输入给解码器，解码器基于编码向量生成输出预测。但是由于存在长距离遗忘（信息丢失）问题，编码向量会丢失长文本前段的信息，从而导致预测效果不好。而注意力机制可以很好地解决这个问题：它可以捕获较长的依赖关系，获得更准确的编码向量表示。

1.无训练权重的简单注意力机制

# 1. Simple attention without trainable weights
import torch

# Each row stands in for one encoded vector: [1, 2, 3] and [4, 5, 6].
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
# w1 and w2 act as importance weights for the two vectors; different values
# correspond to different attention weights.
w1, w2 = 0.5, 0.3
# A plain weighted sum of the vectors is the core idea behind attention.
weighted_sum = w1 * x[0] + w2 * x[1]
print(weighted_sum)

# A (6, 6) matrix stands for 6 tokens, each with an embedding vector of size 6.
x = torch.randn(6, 6)
print(x)
# Raw attention scores are the pairwise dot products between all token
# vectors (x @ x^T): entry [i, j] is the dot product of token i and token j.
attn_weights = x @ x.T
print(attn_weights)
# Softmax over the last dimension turns each row of scores into normalized
# attention weights.
attn_weights = attn_weights.softmax(dim=-1)
print(attn_weights)

2.实现带有可训练权重的自注意力机制

from torch import nn    
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with trainable projections.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the query/key/value projections, and therefore of
            the last dimension of the output.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Three independent linear maps produce queries, keys and values
        # from the same input.
        self.W_query = nn.Linear(d_in, d_out)
        self.W_key = nn.Linear(d_in, d_out)
        self.W_value = nn.Linear(d_in, d_out)

    def forward(self, x):
        """Return the attention-weighted combination of the value projections.

        Args:
            x: Tensor with at least two dimensions whose last dimension is
                ``d_in``; the second-to-last dimension indexes the positions
                that attend to each other.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``d_out``.
        """
        q = self.W_query(x)
        k = self.W_key(x)
        v = self.W_value(x)

        # Raw attention scores: dot product of every query with every key.
        scores = torch.matmul(q, k.transpose(-2, -1))
        # Dividing by sqrt(d_k) keeps the scores from growing with the key
        # dimension, so softmax does not saturate and gradients do not vanish.
        scale = k.shape[-1] ** 0.5
        weights = torch.softmax(scores / scale, dim=-1)
        return torch.matmul(weights, v)

3.因果注意力机制

from torch import nn
import math
import torch.nn.functional as F 
class SelfAttention(nn.Module):
    """Single-head causal (masked) scaled dot-product self-attention.

    Every position may attend only to itself and to earlier positions, so no
    information from future tokens leaks into the output.

    Args:
        d_in: Size of the last dimension of the input.
        context_length: Maximum sequence length the causal mask supports.
        d_out: Size of the query/key/value projections and of the output.
        dropout: Dropout probability applied to the attention weights
            while the module is in training mode.
    """

    def __init__(self, d_in, context_length, d_out, dropout=0.1):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out)
        self.W_key = nn.Linear(d_in, d_out)
        self.W_value = nn.Linear(d_in, d_out)
        self.dropout_p = dropout
        # Ones strictly above the diagonal mark the "future" positions
        # (column > row). Registered as a buffer so it follows the module
        # across devices without being a trainable parameter.
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )

    def forward(self, x, mask=None):
        """Apply causal self-attention to ``x``.

        Args:
            x: Tensor of shape ``(..., seq_len, d_in)`` with
                ``seq_len <= context_length``.
            mask: Optional extra mask broadcastable to
                ``(..., seq_len, seq_len)``. Nonzero/True entries are blocked
                in addition to the causal mask. A query row that ends up
                fully blocked produces NaN after softmax.

        Returns:
            Tensor of shape ``(..., seq_len, d_out)``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``context_length``.
        """
        seq_len = x.shape[-2]
        if seq_len > self.mask.shape[0]:
            raise ValueError(
                f"sequence length {seq_len} exceeds context_length {self.mask.shape[0]}"
            )

        query = self.W_query(x)
        key = self.W_key(x)
        value = self.W_value(x)

        # Scale by sqrt(d_k) so the scores do not grow with the key dimension.
        attention_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(key.shape[-1])

        # Crop the precomputed mask to the actual sequence length so inputs
        # shorter than context_length are handled.
        blocked = self.mask[:seq_len, :seq_len].bool()
        if mask is not None:
            blocked = blocked | mask.bool()
        # -inf scores become exactly 0 after softmax, which prevents future
        # information from leaking in a causal language model.
        attention_scores = attention_scores.masked_fill(blocked, float('-inf'))

        attention_weights = F.softmax(attention_scores, dim=-1)
        # Honor train/eval mode: F.dropout defaults to training=True and would
        # otherwise keep dropping weights during inference.
        attention_weights = F.dropout(
            attention_weights, p=self.dropout_p, training=self.training
        )
        return torch.matmul(attention_weights, value)

4.多头注意力机制实现

通过拼接多个单头注意力机制的模块实现

class MultiHeadAttention(nn.Module):
    """Multi-head attention built by running independent single-head modules.

    Each head receives the same input; the head outputs are concatenated
    along the last dimension.

    NOTE(review): ``CausualAttention`` is not defined in the visible part of
    this file -- presumably a single-head causal attention module taking
    ``(d_in, d_out, context_length, dropout, qkv_bias)``; confirm its name
    and signature.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        single_heads = []
        for _ in range(num_heads):
            single_heads.append(
                CausualAttention(d_in, d_out, context_length, dropout, qkv_bias)
            )
        # ModuleList registers every head as a submodule of this module.
        self.heads = nn.ModuleList(single_heads)

    def forward(self, x):
        """Run every head on ``x`` and concatenate the results on the last dim."""
        head_outputs = [head(x) for head in self.heads]
        return torch.cat(head_outputs, dim=-1)

多头注意力机制的模块实现

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with fused projections.

    Query, key and value inputs are each linearly projected, split into
    ``num_heads`` heads of size ``embed_size // num_heads``, attended per
    head, merged back together and passed through a final linear layer.
    """

    def __init__(self, embed_size, num_heads, dropout=0.1):
        """
        Args:
            embed_size (int): Size of the input embedding dimension; must be
                divisible by ``num_heads``.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability applied to the attention
                weights.
        """
        super().__init__()
        assert embed_size % num_heads == 0, "Embedding size must be divisible by num_heads"
        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads

        # Linear projections for value, key and query. They map the full
        # embedding to the full embedding; the result is split into heads
        # inside forward().
        self.values = nn.Linear(embed_size, embed_size, bias=False)
        self.keys = nn.Linear(embed_size, embed_size, bias=False)
        self.queries = nn.Linear(embed_size, embed_size, bias=False)

        # Final linear layer that mixes the concatenated head outputs.
        self.fc_out = nn.Linear(embed_size, embed_size)

        # Dropout applied to the attention weights.
        self.dropout = nn.Dropout(dropout)

    def forward(self, values, keys, query, mask=None):
        """
        Compute multi-head attention.

        Args:
            values (torch.Tensor): Values, shape [batch_size, value_len, embed_size].
            keys (torch.Tensor): Keys, shape [batch_size, key_len, embed_size].
            query (torch.Tensor): Queries, shape [batch_size, query_len, embed_size].
            mask (torch.Tensor): Optional mask broadcastable to
                [batch_size, num_heads, query_len, key_len], e.g.
                [batch_size, 1, 1, key_len]. Entries equal to 1 are masked
                out (receive zero attention weight).

        Returns:
            torch.Tensor: Output of shape [batch_size, query_len, embed_size].
        """
        batch_size = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Apply the learned Q/K/V projections before splitting into heads.
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(query)

        # Split the embedding into heads and move the head axis forward:
        # [batch_size, num_heads, seq_len, head_dim].
        values = values.reshape(batch_size, value_len, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.reshape(batch_size, key_len, self.num_heads, self.head_dim).transpose(1, 2)
        queries = queries.reshape(batch_size, query_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention; a plain float scale avoids allocating
        # a tensor on every call.
        attn_scores = torch.matmul(queries, keys.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            # -inf scores become exactly 0 after softmax.
            attn_scores = attn_scores.masked_fill(mask == 1, float("-inf"))

        attention = F.softmax(attn_scores, dim=-1)
        x = torch.matmul(self.dropout(attention), values)

        # Merge the heads back: [batch_size, query_len, embed_size].
        x = x.transpose(1, 2).reshape(batch_size, query_len, self.embed_size)

        # Final output projection.
        return self.fc_out(x)

总结：

大模型的核心就是注意力分数矩阵的计算：计算完注意力分数后，再计算出最后的上下文向量。其中，添加因果掩码是为了防止大模型在训练时泄露未来信息，避免模型在学习过程中“作弊”；被掩码的位置会被填充为负无穷，因此经过softmax归一化后这些位置的权重为0，得到的注意力权重分布与仅在未掩码位置上计算softmax的结果一致；引入dropout层主要是为了避免大模型过拟合。