step1 Import the required packages

import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

step2 Load the dataset

from datasets import DatasetDict
ner_datasets = DatasetDict.load_from_disk("ner_data")
ner_datasets

output:
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4637
    })
})

Inspect the tag label classes

label_list = ner_datasets["train"].features["ner_tags"].feature.names
label_list  # 7 classes, ids 0~6
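
The notebook output is not shown above; assuming the data on disk is the People's Daily NER corpus (the split sizes match it), label_list should be the IOB2 tag set sketched below. This is an assumption about the data, so verify it against your own output:

# Expected labels (assumption: People's Daily NER label set; check your own label_list):
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

# Optional: id <-> label lookups for later use
id2label = {i: name for i, name in enumerate(label_list)}
label2id = {name: i for i, name in enumerate(label_list)}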

step3 Dataset preprocessing

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")  # load the tokenizer

Run the tokenizer on train[0], which is already split into words, and convert those words into token ids:

tokenizer(ner_datasets["train"][0]["tokens"], is_split_into_words=True)   # for data that is already split into words, set is_split_into_words=True

output:
{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
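
To double-check that these ids correspond to the original sentence, they can be decoded back (a quick sketch):

# Decode the ids back into text to verify the tokenization
ids = tokenizer(ner_datasets["train"][0]["tokens"], is_split_into_words=True)["input_ids"]
print(tokenizer.decode(ids))  # the original characters wrapped in [CLS] ... [SEP]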

Sometimes a single word is split into several subwords by the tokenization algorithm, and each subword is then converted into its own token id, for example:

res = tokenizer("interesting word")
print(res)


output:
{'input_ids': [101, 10673, 12865, 12921, 8181, 8681, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
# apart from the [CLS] (101) and [SEP] (102) tokens, note that "interesting" was split into four subwords, each mapped to its own token id

So how do we handle this? The tags are assigned per original word, but for the NER task we need the tag of every token, which is where the word_ids() method comes in:

res.word_ids()  # index of each token within the original word list
# word_ids() returns which word each token belongs to
output: [None, 0, 0, 0, 0, 1, None]
# i.e. each entry is the index of the original word the token came from; None means the token has no corresponding word (it is a special token added by the tokenizer)
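
To make the mapping concrete, each token can be printed next to its word index (a small sketch reusing `res` from above):

# Pair each token with the index of the original word it came from
tokens = tokenizer.convert_ids_to_tokens(res["input_ids"])
for token, word_id in zip(tokens, res.word_ids()):
    print(token, "->", word_id)
# [CLS] -> None, the subwords of "interesting" -> 0, "word" -> 1, [SEP] -> None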

So what is this return value actually good for?

Answer: by locating the index of the original "whole word" each token belongs to, we can look up the tag that each token should be assigned.

# use word_ids to map the word-level tags onto tokens
def process_function(examples):
    tokenized_examples = tokenizer(examples["tokens"], max_length=128, truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # label is the list of tags of the original words in this example
        word_ids = tokenized_examples.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # special tokens added by the tokenizer get the ignore index -100
            else:
                label_ids.append(label[word_id])  # the token inherits the tag of the word it belongs to
        labels.append(label_ids)
    tokenized_examples["labels"] = labels
    return tokenized_examples

print(process_function(ner_datasets['train'][:2]))
output:
{'input_ids': [[101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], [101, 6821, 2429, 898, 2255, 988, 3717, 4638, 1300, 4289, 7667, 4507, 1744, 1079, 671, 3837, 4638, 6392, 6369, 2360, 712, 2898, 6392, 6369, 8024, 3146, 702, 2456, 5029, 5408, 5125, 5401, 5445, 2612, 2131, 511, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}
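
As a quick sanity check, the tokens of the first training example can be lined up with the labels produced above (a sketch using only variables defined so far):

# Inspect the token/label alignment for the first training example
processed = process_function(ner_datasets["train"][:1])
tokens = tokenizer.convert_ids_to_tokens(processed["input_ids"][0])
for token, label_id in zip(tokens, processed["labels"][0]):
    print(token, "IGN" if label_id == -100 else label_list[label_id])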

Apply the preprocessing to the whole dataset:

tokenized_datasets = ner_datasets.map(process_function, batched=True)
tokenized_datasets

output:
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4637
    })
})

step4 Create the model

model = AutoModelForTokenClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=len(label_list))

print(model.config.num_labels)
# output: 7
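
Optionally, the label names can be baked into the config when the model is loaded, so that saved checkpoints already carry them (a sketch, not required for training):

# Alternative: pass readable label mappings into the config at load time
model = AutoModelForTokenClassification.from_pretrained(
    "hfl/chinese-macbert-base",
    num_labels=len(label_list),
    id2label={i: name for i, name in enumerate(label_list)},
    label2id={name: i for i, name in enumerate(label_list)},
)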

step5 Create the evaluation function

import numpy as np
seqeval = evaluate.load("seqeval_metric.py")

def eval_metric(pred):
    predictions, labels = pred  # model predictions and the true labels
    predictions = np.argmax(predictions, axis=-1)

    # convert the ids back to the original string labels
    true_predictions = [
        [label_list[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    result = seqeval.compute(predictions=true_predictions, references=true_labels, mode="strict", scheme="IOB2")

    return {
        "f1": result["overall_f1"]
    }
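
seqeval_metric.py is a local copy of the metric script; if it is not available, evaluate.load("seqeval") loads the same metric from the Hub (the seqeval package must be installed). A quick dummy check of the metric:

# Sanity check: identical predictions and references should give F1 = 1.0
dummy = seqeval.compute(
    predictions=[["O", "B-LOC", "I-LOC"]],
    references=[["O", "B-LOC", "I-LOC"]],
    mode="strict",
    scheme="IOB2",
)
print(dummy["overall_f1"])  # 1.0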

step6 Configure the training arguments

args = TrainingArguments(
    output_dir="models_for_ner",
    per_device_train_batch_size=16,   # training batch size
    per_device_eval_batch_size=32,    # evaluation batch size
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1",       # metric used to pick the best model
    load_best_model_at_end=True,
    logging_steps=50,
    num_train_epochs=1                # number of training epochs
)

step7 Create the trainer

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=eval_metric,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

step8 Train the model

trainer.train()
trainer.evaluate(eval_dataset=tokenized_datasets["test"])
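
To keep the fine-tuned weights for later use (the pipeline in step9 could also load them from disk), the model and tokenizer can be saved explicitly; a minimal sketch, the directory name is arbitrary:

# Persist the best checkpoint and the tokenizer for later inference
trainer.save_model("models_for_ner/best")
tokenizer.save_pretrained("models_for_ner/best")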

step9 Model prediction

from transformers import pipeline

# to run inference with a pipeline, id2label must be set on the model config
model.config.id2label = {idx: label for idx, label in enumerate(label_list)}

ner_pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")  # with aggregation_strategy=None you get per-token predictions instead of aggregated entities

res = ner_pipe("小明在北京上班")
print(res)

output:
[{'entity_group': 'PER',
  'score': 0.9070835,
  'word': '小 明',
  'start': 0,
  'end': 2},
 {'entity_group': 'LOC',
  'score': 0.9970835,
  'word': '北 京',
  'start': 3,
  'end': 5}]
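
Because each entity carries start/end character offsets into the input, the entity text can be pulled straight from the original sentence (a small sketch built on the result above):

# Group the recognized entities by type using the character offsets
text = "小明在北京上班"
entities = {}
for ent in ner_pipe(text):
    entities.setdefault(ent["entity_group"], []).append(text[ent["start"]:ent["end"]])
print(entities)  # e.g. {'PER': ['小明'], 'LOC': ['北京']}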