step1 Import the required packages

import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

step2 Load the dataset

from datasets import DatasetDict
ner_datasets = DatasetDict.load_from_disk("ner_data")
ner_datasets

output:
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4637
    })
})

Inspect the tag label classes

label_list = ner_datasets["train"].features["ner_tags"].feature.names
label_list  # 7 classes, ids 0~6
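
The notebook output is not shown above; assuming the data on disk is the People's Daily NER corpus (the split sizes match it), label_list should be the IOB2 tag set sketched below. This is an assumption about the data, so verify it against your own output:

# Expected labels (assumption: People's Daily NER label set; check your own label_list):
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

# Optional: id <-> label lookups for later use
id2label = {i: name for i, name in enumerate(label_list)}
label2id = {name: i for i, name in enumerate(label_list)}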

step3 Dataset preprocessing

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")  # load the tokenizer

Run the tokenizer on train[0], which is already split into words, and convert those words into token ids:

tokenizer(ner_datasets["train"][0]["tokens"], is_split_into_words=True)   # for data that is already split into words, set is_split_into_words=True

output:
{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
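
To double-check that these ids correspond to the original sentence, they can be decoded back (a quick sketch):

# Decode the ids back into text to verify the tokenization
ids = tokenizer(ner_datasets["train"][0]["tokens"], is_split_into_words=True)["input_ids"]
print(tokenizer.decode(ids))  # the original characters wrapped in [CLS] ... [SEP]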

Sometimes a single word is split into several subwords by the tokenization algorithm, and each subword is then converted into its own token id, for example:

res = tokenizer("interesting word")
print(res)


output:
{'input_ids': [101, 10673, 12865, 12921, 8181, 8681, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
# apart from the [CLS] (101) and [SEP] (102) tokens, note that "interesting" was split into four subwords, each mapped to its own token id

So how do we handle this? The tags are assigned per original word, but for the NER task we need the tag of every token, which is where the word_ids() method comes in:

res.word_ids()  # index of each token within the original word list
# word_ids() returns which word each token belongs to
output: [None, 0, 0, 0, 0, 1, None]
# i.e. each entry is the index of the original word the token came from; None means the token has no corresponding word (it is a special token added by the tokenizer)
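
To make the mapping concrete, each token can be printed next to its word index (a small sketch reusing `res` from above):

# Pair each token with the index of the original word it came from
tokens = tokenizer.convert_ids_to_tokens(res["input_ids"])
for token, word_id in zip(tokens, res.word_ids()):
    print(token, "->", word_id)
# [CLS] -> None, the subwords of "interesting" -> 0, "word" -> 1, [SEP] -> None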

So what is this return value actually good for?

Answer: by locating the index of the original "whole word" each token belongs to, we can look up the tag that each token should be assigned.

# use word_ids to map the word-level tags onto tokens
def process_function(examples):
    tokenized_examples = tokenizer(examples["tokens"], max_length=128, truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # label is the list of tags of the original words in this example
        word_ids = tokenized_examples.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # special tokens added by the tokenizer get the ignore index -100
            else:
                label_ids.append(label[word_id])  # the token inherits the tag of the word it belongs to
        labels.append(label_ids)
    tokenized_examples["labels"] = labels
    return tokenized_examples

print(process_function(ner_datasets['train'][:2]))
output:
{'input_ids': [[101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], [101, 6821, 2429, 898, 2255, 988, 3717, 4638, 1300, 4289, 7667, 4507, 1744, 1079, 671, 3837, 4638, 6392, 6369, 2360, 712, 2898, 6392, 6369, 8024, 3146, 702, 2456, 5029, 5408, 5125, 5401, 5445, 2612, 2131, 511, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}
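
As a quick sanity check, the tokens of the first training example can be lined up with the labels produced above (a sketch using only variables defined so far):

# Inspect the token/label alignment for the first training example
processed = process_function(ner_datasets["train"][:1])
tokens = tokenizer.convert_ids_to_tokens(processed["input_ids"][0])
for token, label_id in zip(tokens, processed["labels"][0]):
    print(token, "IGN" if label_id == -100 else label_list[label_id])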

Apply the preprocessing to the whole dataset:

tokenized_datasets = ner_datasets.map(process_function, batched=True)
tokenized_datasets

output:
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4637
    })
})

step4 Create the model

model = AutoModelForTokenClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=len(label_list))

print(model.config.num_labels)
# output: 7
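
Optionally, the label names can be baked into the config when the model is loaded, so that saved checkpoints already carry them (a sketch, not required for training):

# Alternative: pass readable label mappings into the config at load time
model = AutoModelForTokenClassification.from_pretrained(
    "hfl/chinese-macbert-base",
    num_labels=len(label_list),
    id2label={i: name for i, name in enumerate(label_list)},
    label2id={name: i for i, name in enumerate(label_list)},
)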

step5 Create the evaluation function

import numpy as np
seqeval = evaluate.load("seqeval_metric.py")

def eval_metric(pred):
    predictions, labels = pred  # model predictions and the true labels
    predictions = np.argmax(predictions, axis=-1)

    # convert the ids back to the original string labels
    true_predictions = [
        [label_list[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    result = seqeval.compute(predictions=true_predictions, references=true_labels, mode="strict", scheme="IOB2")

    return {
        "f1": result["overall_f1"]
    }
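
seqeval_metric.py is a local copy of the metric script; if it is not available, evaluate.load("seqeval") loads the same metric from the Hub (the seqeval package must be installed). A quick dummy check of the metric:

# Sanity check: identical predictions and references should give F1 = 1.0
dummy = seqeval.compute(
    predictions=[["O", "B-LOC", "I-LOC"]],
    references=[["O", "B-LOC", "I-LOC"]],
    mode="strict",
    scheme="IOB2",
)
print(dummy["overall_f1"])  # 1.0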

step6 Configure the training arguments

args = TrainingArguments(
    output_dir="models_for_ner",
    per_device_train_batch_size=16,   # training batch size
    per_device_eval_batch_size=32,    # evaluation batch size
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1",       # metric used to pick the best model
    load_best_model_at_end=True,
    logging_steps=50,
    num_train_epochs=1                # number of training epochs
)

step7 Create the trainer

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=eval_metric,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

step8 Train the model

trainer.train()
trainer.evaluate(eval_dataset=tokenized_datasets["test"])
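
To keep the fine-tuned weights for later use (the pipeline in step9 could also load them from disk), the model and tokenizer can be saved explicitly; a minimal sketch, the directory name is arbitrary:

# Persist the best checkpoint and the tokenizer for later inference
trainer.save_model("models_for_ner/best")
tokenizer.save_pretrained("models_for_ner/best")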

step9 Model prediction

from transformers import pipeline

# to run inference with a pipeline, id2label must be set on the model config
model.config.id2label = {idx: label for idx, label in enumerate(label_list)}

ner_pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")  # with aggregation_strategy=None you get per-token predictions instead of aggregated entities

res = ner_pipe("小明在北京上班")
print(res)

output:
[{'entity_group': 'PER',
  'score': 0.9070835,
  'word': '小 明',
  'start': 0,
  'end': 2},
 {'entity_group': 'LOC',
  'score': 0.9970835,
  'word': '北 京',
  'start': 3,
  'end': 5}]
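
Because each entity carries start/end character offsets into the input, the entity text can be pulled straight from the original sentence (a small sketch built on the result above):

# Group the recognized entities by type using the character offsets
text = "小明在北京上班"
entities = {}
for ent in ner_pipe(text):
    entities.setdefault(ent["entity_group"], []).append(text[ent["start"]:ent["end"]])
print(entities)  # e.g. {'PER': ['小明'], 'LOC': ['北京']}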