def process_function(examples):
    # Tokenize pre-split words; is_split_into_words=True keeps the word boundaries
    tokenized_examples = tokenizer(examples["tokens"], max_length=128, truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids() maps each token back to its source word; special tokens map to None
        word_ids = tokenized_examples.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                # -100 is ignored by the loss, so [CLS]/[SEP] do not contribute to training
                label_ids.append(-100)
            else:
                label_ids.append(label[word_id])
        labels.append(label_ids)
    tokenized_examples["labels"] = labels
    return tokenized_examples
print(process_function(ner_datasets['train'][:2]))

output:
{'input_ids': [[101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], [101, 6821, 2429, 898, 2255, 988, 3717, 4638, 1300, 4289, 7667, 4507, 1744, 1079, 671, 3837, 4638, 6392, 6369, 2360, 712, 2898, 6392, 6369, 8024, 3146, 702, 2456, 5029, 5408, 5125, 5401, 5445, 2612, 2131, 511, 102]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}
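Once the function behaves as expected on a couple of samples, it can be mapped over the whole dataset. The snippet below is a minimal sketch, assuming ner_datasets is the DatasetDict loaded earlier; dropping the raw columns with remove_columns is an added convenience, not part of the original code.

# Apply process_function to every split in batches; remove_columns drops the raw
# tokens/ner_tags columns so only model-ready fields remain (assumed, optional)
tokenized_datasets = ner_datasets.map(
    process_function,
    batched=True,
    remove_columns=ner_datasets["train"].column_names,
)
print(tokenized_datasets)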