forked from MohamedAlaaAli/SWIZT
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_data.py
More file actions
109 lines (91 loc) · 4.34 KB
/
prepare_data.py
File metadata and controls
109 lines (91 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from datasets import load_dataset
from collections import defaultdict, Counter
from datasets import DatasetDict
import pandas as pd
import torch
# Module-level label feature consumed by `create_tag_names`. Placeholder only:
# callers are expected to rebind it (e.g. to a datasets ClassLabel exposing
# `int2str`) before mapping. NOTE(review): confirm it is actually assigned at
# runtime — if left as None, `create_tag_names` fails with AttributeError.
tags=None
def get_data():
    """
    Load the PAN-X subsets of XTREME for German, French, Italian and English.

    Every split of each per-language corpus is shuffled with a fixed seed (0)
    so repeated runs observe the examples in the same order.

    Returns:
        defaultdict[str, DatasetDict]: Mapping from language code ("de", "fr",
        "it", "en") to its shuffled ``DatasetDict`` of splits.
    """
    panx_ch = defaultdict(DatasetDict)
    for lang in ("de", "fr", "it", "en"):
        corpus = load_dataset("xtreme", name=f"PAN-X.{lang}")
        for split_name in corpus:
            panx_ch[lang][split_name] = corpus[split_name].shuffle(seed=0)
    return panx_ch
def create_tag_names(batch):
    """
    Translate a batch's integer NER labels into their string names.

    Relies on the module-level ``tags`` object (a ClassLabel-like feature
    exposing ``int2str``) having been assigned before this runs.

    Args:
        batch (dict): Batch containing an "ner_tags" key of integer label ids.

    Returns:
        dict: {"ner_tags_str": string tag names, one entry per input row}.
    """
    readable = [tags.int2str(label_ids) for label_ids in batch["ner_tags"]]
    return {"ner_tags_str": readable}
def tag_text(text, tags, model, tokenizer, device="cuda"):
    """
    Run a NER model over *text* and return a token/tag table.

    The text is tokenized twice: once to recover the subword token strings,
    and once to build the model's input tensor. Per-token logits are argmax
    decoded and translated into human-readable names via ``tags.names``.

    Args:
        text (str): Input text to tag.
        tags: Object exposing ``names``, a sequence mapping label id -> name.
        model: Pre-trained NER model returning per-token logits as its first
            output.
        tokenizer: Tokenizer compatible with the model.
        device (str): Device the input tensor is moved to (default "cuda").

    Returns:
        pd.DataFrame: Two rows, "Tokens" and "Tags", one column per token.
    """
    token_strings = tokenizer(text).tokens()
    model_input = tokenizer(text, return_tensors="pt").input_ids.to(device)
    logits = model(model_input)[0]
    label_ids = torch.argmax(logits, dim=2)[0].cpu().numpy()
    tag_names = [tags.names[label_id] for label_id in label_ids]
    return pd.DataFrame([token_strings, tag_names], index=["Tokens", "Tags"])
def tokenize_and_align_labels(examples, xlmr_tokenizer):
    """
    Tokenize pre-split sentences and align NER labels to the subword tokens.

    Only the first subword of each word keeps that word's label; special
    tokens (word id None) and continuation subwords get ``-100`` so they are
    ignored by the loss.

    Args:
        examples (dict): A dictionary containing:
            - "tokens" (list of list of str): Sentences as lists of tokens.
            - "ner_tags" (list of list of int): NER tags for those tokens.
        xlmr_tokenizer: Tokenizer supporting ``is_split_into_words`` and
            per-example ``word_ids`` lookup.

    Returns:
        dict-like: Tokenized inputs (e.g. "input_ids", "attention_mask") plus
        an aligned "labels" list.
    """
    encoded = xlmr_tokenizer(examples["tokens"], truncation=True,
                             is_split_into_words=True)
    all_labels = []
    for sent_idx, word_labels in enumerate(examples["ner_tags"]):
        aligned = []
        prev_word_id = None
        for word_id in encoded.word_ids(batch_index=sent_idx):
            skip = word_id is None or word_id == prev_word_id
            aligned.append(-100 if skip else word_labels[word_id])
            prev_word_id = word_id
        all_labels.append(aligned)
    encoded["labels"] = all_labels
    return encoded
def encode_panx_dataset(corpus, xlmr_tokenizer=None):
    """
    Encode a PAN-X dataset by tokenizing sentences and aligning NER labels.

    Applies ``tokenize_and_align_labels`` with batched processing and removes
    the raw columns ('tokens', 'ner_tags', 'langs') so the result is ready for
    model training.

    Args:
        corpus (DatasetDict): Splits (e.g. 'train', 'validation', 'test') with
            the features:
            - "tokens": List of tokens for each sentence.
            - "ner_tags": NER labels for the tokens.
            - "langs": Language identifiers.
        xlmr_tokenizer: Tokenizer forwarded to ``tokenize_and_align_labels``.
            Required; kept keyword-optional for backward signature
            compatibility.

    Returns:
        DatasetDict: Tokenized inputs with aligned labels, containing features
        such as "input_ids", "attention_mask", and "labels".

    Raises:
        ValueError: If no tokenizer is supplied.
    """
    # BUG FIX: the original call never forwarded a tokenizer, so `map` invoked
    # tokenize_and_align_labels(batch) and crashed with TypeError (missing
    # required argument 'xlmr_tokenizer'). Forward it through `fn_kwargs`.
    if xlmr_tokenizer is None:
        raise ValueError("encode_panx_dataset requires an xlmr_tokenizer")
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=['tokens', 'ner_tags', 'langs'],
                      fn_kwargs={"xlmr_tokenizer": xlmr_tokenizer})