Skip to content

scandeval.task_utils.token_classification

docs module scandeval.task_utils.token_classification

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
"""Utility functions related to the token-classification task group."""

import importlib.util
import logging
import re
import typing as t
from copy import deepcopy

import evaluate
import numpy as np
from evaluate import EvaluationModule
from transformers import PreTrainedTokenizer

from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
from ..exceptions import InvalidBenchmark, NeedsExtraInstalled
from ..utils import raise_if_model_output_contains_nan_values

if t.TYPE_CHECKING:
    from transformers import BatchEncoding

    from ..types import Labels, Predictions

if importlib.util.find_spec("demjson3") is not None:
    import demjson3


logger = logging.getLogger("scandeval")


def compute_metrics(
    model_outputs_and_labels: tuple["Predictions", "Labels"],
    has_misc_tags: bool,
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> dict[str, float]:
    """Compute the metrics needed for evaluation.

    Args:
        model_outputs_and_labels:
            The first array contains the probability predictions and the second
            array contains the true labels.
        has_misc_tags:
            Whether the dataset has MISC tags.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.

    Returns:
        A dictionary with the names of the metrics as keys and the metric values as
        values.

    Raises:
        InvalidBenchmark:
            If a metric's `compute` call returns None, meaning the predictions
            and labels could not be compared.
    """
    model_outputs, labels = model_outputs_and_labels
    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    # Load one Hugging Face `evaluate` module per configured metric; metrics
    # without a Hugging Face ID map to None and are never computed here.
    metrics = {
        metric_cfg.name: (
            evaluate.load(
                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
            )
            if metric_cfg.huggingface_id != ""
            else None
        )
        for metric_cfg in dataset_config.task.metrics
    }

    # Model outputs are either logits (for encoder models) or already-decoded
    # string tags (for generative models); the str check on the first element
    # distinguishes the two cases.
    predictions: list[list[str]]
    if not isinstance(model_outputs[0][0], str):
        raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()

        # Remove ignored index (special tokens), i.e. positions whose label is
        # -100, and convert prediction IDs to string tags
        predictions = [
            [
                dataset_config.id2label[pred_id]
                for pred_id, lbl_id in zip(pred, label)
                if lbl_id != -100
            ]
            for pred, label in zip(raw_predictions, labels)
        ]
        # Convert the integer label IDs to string tags as well, again dropping
        # the -100 positions; labels that are already strings pass through
        labels = [
            [
                (
                    dataset_config.id2label[int(lbl_id)]
                    if isinstance(lbl_id, int) or isinstance(lbl_id, np.int_)
                    else lbl_id
                )
                for lbl_id in label
                if lbl_id != -100
            ]
            for label in labels
        ]

    else:
        predictions = model_outputs  # type: ignore[assignment]

    # Replace predicted tag with either MISC or O tags if they are not part of the
    # dataset
    # NOTE(review): tags are compared in lowercase ("b-misc", "o") — assumes the
    # whole pipeline lowercases NER tags; confirm against dataset configs
    labels_without_misc = {
        label
        for label in dataset_config.id2label.values()
        if label not in {"b-misc", "i-misc"}
    }
    ner_tag: str
    for i, prediction_list in enumerate(predictions):
        for j, ner_tag in enumerate(prediction_list):
            if ner_tag not in labels_without_misc:
                if has_misc_tags and ner_tag[:2] == "b-":
                    predictions[i][j] = "b-misc"
                elif has_misc_tags and ner_tag[:2] == "i-":
                    predictions[i][j] = "i-misc"
                else:
                    predictions[i][j] = "o"

    # Remove MISC labels from predictions (for the no-MISC variant of the score)
    predictions_no_misc = deepcopy(predictions)
    for i, prediction_list in enumerate(predictions_no_misc):
        for j, ner_tag in enumerate(prediction_list):
            if ner_tag[-4:] == "misc":
                predictions_no_misc[i][j] = "o"

    # Remove MISC labels from labels; the extra isinstance/len guards are needed
    # because labels may still contain non-string entries at this point
    labels_no_misc: list[list[str]] = deepcopy(labels)  # type: ignore[arg-type]
    for i, label_list in enumerate(labels_no_misc):
        for j, ner_tag in enumerate(label_list):
            if (
                isinstance(ner_tag, str)
                and len(ner_tag) >= 4
                and ner_tag[-4:] == "misc"
            ):
                labels_no_misc[i][j] = "o"

    # Compute the metrics
    # We manually set the F1 metric to be 100% if both the labels and the models
    # have no NER tags in them, since this causes an error with the `compute`
    # method otherwise
    predictions_all_zero = all(
        all(ner_tag == "o" for ner_tag in prediction_list)
        for prediction_list in predictions
    )
    labels_all_zero = all(
        all(ner_tag == "o" for ner_tag in label_list) for label_list in labels
    )
    if predictions_all_zero and labels_all_zero:
        results = dict(overall_f1=1.0)
    else:
        metric = metrics["micro_f1"]
        assert isinstance(metric, EvaluationModule)
        results = metric.compute(predictions=predictions, references=labels)

    # Compute the metrics without MISC tags
    # We manually set the F1 metric to be 100% if both the labels and the models
    # have no NER tags in them, since this causes an error with the `compute`
    # method otherwise
    predictions_no_misc_all_zero = all(
        all(ner_tag == "o" for ner_tag in prediction_list)
        for prediction_list in predictions_no_misc
    )
    labels_no_misc_all_zero = all(
        all(ner_tag == "o" for ner_tag in label_list) for label_list in labels_no_misc
    )
    if predictions_no_misc_all_zero and labels_no_misc_all_zero:
        results_no_misc = dict(overall_f1=1.0)
    else:
        metric = metrics["micro_f1_no_misc"]
        assert isinstance(metric, EvaluationModule)
        results_no_misc = metric.compute(
            predictions=predictions_no_misc, references=labels_no_misc
        )

    # Raise error if the metrics are invalid
    if results is None or results_no_misc is None:
        raise InvalidBenchmark("The predictions and labels are not of the same length.")

    return dict(
        micro_f1_no_misc=results_no_misc["overall_f1"], micro_f1=results["overall_f1"]
    )


def extract_labels_from_generation(
    input_batch: dict[str, list],
    model_output: "GenerativeModelOutput",
    dataset_config: "DatasetConfig",
) -> list[t.Any]:
    """Extract the predicted labels from the generated output.

    The model is expected to emit a JSON dictionary mapping prompt tag names to
    lists of named entities. Each entity string is matched back onto the input
    tokens to produce BIO tags ("b-<tag>" / "i-<tag>" / "o") per token.

    Args:
        input_batch:
            The input batch, where the keys are the feature names and the values
            are lists with the feature values.
        model_output:
            The raw generated output of the model.
        dataset_config:
            The configuration of the dataset.

    Returns:
        The predicted labels, one list of BIO tags per document.

    Raises:
        NeedsExtraInstalled:
            If the `demjson3` package is not installed.
    """
    if importlib.util.find_spec("demjson3") is None:
        raise NeedsExtraInstalled(extra="generative")

    raw_predictions = model_output.sequences

    # Attempt to extract the JSON dictionary from the predictions; if no JSON
    # object is found, fall back to the raw string (which will then fail to
    # parse below and be skipped)
    json_regex = r"\{.+?\}"
    json_matches = [
        re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
        or raw_prediction
        for raw_prediction in raw_predictions
    ]
    raw_predictions = [
        json_match.group() if isinstance(json_match, re.Match) else json_match
        for json_match in json_matches
    ]

    # Default every token to the "outside" tag; parsing failures leave the
    # corresponding document entirely tagged "o"
    tokens = input_batch["tokens"]
    predicted_labels: list[list[str]] = [["o"] * len(token_ids) for token_ids in tokens]
    for idx, raw_prediction in enumerate(raw_predictions):
        try:
            # demjson3 tolerates malformed JSON (single quotes, trailing
            # commas), which LLM output frequently contains
            json_output = demjson3.decode(txt=raw_prediction)
            if not isinstance(json_output, dict):
                logger.debug(
                    "The model output is not a JSON dictionary, so cannot parse "
                    f"it. Skipping. Here is the output: {raw_prediction}"
                )
                continue
            elif not all(isinstance(key, str) for key in json_output.keys()):
                logger.debug(
                    "The model output is not a JSON dictionary with string keys, "
                    "so cannot parse it. Skipping. Here is the output: "
                    f"{raw_prediction}"
                )
                continue
            elif not all(isinstance(value, list) for value in json_output.values()):
                logger.debug(
                    "The model output is not a JSON dictionary with list values, "
                    "so cannot parse it. Skipping. Here is the output: "
                    f"{raw_prediction}"
                )
                continue
            prediction_dict: dict[str, list[str]] = json_output
        except demjson3.JSONDecodeError:
            logger.debug(
                "The model output is not valid JSON, so cannot parse it. Skipping. "
                f"Here is the output: {raw_prediction!r}"
            )
            continue

        prompt_label_mapping = dataset_config.prompt_label_mapping
        for prompt_tag_name, named_entities in prediction_dict.items():
            # Map the human-readable prompt tag back to the dataset tag name,
            # stripping the "b-"/"i-" prefix from the mapping key
            try:
                tag_name = [
                    tag[2:]
                    for tag, prompt_tag in prompt_label_mapping.items()
                    if prompt_tag == prompt_tag_name
                ][0]
            except IndexError:
                logger.debug(
                    "The model produced an invalid prompt tag name, "
                    f"{prompt_tag_name}. Skipping."
                )
                continue

            named_entities = [str(named_entity) for named_entity in named_entities]
            for named_entity in named_entities:
                for ne_idx, named_entity_word in enumerate(named_entity.split()):
                    for token_idx, token in enumerate(tokens[idx]):
                        if named_entity_word in token:
                            if ne_idx == 0:
                                predicted_labels[idx][token_idx] = f"b-{tag_name}"
                            elif (
                                # `token_idx > 0` prevents Python's negative
                                # indexing from wrapping to the LAST token when
                                # checking the previous token's tag
                                token_idx > 0
                                and predicted_labels[idx][token_idx] == "o"
                                and predicted_labels[idx][token_idx - 1][2:] == tag_name
                            ):
                                predicted_labels[idx][token_idx] = f"i-{tag_name}"
    return predicted_labels


def tokenize_and_align_labels(
    examples: dict, tokenizer: "PreTrainedTokenizer", label2id: dict[str, int]
) -> "BatchEncoding":
    """Tokenise all texts and align the labels with them.

    Only the first sub-token of each word receives the word's label; all other
    sub-tokens (and special tokens) get the label -100, which is ignored by the
    loss function.

    Args:
        examples:
            The examples to be tokenised, with "tokens" (lists of words) and
            "labels" (lists of NER tags, one per word) keys.
        tokenizer:
            A pretrained tokenizer.
        label2id:
            A dictionary that converts NER tags to IDs.

    Returns:
        A dictionary containing the tokenized data as well as labels.

    Raises:
        InvalidBenchmark:
            If a label is not present in `label2id`, or if manual word-token
            alignment fails for a slow tokenizer.
    """
    # Tokenize the texts. We use the `is_split_into_words` argument here because
    # the texts in our dataset are lists of words (with a label for each word)
    tokenized_inputs = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True, padding=True
    )

    all_labels: list[list[int]] = list()
    labels: list[str]
    word_ids: list[int | None]
    for i, labels in enumerate(examples["labels"]):
        # "Fast" tokenizers expose the token-to-word mapping directly; slow
        # tokenizers raise ValueError, in which case we reconstruct it manually
        try:
            word_ids = tokenized_inputs.word_ids(batch_index=i)
        except ValueError:
            word_ids = _align_word_ids_manually(
                tokenizer=tokenizer,
                words=examples["tokens"][i],
                token_ids=tokenized_inputs.input_ids[i],
            )

        previous_word_idx: int | None = None
        label_ids: list[int] = list()
        for word_id in word_ids:
            # Special tokens have a word id that is None. We set the label to -100
            # so they are automatically ignored in the loss function
            if word_id is None:
                label_ids.append(-100)

            # We set the label for the first token of each word
            elif word_id != previous_word_idx:
                label = labels[word_id]
                try:
                    label_id = label2id[label.lower()]
                except KeyError:
                    msg = f"The label {label} was not found in the model's config."
                    raise InvalidBenchmark(msg)
                label_ids.append(label_id)

            # For the other tokens in a word, we set the label to -100
            else:
                label_ids.append(-100)

            previous_word_idx = word_id

        all_labels.append(label_ids)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs


def _align_word_ids_manually(
    tokenizer: "PreTrainedTokenizer", words: list[str], token_ids: list[int]
) -> list:
    """Reconstruct token-to-word IDs for a non-fast tokenizer.

    Aligns tokens with words on the character level, after stripping subword
    prefixes, resolving UNK tokens, and masking out special tokens.

    Args:
        tokenizer:
            The (slow) tokenizer that produced `token_ids`.
        words:
            The words in the document.
        token_ids:
            The token IDs of the tokenised document.

    Returns:
        A list with one entry per token: the index of the word the token belongs
        to, or None for special tokens.

    Raises:
        InvalidBenchmark:
            If the characters of the tokens cannot be aligned with the
            characters of the words.
    """
    # Decode the token IDs back to token strings
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    assert isinstance(tokens, list)

    # Remove SentencePiece/WordPiece subword prefixes so the token characters
    # line up with the word characters
    prefixes_to_remove = ["▁", "##"]
    for tok_idx, tok in enumerate(tokens):
        if tok:
            for prefix in prefixes_to_remove:
                if tok.startswith(prefix):
                    tokens[tok_idx] = tok[len(prefix) :]

    # Replace UNK tokens with the correct word content
    tokens = handle_unk_tokens(tokenizer=tokenizer, tokens=tokens, words=words)

    # Get list of special tokens. Some tokenizers do not record these
    # properly, which is why we convert the values to their indices and
    # then back to strings
    sp_toks = [
        tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(sp_tok))
        for sp_tok in tokenizer.special_tokens_map.values()
    ]

    # Replace special tokens with `None` so they are excluded from alignment
    tokens_with_none = [None if tok in sp_toks else tok for tok in tokens]

    # Build per-character index lists: one word index per word character, and
    # one token index per (non-special) token character
    word_idxs = [word_idx for word_idx, word in enumerate(words) for _ in str(word)]
    token_idxs = [
        tok_idx
        for tok_idx, tok_or_none in enumerate(tokens_with_none)
        for _ in str(tok_or_none)
        if tok_or_none is not None
    ]

    # Fail fast if the character counts do not match (e.g. the tokenizer uses
    # an unknown prefix), before zipping would silently truncate
    if len(word_idxs) != len(token_idxs):
        raise InvalidBenchmark(
            "The tokens could not be aligned with the words during manual "
            "word-token alignment. It seems that the tokenizer is neither "
            "of the fast variant nor of a SentencePiece/WordPiece variant."
        )
    alignment = list(zip(word_idxs, token_idxs))

    # For each token, pick the word index of its first aligned character;
    # special and empty tokens map to None
    word_ids: list = list()
    for tok_idx, tok_or_none in enumerate(tokens_with_none):
        if tok_or_none is None or tok_or_none == "":
            word_ids.append(None)
        else:
            word_idx = [
                word_idx for word_idx, token_idx in alignment if token_idx == tok_idx
            ][0]
            word_ids.append(word_idx)
    return word_ids


def handle_unk_tokens(
    tokenizer: "PreTrainedTokenizer", tokens: list[str], words: list[str]
) -> list[str]:
    """Replace unknown tokens in the tokens with the corresponding word.

    Args:
        tokenizer:
            The tokenizer used to tokenize the words.
        tokens:
            The list of tokens. Modified in place.
        words:
            The list of words.

    Returns:
        The list of tokens with unknown tokens replaced by the corresponding word.
    """
    unk_token = tokenizer.unk_token

    # Locate the token indices of the unknown tokens
    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == unk_token]

    # Tokenize every word exactly once, so we do not run the tokenizer twice
    # per word (once to find the UNK-containing words, once to resolve them)
    word_tokens = [
        tokenizer.convert_ids_to_tokens(
            tokenizer.encode(word, add_special_tokens=False)
        )
        for word in words
    ]

    # Locate the word indices of the words which contain an unknown token
    word_unk_idxs = [
        i for i, toks_of_word in enumerate(word_tokens) if unk_token in toks_of_word
    ]

    # Iterate over the token index and word index pairs
    for tok_idx, word_idx in zip(token_unk_idxs, word_unk_idxs):
        # Fetch the word and its tokenisation, which contains at least one UNK
        # token
        word = words[word_idx]
        tokens_with_unk = word_tokens[word_idx]

        # Iterate over the tokens in the word
        for possible_unk_token in tokens_with_unk:
            # If the token is not an UNK token then we remove the first occurence
            # of the content of this token from the word. The result of the `word`
            # variable will be the content of the UNK token.
            # NOTE: This is a bit hacky and not bulletproof. For instance, if the
            # word is "1925-1950" and the tokenizer splits it into ["[UNK]", "-",
            # "19", "50"], then the result will be 2519 instead of 1925. This
            # happens almost never, however, so we can live with it.
            if possible_unk_token != unk_token:
                word = word.replace(possible_unk_token, "", 1)

        # Replace the token with the word
        tokens[tok_idx] = word

    return tokens