"""Utility functions related to the question-answering task group."""
import logging
import typing as t
from collections import defaultdict
import evaluate
import numpy as np
from evaluate import EvaluationModule
from transformers import PreTrainedTokenizer
from transformers.trainer import Trainer
from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
from ..utils import (
get_special_token_metadata,
raise_if_model_output_contains_nan_values,
)
if t.TYPE_CHECKING:
from datasets.arrow_dataset import Dataset
from transformers.tokenization_utils_base import BatchEncoding
from ..types import Labels, Predictions
logger = logging.getLogger("scandeval")
class QuestionAnsweringTrainer(Trainer):
"""Trainer subclass for question answering tasks."""
def __init__(self, *args, **kwargs) -> None:
"""Initialize the trainer."""
super().__init__(*args, **kwargs)
# Get the CLS token id for the tokenizer
special_token_metadata = get_special_token_metadata(self.tokenizer)
self.cls_token_id = special_token_metadata["cls_token_id"]
# Set the label names
self.label_names = ["start_positions", "end_positions"]
def evaluate(
self,
eval_dataset: "Dataset | None" = None,
orig_eval_dataset: "Dataset | None" = None,
ignore_keys: list[str] | None = None,
metric_key_prefix: str = "eval",
) -> dict[str, float] | None:
"""Evaluate the model on the given dataset.
Args:
eval_dataset:
The dataset to evaluate on. If None, then use the stored evaluation
dataset.
orig_eval_dataset:
The original evaluation dataset, before any postprocessing. If None,
then use the stored original evaluation dataset.
ignore_keys:
The keys to ignore when computing the metrics.
metric_key_prefix:
The prefix to use for the metric keys.
Returns:
The metrics computed on the evaluation dataset.
"""
eval_dataloader = self.get_eval_dataloader(eval_dataset)
        # Temporarily disable metric computation; we will do it manually after the
        # evaluation loop.
compute_metrics = self.compute_metrics # type: ignore[has-type]
self.compute_metrics = None
eval_loop = (
self.prediction_loop
if self.args.use_legacy_prediction_loop
else self.evaluation_loop
)
try:
output = eval_loop(
eval_dataloader,
description="Evaluation",
prediction_loss_only=True if compute_metrics is None else None,
ignore_keys=ignore_keys,
metric_key_prefix=metric_key_prefix,
)
finally:
self.compute_metrics = compute_metrics
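        # If the original (non-tokenized) evaluation dataset is available, convert the
        # raw start/end logits into text predictions and references, and compute the
        # metrics on those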
if orig_eval_dataset is not None:
preds_and_labels = postprocess_predictions_and_labels(
predictions=output.predictions,
dataset=orig_eval_dataset,
prepared_dataset=eval_dataset,
cls_token_index=self.cls_token_id,
)
output.metrics.update(self.compute_metrics(preds_and_labels))
# Prefix all keys with metric_key_prefix + '_'
for key in list(output.metrics.keys()):
if not key.startswith(f"{metric_key_prefix}_"):
output.metrics[f"{metric_key_prefix}_{key}"] = output.metrics.pop(
key
)
        # Only the main node logs the results by default
if self.args.should_log:
self.log(output.metrics)
self.control = self.callback_handler.on_evaluate(
self.args,
self.state,
self.control, # type: ignore[has-type]
output.metrics,
)
return output.metrics
def compute_metrics(
model_outputs_and_labels: tuple["Predictions", "Labels"],
dataset_config: "DatasetConfig",
benchmark_config: "BenchmarkConfig",
) -> dict[str, float]:
"""Compute the metrics needed for evaluation.
Args:
model_outputs_and_labels:
The first sequence contains the model outputs and the second sequence
contains the true labels.
dataset_config:
The configuration of the dataset.
benchmark_config:
The configuration of the benchmark.
Returns:
A dictionary with the names of the metrics as keys and the metric values as
values.
"""
model_outputs, labels = model_outputs_and_labels
raise_if_model_output_contains_nan_values(model_output=model_outputs)
metrics = {
metric_cfg.name: (
evaluate.load(
path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
)
if metric_cfg.huggingface_id != ""
else None
)
for metric_cfg in dataset_config.task.metrics
}
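    # If the model outputs are floating-point arrays (i.e., logits or probabilities),
    # convert them to label predictions by taking the argmax over the last axis;
    # otherwise, assume that they are already predictions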
model_output_dtype = np.asarray(model_outputs).dtype
if model_output_dtype in [np.float16, np.float32, np.float64]:
predictions = np.asarray(model_outputs).argmax(axis=-1)
else:
predictions = model_outputs
results: dict[str, float] = dict()
for cfg in dataset_config.task.metrics:
metric = metrics[cfg.name]
assert isinstance(metric, EvaluationModule)
score_dict: dict[str, float] | None = metric.compute(
predictions=predictions, references=labels, **cfg.compute_kwargs
)
# The metric returns None if we are running on multi-GPU and the current
# process is not the main process
if score_dict is not None:
scores = score_dict[cfg.results_key]
if isinstance(scores, list):
scores = sum(scores) / len(scores)
results[cfg.name] = scores
return results
def extract_labels_from_generation(
input_batch: dict[str, list], model_output: "GenerativeModelOutput"
) -> list[t.Any]:
"""Extract the predicted labels from the generated output.
Args:
input_batch:
The input batch, where the keys are the feature names and the values
are lists with the feature values.
model_output:
The raw generated output of the model.
Returns:
The predicted labels.
"""
raw_predictions = model_output.sequences
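    # Convert the generated strings into the dictionary format expected by SQuAD-v2
    # style metrics, using a dummy no-answer probability since the model always
    # produces an answer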
predictions = [
dict(id=id, prediction_text=predicted_answer.lower(), no_answer_probability=0.0)
for id, predicted_answer in zip(input_batch["id"], raw_predictions)
]
return predictions
def prepare_train_examples(
examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
) -> "BatchEncoding":
"""Prepare the features for training.
Args:
examples:
The examples to prepare.
tokenizer:
The tokenizer to use to prepare the examples.
Returns:
The prepared examples.
"""
    # Some of the questions have lots of whitespace on the left, which is not useful
    # and will make the truncation of the context fail (the tokenized question will
    # take up a lot of space). So we remove that leading whitespace
examples["question"] = [q.lstrip() for q in examples["question"]]
# Extract special token metadata from the tokenizer
special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
has_cls_token = special_token_metadata["has_cls_token"]
has_sep_token = special_token_metadata["has_sep_token"]
cls_token_id = special_token_metadata["cls_token_id"]
cls_token = special_token_metadata["cls_token"]
sep_token = special_token_metadata["sep_token"]
# If the tokenizer is not adding special tokens, then we add them manually
if not has_cls_token and not has_sep_token:
examples["question"] = [
f"{cls_token}{q}{sep_token}" for q in examples["question"]
]
examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
# Set the stride used during tokenization, when the context is long enough to be
# split into several features. Since we are always keeping the question tokens, we
# need to make sure that the stride does not exceed the resulting maximum context
# length.
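    # As an illustration: with model_max_length=512 the initial stride is 128 and
    # max_length is 384; if the longest question has 300 tokens and there are 2
    # special tokens, the stride is capped at min(128, 384 - 300 - 2) = 82 and
    # max_length becomes 512 - 82 = 430.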
max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
num_special_tokens = int(has_cls_token) + int(has_sep_token)
stride = tokenizer.model_max_length // 4
max_length = tokenizer.model_max_length - stride
stride = min(stride, max_length - max_question_tokens - num_special_tokens)
max_length = tokenizer.model_max_length - stride
# Tokenize our examples with truncation and padding, but keep the overflows using a
    # stride. This results in one example possibly giving several features when a
# context is long, each of those features having a context that overlaps a bit the
# context of the previous feature.
tokenized_examples = tokenizer(
text=examples["question"],
text_pair=examples["context"],
truncation="only_second",
max_length=max_length,
stride=stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
# Since one example might give us several features if it has a long context, we
# need a map from a feature to its corresponding example. This key gives us just
# that
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# The offset mappings will give us a map from token to character position in the
# original context. This will help us compute the start_positions and
# end_positions.
offset_mapping = tokenized_examples.pop("offset_mapping")
# Initialise the start- and end positions of the answers
tokenized_examples["start_positions"] = list()
tokenized_examples["end_positions"] = list()
for i, offsets in enumerate(offset_mapping):
# Get the input IDs for the current example
input_ids = tokenized_examples.input_ids[i]
# We will label impossible answers with the index of the CLS token
cls_index = input_ids.index(cls_token_id)
# Grab the sequence corresponding to that example (to know what is the context
# and what is the question).
sequence_ids = tokenized_examples.sequence_ids(i)
# Manually ensure that the special tokens are set to None in `sequence_ids`
for special_token in tokenizer.special_tokens_map.keys():
if hasattr(tokenizer, f"{special_token}_id"):
special_token_id = getattr(tokenizer, f"{special_token}_id")
if special_token_id is not None:
sequence_ids = [
None if token_id == special_token_id else seq_id
for token_id, seq_id in zip(input_ids, sequence_ids)
]
# One example can give several spans, this is the index of the example
# containing this span of text.
sample_index = sample_mapping[i]
answers = examples["answers"][sample_index]
# If no answers are given, set the cls_index as answer.
if len(answers["answer_start"]) == 0:
tokenized_examples.start_positions.append(cls_index)
tokenized_examples.end_positions.append(cls_index)
else:
# Start/end character index of the answer in the text.
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])
# Start token index of the current span in the text.
token_start_index = 0
while sequence_ids[token_start_index] != 1:
token_start_index += 1
# End token index of the current span in the text.
token_end_index = len(input_ids) - 1
while sequence_ids[token_end_index] != 1:
token_end_index -= 1
# Detect if the answer is out of the span (in which case this feature is
# labeled with the CLS index).
if not (
offsets[token_start_index][0] <= start_char
and offsets[token_end_index][1] >= end_char
):
tokenized_examples.start_positions.append(cls_index)
tokenized_examples.end_positions.append(cls_index)
# Otherwise move the token_start_index and token_end_index to the two ends
# of the answer. Note: we could go after the last offset if the answer is
# the last word (edge case).
else:
while (
token_start_index <= token_end_index
and offsets[token_start_index][0] <= start_char
):
token_start_index += 1
token_start_index -= 1
tokenized_examples.start_positions.append(token_start_index)
while (
token_start_index <= token_end_index
and offsets[token_end_index][1] >= end_char
):
token_end_index -= 1
token_end_index += 1
tokenized_examples.end_positions.append(token_end_index)
assert token_end_index >= token_start_index
return tokenized_examples
def prepare_test_examples(
examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
) -> "BatchEncoding":
"""Prepare test examples.
Args:
examples:
Dictionary of test examples.
tokenizer:
The tokenizer used to preprocess the examples.
Returns:
The prepared test examples.
"""
    # Some of the questions have lots of whitespace on the left, which is not useful
    # and will make the truncation of the context fail (the tokenized question will
    # take up a lot of space). So we remove that leading whitespace
examples["question"] = [q.lstrip() for q in examples["question"]]
# Extract special token metadata from the tokenizer
special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
has_cls_token = special_token_metadata["has_cls_token"]
has_sep_token = special_token_metadata["has_sep_token"]
cls_token = special_token_metadata["cls_token"]
sep_token = special_token_metadata["sep_token"]
# If the tokenizer is not adding special tokens, then we add them manually
if not has_cls_token and not has_sep_token:
examples["question"] = [
f"{cls_token}{q}{sep_token}" for q in examples["question"]
]
examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
# Set the stride used during tokenization, when the context is long enough to be
# split into several features. Since we are always keeping the question tokens, we
# need to make sure that the stride does not exceed the resulting maximum context
# length.
max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
num_special_tokens = int(has_cls_token) + int(has_sep_token)
stride = tokenizer.model_max_length // 4
max_length = tokenizer.model_max_length - stride
stride = min(stride, max_length - max_question_tokens - num_special_tokens)
max_length = tokenizer.model_max_length - stride
# Tokenize our examples with truncation and maybe padding, but keep the overflows
    # using a stride. This results in one example possibly giving several features when
# a context is long, each of those features having a context that overlaps a bit
# the context of the previous feature.
tokenized_examples = tokenizer(
text=examples["question"],
text_pair=examples["context"],
truncation="only_second",
max_length=max_length,
stride=stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
# Since one example might give us several features if it has a long context, we
# need a map from a feature to its corresponding example. This key gives us just
# that.
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# We keep the id that gave us this feature and we will store the offset mappings.
tokenized_examples["id"] = list()
for i in range(len(tokenized_examples.input_ids)):
# Grab the sequence corresponding to that example (to know what is the context
# and what is the question).
sequence_ids = tokenized_examples.sequence_ids(i)
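        # The context is the second sequence in the (question, context) pair, so it
        # corresponds to sequence id 1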
context_index = 1
# One example can give several spans, this is the index of the example
# containing this span of text.
sample_index = sample_mapping[i]
tokenized_examples.id.append(examples["id"][sample_index])
        # Set the offset_mapping entries that are not part of the context to (-1, -1),
        # making it easy to determine whether a token position is part of the context
tokenized_examples.offset_mapping[i] = [
(o if sequence_ids[k] == context_index else (-1, -1))
for k, o in enumerate(tokenized_examples.offset_mapping[i])
]
return tokenized_examples
def postprocess_predictions_and_labels(
predictions: list,
dataset: "Dataset",
prepared_dataset: "Dataset",
cls_token_index: int,
) -> tuple[list[dict], list[dict]]:
"""Postprocess the predictions and labels, to allow easier metric computation.
Args:
predictions:
A pair of (start_logits, end_logits) predictions.
dataset:
The dataset containing the examples.
prepared_dataset:
The dataset containing the prepared examples.
cls_token_index:
The index of the CLS token.
Returns:
The postprocessed predictions and labels.
"""
# Extract the logits from the predictions
all_start_logits = predictions[0]
all_end_logits = predictions[1]
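    # Both logit arrays have one row per prepared feature, with one logit per token
    # position in that feature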
# Build a map from an example to its corresponding features, being the blocks of
# text from the context that we're feeding into the model. An example can have
# multiple features/blocks if it has a long context.
id_to_index = {k: i for i, k in enumerate(dataset["id"])}
features_per_example = defaultdict(list)
for i, feature in enumerate(prepared_dataset):
id = feature["id"]
example_index = id_to_index[id]
features_per_example[example_index].append(i)
# Loop over all the examples
predictions = list()
labels = list()
for example_index, example in enumerate(dataset):
# Extract the best valid answer associated with the current example
best_answer = find_best_answer(
all_start_logits=all_start_logits,
all_end_logits=all_end_logits,
prepared_dataset=prepared_dataset,
feature_indices=features_per_example[example_index],
context=example["context"],
max_answer_length=30,
num_best_logits=20,
min_null_score=0.0,
cls_token_index=cls_token_index,
)
# Create the final prediction dictionary, to be added to the list of
# predictions
prediction = dict(
id=example["id"], prediction_text=best_answer, no_answer_probability=0.0
)
# Add the answer to the list of predictions
predictions.append(prediction)
# Create the associated reference dictionary, to be added to the list of
# references
label = dict(
id=example["id"],
answers=dict(
text=example["answers"]["text"],
answer_start=example["answers"]["answer_start"],
),
)
# Add the answer and label to the list of predictions and labels, respectively
labels.append(label)
return predictions, labels
def find_best_answer(
all_start_logits: np.ndarray,
all_end_logits: np.ndarray,
prepared_dataset: "Dataset",
feature_indices: list[int],
context: str,
max_answer_length: int,
num_best_logits: int,
min_null_score: float,
cls_token_index: int,
) -> str:
"""Find the best answer for a given example.
Args:
all_start_logits:
The start logits for all the features.
all_end_logits:
The end logits for all the features.
prepared_dataset:
The dataset containing the prepared examples.
feature_indices:
The indices of the features associated with the current example.
context:
The context of the example.
max_answer_length:
The maximum length of the answer.
num_best_logits:
The number of best logits to consider.
min_null_score:
The minimum score an answer can have.
cls_token_index:
The index of the CLS token.
Returns:
The best answer for the example.
"""
# Loop through all the features associated to the current example
valid_answers = list()
for feature_index in feature_indices:
# Get the features associated with the current example
features = prepared_dataset[feature_index]
# Get the predictions of the model for this feature
start_logits = all_start_logits[feature_index]
end_logits = all_end_logits[feature_index]
# Update minimum null prediction
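        # (the null score is the score of predicting "no answer", i.e., both the start
        # and end positions pointing at the CLS token)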
cls_index = features["input_ids"].index(cls_token_index)
feature_null_score = (start_logits[cls_index] + end_logits[cls_index]).item()
if min_null_score < feature_null_score:
min_null_score = feature_null_score
# Find the valid answers for the feature
valid_answers_for_feature = find_valid_answers(
start_logits=start_logits,
end_logits=end_logits,
offset_mapping=features["offset_mapping"],
context=context,
max_answer_length=max_answer_length,
num_best_logits=num_best_logits,
min_null_score=min_null_score,
)
valid_answers.extend(valid_answers_for_feature)
    # In the very rare edge case where we do not have a single valid answer, we return
    # an empty string to avoid failure
if not valid_answers:
return ""
# Otherwise, we select the answer with the largest score as the best answer, and
# return it
best_answer_dict = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
return best_answer_dict["text"]
def find_valid_answers(
start_logits: np.ndarray,
end_logits: np.ndarray,
offset_mapping: list[tuple[int, int]],
context: str,
max_answer_length: int,
num_best_logits: int,
min_null_score: float,
) -> list[dict]:
"""Find the valid answers from the start and end indexes.
Args:
start_logits:
The logits for the start of the answer.
end_logits:
The logits for the end of the answer.
offset_mapping:
The offset mapping, being a list of pairs of integers for each token index,
containing the start and end character index in the original context.
context:
The context of the example.
max_answer_length:
The maximum length of the answer.
num_best_logits:
The number of best logits to consider. Note that this function will run in
O(`num_best_logits` ^ 2) time.
min_null_score:
The minimum score an answer can have.
Returns:
A list of the valid answers, each being a dictionary with keys "text" and
"score", the score being the sum of the start and end logits.
"""
# Fetch the top-k predictions for the start- and end token indices
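    # (np.argsort sorts in ascending order, so the reversed slice picks out the
    # indices of the num_best_logits largest logits, in descending order)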
start_indexes = np.argsort(start_logits)[-1 : -num_best_logits - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -num_best_logits - 1 : -1].tolist()
# We loop over all combinations of starting and ending indexes for valid answers
valid_answers = list()
for start_index in start_indexes:
for end_index in end_indexes:
# If the starting or ending index is out-of-scope, meaning that they are
# either out of bounds or correspond to part of the input_ids that are not
# in the context, then we skip this index
if (
start_index >= len(offset_mapping)
or end_index >= len(offset_mapping)
or tuple(offset_mapping[start_index]) == (-1, -1)
or tuple(offset_mapping[end_index]) == (-1, -1)
):
continue
            # Do not consider answers with a length that is either negative or greater
            # than the maximum answer length
max_val = max_answer_length + start_index - 1
if end_index < start_index or end_index > max_val:
continue
# If we got to this point then the answer is valid, so we store the
# corresponding start- and end character indices in the original context,
# and from these extract the answer
start_char = offset_mapping[start_index][0]
end_char = offset_mapping[end_index][1]
text = context[start_char:end_char]
# Compute the score of the answer, being the sum of the start and end
# logits. Intuitively, this indicates how likely the answer is to be
# correct, and allows us to pick the best valid answer.
score = start_logits[start_index] + end_logits[end_index]
# Add the answer to the list of valid answers, if the score is greater
# than the minimum null score
if score > min_null_score:
valid_answers.append(dict(score=score, text=text))
return valid_answers