scandeval.task_utils.multiple_choice_classification

docs module scandeval.task_utils.multiple_choice_classification
"""Utility functions related to the multiple-choice classification task group."""importhashlibimportloggingimportreimporttypingastfromcollectionsimportdefaultdictimportnumpyasnpfromdatasetsimportDatasetfromtransformersimportBatchEncoding,PreTrainedTokenizer,Trainerift.TYPE_CHECKING:from..typesimportLabels,Predictionslogger=logging.getLogger("scandeval")classMultipleChoiceClassificationTrainer(Trainer):docs
    """Trainer subclass for question answering tasks."""defevaluate(docs
self,eval_dataset:"Dataset | None"=None,ignore_keys:list[str]|None=None,metric_key_prefix:str="eval",)->dict[str,float]|None:        """Evaluate the model on the given dataset.        Args:            eval_dataset:                The dataset to evaluate on. If None, then use the stored evaluation                dataset.            ignore_keys:                The keys to ignore when computing the metrics.            metric_key_prefix:                The prefix to use for the metric keys.        Returns:            The metrics computed on the evaluation dataset.        """eval_dataloader=self.get_eval_dataloader(eval_dataset)eval_loop=(self.prediction_loopifself.args.use_legacy_prediction_loopelseself.evaluation_loop)output=eval_loop(eval_dataloader,description="Evaluation",prediction_loss_only=None,ignore_keys=ignore_keys,metric_key_prefix=metric_key_prefix,)ifmetric_key_prefix=="test":preds_and_labels=postprocess_predictions_and_labels(predictions=output.predictions,dataset=eval_dataset)output.metrics.update(self.compute_metrics(preds_and_labels))# Prefix all keys with metric_key_prefix + '_'forkeyinlist(output.metrics.keys()):ifnotkey.startswith(f"{metric_key_prefix}_"):output.metrics[f"{metric_key_prefix}_{key}"]=output.metrics.pop(key)# Only the main node log the results by defaultifself.args.should_log:self.log(output.metrics)self.control=self.callback_handler.on_evaluate(self.args,self.state,self.control,# type: ignore[has-type]output.metrics,)returnoutput.metricsdefprepare_examples(docs
examples:"BatchEncoding",tokenizer:"PreTrainedTokenizer")->"BatchEncoding":    """Prepare the features.    Args:        examples:            The examples to prepare.        tokenizer:            The tokenizer to use to prepare the examples.    Returns:        The prepared examples.    """doc:str=examples["text"][0]sections=doc.split("\n")choice_idxs=[idxforidx,sectioninenumerate(sections)ifre.match(pattern=r"^[a-e]\. ",string=section)isnotNone]choices=[sections[idx]foridxinchoice_idxs]# Check that the choices are present, and that all of them are at the endassertlen(choices)>0,"No choices found in the document."assertall(choice_idx==len(sections)-ifori,choice_idxinenumerate(sorted(choice_idxs,reverse=True),start=1)),"Choices are not at the end of the document."question_idx=min(choice_idxs)-2# -2 to remove the 'Choices:' linecontext_and_question="\n".join(sections[:question_idx+1]).strip()new_examples=tokenizer(text=[context_and_question]*len(choices),text_pair=[choice[3:]forchoiceinchoices],padding=True,truncation=True,)new_examples["label"]=[int(choice.startswith(f"{letter}. ")andletter==examples["label"][0])forletter,choiceinzip("abcde",choices)]new_examples["id"]=[hashlib.md5(string=doc.encode()).hexdigest()]*len(choices)returnnew_examplesdefpostprocess_predictions_and_labels(docs
predictions:np.ndarray,dataset:"Dataset")->tuple["Predictions","Labels"]:    """Postprocess the predictions and labels.    Args:        predictions:            The model predictions, of shape (num_examples, 2).        dataset:            The dataset containing the examples.    Returns:        The postprocessed predictions and labels.    """mapping={0:"a",1:"b",2:"c",3:"d",4:"e"}all_predictions:list[str]=list()all_labels:list[str]=list()pred_label_dict=defaultdict(list)forpred_arr,exampleinzip(predictions,dataset):pred_label_dict[example["id"]].append((pred_arr[1],example["label"]))# Compute the final predictions and labelsforid_inset(dataset["id"]):preds,labels=zip(*pred_label_dict[id_])# Some IDs appear multiple times in the dataset, since we are bootstrapping.# Here we separate them into their respective groups.assert(len(labels)%sum(labels)==0),"The number of labels is not divisible by the sum of the labels."group_size=len(labels)//sum(labels)preds_groups=[preds[i:i+group_size]foriinrange(0,len(preds),group_size)]labels_groups=[labels[i:i+group_size]foriinrange(0,len(labels),group_size)]forpreds_group,labels_groupinzip(preds_groups,labels_groups):prediction:str=mapping[np.argmax(preds_group).item()]label:str=mapping[np.argmax(labels_group).item()]all_predictions.append(prediction)all_labels.append(label)returnall_predictions,all_labels