# Module: scandeval.task_utils.sequence_classification

# docs: module scandeval.task_utils.sequence_classification
"""Utility functions related to the sequence-classification task group."""

import logging
import re
import typing as t

import evaluate
import Levenshtein
import numpy as np
from evaluate import EvaluationModule

from ..data_models import BenchmarkConfig, GenerativeModelOutput
from ..utils import log_once, raise_if_model_output_contains_nan_values

if t.TYPE_CHECKING:
    from ..data_models import DatasetConfig
    from ..types import Labels, Predictions

logger = logging.getLogger("scandeval")

# Strips leading and trailing runs of characters that are not lower-case letters
# (including æøåüöä) from a generated token. Compiled once at import time instead
# of being rebuilt for every token.
_NON_LETTER_EDGES = re.compile(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$")


def compute_metrics(
    model_outputs_and_labels: tuple["Predictions", "Labels"],
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> dict[str, float]:
    """Compute the metrics needed for evaluation.

    Args:
        model_outputs_and_labels:
            The first sequence contains the model outputs and the second sequence
            contains the true labels.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.

    Returns:
        A dictionary with the names of the metrics as keys and the metric values as
        values. A metric is left out if its `compute` call returned None.

    Raises:
        AssertionError:
            If a metric of the task has an empty `huggingface_id`, since no
            evaluation module is loaded for it.
        KeyError:
            If a string prediction or label is not a known (lower-cased) label.
    """
    model_outputs, labels = model_outputs_and_labels
    label2id = {label: idx for idx, label in dataset_config.id2label.items()}

    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    # Metrics without a Hugging Face ID are stored as None; the assertion in the
    # loop below rejects them.
    metrics = {
        metric_cfg.name: (
            evaluate.load(
                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
            )
            if metric_cfg.huggingface_id != ""
            else None
        )
        for metric_cfg in dataset_config.task.metrics
    }

    # Floating-point outputs are treated as per-class scores and reduced to class
    # indices; anything else is used as the predictions directly.
    model_outputs_array = np.asarray(model_outputs)
    if model_outputs_array.dtype in [np.float16, np.float32, np.float64]:
        predictions = model_outputs_array.argmax(axis=-1)
    else:
        predictions = model_outputs

    # String predictions are prompt labels; translate them back to the dataset
    # labels and then to label IDs.
    prompt_label_to_label_mapping = {
        prompt_label: label
        for label, prompt_label in dataset_config.prompt_label_mapping.items()
    }
    predictions = [
        (
            label2id[prompt_label_to_label_mapping[pred.lower()]]
            if isinstance(pred, str)
            else pred
        )
        for pred in predictions
    ]

    label_ids = [
        label2id[label.lower()] if isinstance(label, str) else label
        for label in labels
    ]

    results: dict[str, float] = dict()
    for cfg in dataset_config.task.metrics:
        metric = metrics[cfg.name]
        assert isinstance(metric, EvaluationModule)
        score_dict: dict[str, float] | None = metric.compute(
            predictions=predictions, references=label_ids, **cfg.compute_kwargs
        )

        # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
        if score_dict is not None:
            scores = score_dict[cfg.results_key]
            # A list of scores is collapsed to its mean
            if isinstance(scores, list):
                scores = sum(scores) / len(scores)
            results[cfg.name] = scores

    return results


def extract_labels_from_generation(
    input_batch: dict[str, list],
    model_output: GenerativeModelOutput,
    dataset_config: "DatasetConfig",
) -> list[str]:
    """Extract the predicted labels from the generated output.

    If the model output carries scores (logprobs), these are used to pick the
    labels; otherwise the generated text is matched to the closest candidate label
    by edit distance.

    Args:
        input_batch:
            The input batch, where the keys are the feature names and the values
            are lists with the feature values. Not used by this function.
        model_output:
            The raw generated output of the model.
        dataset_config:
            The configuration of the dataset.

    Returns:
        The predicted labels.
    """
    if model_output.scores is not None:
        return get_closest_logprobs_labels(
            generation_logprobs=model_output.scores, dataset_config=dataset_config
        )
    return get_closest_word_edit_labels(
        generated_sequences=model_output.sequences, dataset_config=dataset_config
    )


def get_closest_logprobs_labels(
    generation_logprobs: list[list[list[tuple[str, float]]]],
    dataset_config: "DatasetConfig",
) -> list[str]:
    """Get the labels with the highest predicted logprob value.

    In case a candidate label is split into multiple tokens, we only use the first
    token to compute the logprob value. E.g., if the candidate label "positive" is
    tokenised as ["pos", "itive"], we only use the logprob value of "pos" to
    represent the logprob value of the entire label.

    For every sample the token positions are scanned in order, and within a
    position the alternative tokens are scanned in the order given. The first
    token which, after stripping non-letter characters from both ends, is a prefix
    of a candidate label determines the output label.
    NOTE(review): this presumably relies on the alternatives of each position being
    ordered from most to least likely - confirm against the producer of the scores.

    If no token of a sample matches any candidate label, the first candidate label
    is used as the output label and a debug message is logged once; no exception is
    raised.

    Args:
        generation_logprobs:
            The logprobs of the generated tokens, for all samples in the batch. Of shape
            (batch_size, num_tokens, num_logprobs).
        dataset_config:
            The configuration of the dataset.

    Returns:
        The predicted labels, one per sample.
    """
    english_labels = list(dataset_config.id2label.values())
    english2local = dataset_config.prompt_label_mapping

    # Lower-cased prompt labels come first, so they win over the English labels
    # when a token is a prefix of both.
    candidate_labels = [
        english2local[lbl].lower() for lbl in english_labels
    ] + english_labels

    output_labels: list[str] = list()
    for sample in generation_logprobs:
        for logprob_list in sample:
            generated_labels = [
                _NON_LETTER_EDGES.sub("", label.lower()) for label, _ in logprob_list
            ]
            generated_labels = [label for label in generated_labels if label != ""]

            # We want to use the first candidate label which starts with one of the
            # generated labels, as the output label
            output_label: str | None = None
            for generated_label in generated_labels:
                output_label = next(
                    (
                        candidate_label
                        for candidate_label in candidate_labels
                        if candidate_label.startswith(generated_label)
                    ),
                    None,
                )
                if output_label is not None:
                    break

            if output_label is not None:
                # English labels are translated to their prompt label; anything
                # that is not a key of the mapping is kept as it is
                output_label = english2local.get(output_label, output_label)
                output_labels.append(output_label)
                break
        else:
            # No token position of the sample produced a match
            if len(sample) == 0:
                log_once(
                    "The model outputted an empty string, so no candidate labels could "
                    f"be determined. Using {candidate_labels[0]!r} as the output "
                    "label.",
                    level=logging.DEBUG,
                )
            else:
                log_once(
                    "Could not find a candidate label for any of the generated "
                    f"labels in the sample {sample}. Using {candidate_labels[0]!r} "
                    "as the output label.",
                    level=logging.DEBUG,
                )
            output_labels.append(candidate_labels[0])

    assert len(output_labels) == len(generation_logprobs)
    return output_labels


def get_closest_word_edit_labels(
    generated_sequences: list[str], dataset_config: "DatasetConfig"
) -> list[str]:
    """Get the labels with the smallest edit distance to the predicted labels.

    The comparison is case-insensitive, and ties are resolved in favour of the
    candidate label that comes first.

    Args:
        generated_sequences:
            The generated sequences from the model.
        dataset_config:
            The configuration of the dataset.

    Returns:
        The candidate labels with the smallest edit distance to the predicted labels.
    """
    candidate_labels = [
        dataset_config.prompt_label_mapping[lbl]
        for lbl in dataset_config.id2label.values()
    ]
    # Lower-case the candidates once rather than once per generated sequence
    lowered_candidate_labels = [label.lower() for label in candidate_labels]

    new_predicted_labels: list[str] = list()
    for predicted_label in generated_sequences:
        lowered_predicted_label = predicted_label.lower()
        edit_distances = [
            Levenshtein.distance(s1=lowered_predicted_label, s2=lowered_candidate_label)
            for lowered_candidate_label in lowered_candidate_labels
        ]
        closest_label = candidate_labels[np.argmin(edit_distances).item()]
        new_predicted_labels.append(closest_label)
    return new_predicted_labels