scandeval.task_utils.text_to_text

"""Utility functions related to the text-to-text task group."""

import logging
import typing as t

import evaluate
import numpy as np
from evaluate import EvaluationModule

from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
from ..exceptions import InvalidBenchmark
from ..utils import (
    HiddenPrints,
    clear_memory,
    raise_if_model_output_contains_nan_values,
)

if t.TYPE_CHECKING:
    from ..types import Labels, Predictions


logger = logging.getLogger("scandeval")


def compute_metrics(
    model_outputs_and_labels: tuple["Predictions", "Labels"],
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> dict[str, float]:
    """Compute the metrics needed for evaluation.

    Args:
        model_outputs_and_labels:
            The first sequence contains the model outputs and the second sequence
            contains the true labels.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.

    Returns:
        A dictionary with the names of the metrics as keys and the metric values as
        values.
    """
    model_outputs, labels = model_outputs_and_labels
    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    metrics = {
        metric_cfg.name: (
            evaluate.load(
                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
            )
            if metric_cfg.huggingface_id != ""
            else None
        )
        for metric_cfg in dataset_config.task.metrics
    }

    model_output_dtype = np.asarray(model_outputs).dtype
    output_is_prob = model_output_dtype in [np.float16, np.float32, np.float64]
    if output_is_prob:
        predictions = np.asarray(model_outputs).argmax(axis=-1)
    else:
        predictions = model_outputs

    results: dict[str, float] = dict()
    for cfg in dataset_config.task.metrics:
        metric = metrics[cfg.name]
        assert isinstance(metric, EvaluationModule)

        # Some metrics can be computed on hardware accelerators. In this case we
        # start by setting the device to the same device as the model
        if cfg.compute_kwargs.get("device", None) == "auto":
            cfg.compute_kwargs["device"] = benchmark_config.device.type

        while True:
            try:
                with HiddenPrints():
                    score_dict: dict[str, float] | None = metric.compute(
                        predictions=predictions, references=labels, **cfg.compute_kwargs
                    )

                # Clear the cache of the BERTScorer to avoid memory leaks
                for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                    if hasattr(metric, attribute):
                        delattr(metric, attribute)

                clear_memory()
                break
            except Exception as e:
                # Clear the cache of the BERTScorer to avoid memory leaks
                if hasattr(metric, "cached_bertscorer"):
                    del metric.cached_bertscorer
                    clear_memory()

                oom_error = [
                    "CUDA out of memory",
                    "CUDA error",
                    "MPS backend out of memory",
                ]
                if not any(error in str(e) for error in oom_error):
                    raise InvalidBenchmark(str(e))

                if cfg.compute_kwargs.get("batch_size", 1) > 1:
                    batch_size = cfg.compute_kwargs["batch_size"]
                    cfg.compute_kwargs["batch_size"] = batch_size // 2
                    logger.debug(
                        "Out of memory error occurred during the computation of "
                        f"the metric {cfg.pretty_name}. Reducing the batch size to "
                        f"{cfg.compute_kwargs['batch_size']}."
                    )
                elif cfg.compute_kwargs.get("device", "cpu") != "cpu":
                    cfg.compute_kwargs["batch_size"] = 32
                    cfg.compute_kwargs["device"] = "cpu"
                    logger.debug(
                        "Out of memory error occurred during the computation of "
                        f"the metric {cfg.pretty_name}. Moving the computation to "
                        "the CPU."
                    )
                else:
                    raise InvalidBenchmark(str(e))

        # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
        if score_dict is not None:
            scores = score_dict[cfg.results_key]
            if isinstance(scores, list):
                scores = sum(scores) / len(scores)
            results[cfg.name] = scores

    return results


def extract_labels_from_generation(
    input_batch: dict[str, list], model_output: "GenerativeModelOutput"
) -> list[t.Any]:
    """Extract the predicted labels from the generated output.

    Args:
        input_batch:
            The input batch, where the keys are the feature names and the values
            are lists with the feature values.
        model_output:
            The raw generated output of the model.

    Returns:
        The predicted labels.
    """
    return model_output.sequences
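
As a usage illustration, the sketch below shows how compute_metrics might be invoked once predictions, references and the relevant configuration objects are available. It is a minimal sketch, not part of this module: the dataset_config and benchmark_config variables are assumed to be supplied by the surrounding benchmarking pipeline, and the concrete metric names in the output depend on which metrics the dataset's task declares.

# Minimal usage sketch (illustrative only). The dataset_config and
# benchmark_config objects are assumed to come from the benchmarking
# pipeline; they are not constructed by this module.
from scandeval.task_utils.text_to_text import compute_metrics

# Model outputs and gold references for a text-to-text task, e.g. summarisation.
predictions = ["A short summary of the document.", "Another generated summary."]
references = ["The reference summary of the document.", "A second reference summary."]

scores = compute_metrics(
    model_outputs_and_labels=(predictions, references),
    dataset_config=dataset_config,      # assumed DatasetConfig instance
    benchmark_config=benchmark_config,  # assumed BenchmarkConfig instance
)

# `scores` maps metric names to floats, e.g. {"bertscore": ..., "rouge_l": ...},
# depending on the metrics defined for the task.
print(scores)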