scandeval.generation

"""Functions related to text generation of models."""

import logging
import sys
import typing as t
from pathlib import Path

import more_itertools as mit
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm

from .benchmark_modules import BenchmarkModule
from .enums import BatchingPreference, TaskGroup
from .exceptions import InvalidBenchmark
from .model_cache import (
    ModelCache,
    load_cached_model_outputs,
    split_dataset_into_cached_and_non_cached,
)
from .utils import clear_memory

if t.TYPE_CHECKING:
    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

logger = logging.getLogger("scandeval")


def generate(
    model: "BenchmarkModule",
    datasets: list[DatasetDict],
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> list[dict[str, float]]:
    """Evaluate a model on a dataset through generation.

    Args:
        model:
            The model to evaluate.
        datasets:
            The datasets to evaluate on.
        model_config:
            The configuration of the model.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.

    Returns:
        A list of dictionaries containing the test scores, one per iteration.
    """
    # Set up the name of the model output cache. When running tests we save the model
    # outputs to a separate cache and delete that cache before the next test, so that
    # the tests remain independent of each other
    if benchmark_config.debug:
        model_cache_dir = Path.cwd()
    else:
        model_cache_dir = Path(model_config.model_cache_dir)
    if hasattr(sys, "_called_from_test"):
        cache_name = f"{dataset_config.name}-model-outputs-test.json"
        (model_cache_dir / cache_name).unlink(missing_ok=True)
    elif benchmark_config.debug:
        cache_name = f"{model_config.model_id}-{dataset_config.name}-model-outputs.json"
    else:
        cache_name = f"{dataset_config.name}-model-outputs.json"

    cache = ModelCache(
        model_cache_dir=model_cache_dir,
        cache_name=cache_name,
        max_generated_tokens=dataset_config.max_generated_tokens,
    )

    scores: list[dict[str, float]] = list()
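    # Run the requested number of benchmarking iterations, evaluating the model on
    # the test split in each iteration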
    for idx in tqdm(
        iterable=range(benchmark_config.num_iterations),
        desc="Benchmarking",
        disable=not benchmark_config.progress_bar,
    ):
        test_scores = generate_single_iteration(
            model=model,
            dataset=datasets[idx]["test"],
            cache=cache,
            dataset_config=dataset_config,
            benchmark_config=benchmark_config,
        )

        logger.debug(f"Test scores for iteration {idx}: {test_scores}")
        scores.append(test_scores)
        clear_memory()

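    # Remove the model output cache unless we are debugging, in which case it is
    # kept for inspection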
    if not benchmark_config.debug:
        cache.remove()

    return scores


def generate_single_iteration(
    dataset: Dataset,
    model: "BenchmarkModule",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
    cache: ModelCache,
) -> dict[str, float]:
    """Evaluate a model on a dataset in a single iteration through generation.

    Args:
        dataset:
            The dataset to evaluate on.
        model:
            The model to evaluate.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.
        cache:
            The model output cache.

    Returns:
        A dictionary containing the scores for each metric.
    """
    cache.load()

    # Split up the dataset into a cached and non-cached part
    cached_dataset, non_cached_dataset = split_dataset_into_cached_and_non_cached(
        dataset=dataset, cache=cache
    )

    all_preds: list[str] = list()

    if len(non_cached_dataset) > 0:
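        # Set up the iterator over the non-cached examples, based on the model's
        # batching preference: one sample at a time, the whole dataset at once, or
        # fixed-size batches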
        match model.batching_preference:
            case BatchingPreference.SINGLE_SAMPLE:
                itr = tqdm(iterable=non_cached_dataset, leave=False)
            case BatchingPreference.ALL_AT_ONCE:
                itr = [non_cached_dataset[:]]
            case _:
                num_batches = len(non_cached_dataset) // benchmark_config.batch_size
                if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                    num_batches += 1
                itr = tqdm(
                    iterable=mit.batched(
                        iterable=non_cached_dataset, n=benchmark_config.batch_size
                    ),
                    total=num_batches,
                )

        # Generate the completions for the non-cached examples
        for batch in itr:
            assert isinstance(batch, dict)

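            # A batch can be a single sample rather than lists of samples; if so,
            # wrap each value in a list so the rest of the loop always sees a batch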
            single_sample_batch = (
                "text" in batch and isinstance(batch["text"], str)
            ) or ("messages" in batch and isinstance(batch["messages"][0], dict))
            if single_sample_batch:
                batch = {key: [value] for key, value in batch.items()}

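            # Generate completions for the batch and convert the model output into
            # labels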
            model_output = model.generate(inputs=batch)
            extracted_labels = model.extract_labels_from_generation(
                input_batch=batch, model_output=model_output
            )

            # Extended logging if we are running in debug mode
            if benchmark_config.debug:
                debug_log(
                    batch=batch,
                    extracted_labels=extracted_labels,  # type: ignore[arg-type]
                    dataset_config=dataset_config,
                )

            cache.add_to_cache(model_inputs=batch, model_output=model_output)
            all_preds.extend(extracted_labels)

            # When debugging we save the cache after every batch; this slows down
            # evaluation, so we do not do it by default
            if benchmark_config.debug:
                cache.save()

        if isinstance(itr, tqdm):
            itr.close()

        # Store the cache to disk
        cache.save()

    # Fetch the cached predictions for the cached examples
    if len(cached_dataset) > 0:
        model_output = load_cached_model_outputs(
            cached_dataset=cached_dataset, cache=cache
        )
        extracted_labels = model.extract_labels_from_generation(
            input_batch=cached_dataset[:], model_output=model_output
        )
        all_preds.extend(extracted_labels)

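    # Assemble the ground-truth labels from the non-cached and cached parts of the
    # dataset, lower-casing string labels for comparison with the predictions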
    if "label" in non_cached_dataset.column_names:
        ground_truth = [
            label.lower() if isinstance(label, str) else label
            for label in non_cached_dataset["label"] + cached_dataset["label"]
        ]
    elif "labels" in non_cached_dataset.column_names:
        ground_truth = [
            [label.lower() if isinstance(label, str) else label for label in label_list]
            for label_list in non_cached_dataset["labels"] + cached_dataset["labels"]
        ]
    elif "target_text" in non_cached_dataset.column_names:
        ground_truth = non_cached_dataset["target_text"] + cached_dataset["target_text"]
    else:
        raise ValueError(
            "The dataset must have either a 'label', 'labels', or 'target_text' column"
        )

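    # Compute the metrics from the predictions and the ground-truth labels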
    itr_scores: dict[str, float] = model.compute_metrics(
        model_outputs_and_labels=(all_preds, ground_truth)
    )

    return itr_scores


def debug_log(
    batch: dict[str, t.Any],
    extracted_labels: list[dict | str | list[str]],
    dataset_config: "DatasetConfig",
) -> None:
    """Log inputs and outputs for debugging purposes.

    Args:
        batch:
            The batch of examples to evaluate on.
        extracted_labels:
            The extracted labels from the model output.
        dataset_config:
            The configuration of the dataset.
    """
    match dataset_config.task.task_group:
        case TaskGroup.TOKEN_CLASSIFICATION:
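            # Log each sample as aligned columns of tokens, predicted tags and gold
            # tags, in chunks of 10 tokens per line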
            log_msgs = [""]
            for tokens, predictions, labels in zip(
                batch["tokens"], extracted_labels, batch["labels"]
            ):
                predictions = [tag.upper() for tag in predictions]
                sample = list(zip(tokens, predictions, labels))
                log_batches = [
                    [("Tokens: ", "Predictions: ", "Labels: ")] + sample[i : i + 10]
                    for i in range(0, len(sample), 10)
                ]
                for log_batch in log_batches:
                    lengths = [len(max(triple, key=len)) for triple in log_batch]
                    log_batch = [
                        [f"{x:<{length}}" for x in triple]
                        for triple, length in zip(log_batch, lengths)
                    ]
                    tokens = [triple[0] for triple in log_batch]
                    predictions = [triple[1] for triple in log_batch]
                    labels = [triple[2] for triple in log_batch]
                    log_msgs.append(
                        "\t".join(tokens)
                        + "\n"
                        + "\t".join(predictions)
                        + "\n"
                        + "\t".join(labels)
                    )
            logger.info("\n\n".join(log_msgs))
            return

        case (
            TaskGroup.SEQUENCE_CLASSIFICATION
            | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
        ):
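            # Map the gold labels to their prompt formulations and lower-case them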
            labels = [
                dataset_config.prompt_label_mapping.get(label, label).lower()
                for label in batch["label"]
            ]

        case TaskGroup.QUESTION_ANSWERING:
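            # Log the predicted answer texts against the first gold answer of each
            # example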
            extracted_labels = [
                prediction["prediction_text"]
                for prediction in extracted_labels
                if isinstance(prediction, dict)
            ]
            labels = [label["answers"]["text"][0] for label in batch["label"]]

        case TaskGroup.TEXT_TO_TEXT:
            labels = batch["target_text"]

        case _:
            raise InvalidBenchmark(
                f"The task group '{dataset_config.task.task_group}' is not supported."
            )

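    # Determine the input texts to log: the content of the final message for
    # chat-formatted batches, otherwise the raw input text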
    if "messages" in batch:
        input_texts = [messages[-1]["content"] for messages in batch["messages"]]
    else:
        input_texts = batch["text"]

    for input_text, prediction, label in zip(input_texts, extracted_labels, labels):
        logger.info(
            f"Input: '{input_text}'\nPrediction: '{prediction}'\nLabel: '{label}'"
        )