scandeval.benchmarker

docs module scandeval.benchmarker
"""Class that benchmarks Scandinavian language models."""

import json
import logging
import re
import sys
import typing as t
from copy import deepcopy
from pathlib import Path
from shutil import rmtree
from time import sleep

from torch.distributed import destroy_process_group

from .benchmark_config_factory import build_benchmark_config
from .constants import GENERATIVE_PIPELINE_TAGS
from .data_loading import load_data
from .data_models import BenchmarkConfigParams, BenchmarkResult
from .dataset_configs import get_all_dataset_configs
from .enums import Device, ModelType
from .exceptions import InvalidBenchmark, InvalidModel
from .finetuning import finetune
from .generation import generate
from .model_config import get_model_config
from .model_loading import load_model
from .scores import log_scores
from .speed_benchmark import benchmark_speed
from .tasks import SPEED
from .utils import enforce_reproducibility

if t.TYPE_CHECKING:
    from .benchmark_modules import BenchmarkModule
    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig


logger = logging.getLogger("scandeval")


class Benchmarker:
    """Benchmarking all the Scandinavian language models.

    Attributes:
        benchmark_config_default_params:
            The default parameters for the benchmark configuration.
        benchmark_config:
            The benchmark configuration.
        force:
            Whether to force evaluations of models, even if they have been benchmarked
            already.
        results_path:
            The path to the results file.
        benchmark_results:
            The benchmark results.
    """

    def __init__(
        self,
        progress_bar: bool = True,
        save_results: bool = True,
        task: str | list[str] | None = None,
        dataset: list[str] | str | None = None,
        language: str | list[str] = "all",
        model_language: str | list[str] | None = None,
        dataset_language: str | list[str] | None = None,
        device: Device | None = None,
        batch_size: int = 32,
        raise_errors: bool = False,
        cache_dir: str = ".scandeval_cache",
        api_key: str | None = None,
        force: bool = False,
        verbose: bool = False,
        trust_remote_code: bool = False,
        use_flash_attention: bool | None = None,
        clear_model_cache: bool = False,
        evaluate_test_split: bool = False,
        few_shot: bool = True,
        num_iterations: int = 10,
        api_base: str | None = None,
        api_version: str | None = None,
        debug: bool = False,
        run_with_cli: bool = False,
        only_allow_safetensors: bool = False,
    ) -> None:
        """Initialise the benchmarker.

        Args:
            progress_bar:
                Whether progress bars should be shown. Defaults to True.
            save_results:
                Whether to save the benchmark results to
                'scandeval_benchmark_results.jsonl'. Defaults to True.
            task:
                The tasks to benchmark the model(s) on. Mutually exclusive with
                `dataset`. If both `task` and `dataset` are None then all datasets
                will be benchmarked.
            dataset:
                The datasets to benchmark on. Mutually exclusive with `task`. If both
                `task` and `dataset` are None then all datasets will be benchmarked.
            language:
                The language codes of the languages to include, both for models and
                datasets. Set this to 'all' if all languages should be considered.
                Defaults to "all".
            model_language:
                The language codes of the languages to include for models. If specified
                then this overrides the `language` parameter for model languages.
                Defaults to None.
            dataset_language:
                The language codes of the languages to include for datasets. If
                specified then this overrides the `language` parameter for dataset
                languages. Defaults to None.
            device:
                The device to use for benchmarking. Defaults to None.
            batch_size:
                The batch size to use. Defaults to 32.
            raise_errors:
                Whether to raise errors instead of skipping the model evaluation.
                Defaults to False.
            cache_dir:
                Directory to store cached models. Defaults to '.scandeval_cache'.
            api_key:
                The API key to use for a given inference API.
            force:
                Whether to force evaluations of models, even if they have been
                benchmarked already. Defaults to False.
            verbose:
                Whether to output additional output. This is automatically set if
                `debug` is True. Defaults to False.
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to False.
            use_flash_attention:
                Whether to use Flash Attention. If None then it will be used if it is
                installed and the model is a decoder model. Defaults to None.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model.
                Defaults to False.
            evaluate_test_split:
                Whether to evaluate the test split of the datasets. Defaults to False.
            few_shot:
                Whether to only evaluate the model using few-shot evaluation. Only
                relevant if the model is generative. Defaults to True.
            num_iterations:
                The number of times each model should be evaluated. This is only meant
                to be used for power users, and scores will not be allowed on the
                leaderboards if this is changed. Defaults to 10.
            api_base:
                The base URL for a given inference API. Only relevant if `model` refers
                to a model on an inference API. Defaults to None.
            api_version:
                The version of the API to use. Defaults to None.
            debug:
                Whether to output debug information. Defaults to False.
            run_with_cli:
                Whether the benchmarker is being run from the command-line interface.
                Defaults to False.
            only_allow_safetensors:
                Whether to only allow models that use the safetensors format. Defaults
                to False.

        Raises:
            ValueError:
                If both `task` and `dataset` are specified.
        """
        if task is not None and dataset is not None:
            raise ValueError("Only one of `task` and `dataset` can be specified.")

        # These defaults are kept around so that `benchmark` can override individual
        # values per call without mutating the benchmarker itself
        self.benchmark_config_default_params = BenchmarkConfigParams(
            progress_bar=progress_bar,
            save_results=save_results,
            task=task,
            dataset=dataset,
            language=language,
            model_language=model_language,
            dataset_language=dataset_language,
            device=device,
            batch_size=batch_size,
            raise_errors=raise_errors,
            cache_dir=cache_dir,
            api_key=api_key,
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
            use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
            num_iterations=num_iterations,
            api_base=api_base,
            api_version=api_version,
            debug=debug,
            run_with_cli=run_with_cli,
            only_allow_safetensors=only_allow_safetensors,
        )

        self.benchmark_config = build_benchmark_config(
            first_time=True, **self.benchmark_config_default_params.model_dump()
        )

        # Initialise variable storing model lists, so we only have to fetch it once
        self._model_lists: dict[str, list[str]] | None = None

        self.results_path = Path.cwd() / "scandeval_benchmark_results.jsonl"
        adjust_logging_level(verbose=self.benchmark_config.verbose)

    @property
    def benchmark_results(self) -> list[BenchmarkResult]:
        """The benchmark results.

        The results file is re-read on every access, so the returned list always
        reflects what is currently stored at `results_path`. Blank lines in the file
        are ignored, and an empty list is returned if the file does not exist.
        """
        if not self.results_path.exists():
            return []
        with self.results_path.open() as f:
            return [
                BenchmarkResult.from_dict(json.loads(line))
                for line in f
                if line.strip()
            ]

    def benchmark(
        self,
        model: list[str] | str,
        task: str | list[str] | None = None,
        dataset: list[str] | str | None = None,
        progress_bar: bool | None = None,
        save_results: bool | None = None,
        language: str | list[str] | None = None,
        model_language: str | list[str] | None = None,
        dataset_language: str | list[str] | None = None,
        device: Device | None = None,
        batch_size: int | None = None,
        raise_errors: bool | None = None,
        cache_dir: str | None = None,
        api_key: str | None = None,
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
        use_flash_attention: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
        num_iterations: int | None = None,
        only_allow_safetensors: bool | None = None,
    ) -> list[BenchmarkResult]:
        """Benchmarks models on datasets.

        Args:
            model:
                The full Hugging Face Hub path(s) to the pretrained transformer model.
                The specific model version to use can be added after the suffix '@':
                "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
                and defaults to the latest version if not specified.
            task:
                The tasks to benchmark the model(s) on. Mutually exclusive with
                `dataset`. If both `task` and `dataset` are None then all datasets
                will be benchmarked. Defaults to None.
            dataset:
                The datasets to benchmark on. Mutually exclusive with `task`. If both
                `task` and `dataset` are None then all datasets will be benchmarked.
                Defaults to None.
            progress_bar:
                Whether progress bars should be shown. Defaults to the value specified
                when initialising the benchmarker.
            save_results:
                Whether to save the benchmark results to
                'scandeval_benchmark_results.jsonl'. Defaults to the value specified
                when initialising the benchmarker.
            language:
                The language codes of the languages to include, both for models and
                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
                to 'all' if all languages (also non-Scandinavian) should be considered.
                Defaults to the value specified when initialising the benchmarker.
            model_language:
                The language codes of the languages to include for models. If specified
                then this overrides the `language` parameter for model languages.
                Defaults to the value specified when initialising the benchmarker.
            dataset_language:
                The language codes of the languages to include for datasets. If
                specified then this overrides the `language` parameter for dataset
                languages. Defaults to the value specified when initialising the
                benchmarker.
            device:
                The device to use for benchmarking. Defaults to the value specified
                when initialising the benchmarker.
            batch_size:
                The batch size to use. Defaults to the value specified when
                initialising the benchmarker.
            raise_errors:
                Whether to raise errors instead of skipping the model evaluation.
                Defaults to the value specified when initialising the benchmarker.
            cache_dir:
                Directory to store cached models. Defaults to the value specified when
                initialising the benchmarker.
            api_key:
                The API key to use for a given inference server. Defaults to the value
                specified when initialising the benchmarker.
            force:
                Whether to force evaluations of models, even if they have been
                benchmarked already. Defaults to the value specified when initialising
                the benchmarker.
            verbose:
                Whether to output additional output. Defaults to the value specified
                when initialising the benchmarker.
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to the value
                specified when initialising the benchmarker.
            use_flash_attention:
                Whether to use Flash Attention. Defaults to the value specified when
                initialising the benchmarker.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model.
                Defaults to the value specified when initialising the benchmarker.
            evaluate_test_split:
                Whether to evaluate the test split of the datasets. Defaults to the
                value specified when initialising the benchmarker.
            few_shot:
                Whether to only evaluate the model using few-shot evaluation. Only
                relevant if the model is generative. Defaults to the value specified
                when initialising the benchmarker.
            num_iterations:
                The number of times each model should be evaluated. This is only meant
                to be used for power users, and scores will not be allowed on the
                leaderboards if this is changed. Defaults to the value specified when
                initialising the benchmarker.
            only_allow_safetensors:
                Whether to only allow models that use the safetensors format. Defaults
                to the value specified when initialising the benchmarker.

        Returns:
            A list of benchmark results.

        Raises:
            ValueError:
                If both `task` and `dataset` are specified.
        """
        if task is not None and dataset is not None:
            raise ValueError("Only one of `task` and `dataset` can be specified.")

        benchmark_config = self._get_updated_benchmark_config(
            task=task,
            dataset=dataset,
            progress_bar=progress_bar,
            save_results=save_results,
            language=language,
            model_language=model_language,
            dataset_language=dataset_language,
            device=device,
            batch_size=batch_size,
            raise_errors=raise_errors,
            cache_dir=cache_dir,
            api_key=api_key,
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
            use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
            num_iterations=num_iterations,
            only_allow_safetensors=only_allow_safetensors,
        )

        adjust_logging_level(verbose=benchmark_config.verbose)

        if benchmark_config.clear_model_cache:
            clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

        model_ids = self._prepare_model_ids(model_id=model)
        dataset_configs = prepare_dataset_configs(
            dataset_names=benchmark_config.datasets
        )

        current_benchmark_results: list[BenchmarkResult] = []
        for m_id in model_ids:
            try:
                model_config = get_model_config(
                    model_id=m_id, benchmark_config=benchmark_config
                )
            except InvalidModel as e:
                logger.info(e.message)
                continue

            loaded_model: BenchmarkModule | None = None
            for dataset_config in dataset_configs:
                # Skip if we have already benchmarked this model on this dataset and
                # we are not forcing the benchmark
                if not benchmark_config.force and model_has_been_benchmarked(
                    model_id=m_id,
                    dataset=dataset_config.name,
                    few_shot=benchmark_config.few_shot,
                    validation_split=not benchmark_config.evaluate_test_split,
                    benchmark_results=self.benchmark_results,
                ):
                    logger.debug(
                        f"Skipping benchmarking {m_id} on {dataset_config.pretty_name},"
                        " as it has already been benchmarked."
                    )
                    continue

                # We do not re-initialise generative models as their architecture is not
                # customised to specific datasets
                if model_config.task in GENERATIVE_PIPELINE_TAGS:
                    initial_logging(
                        model_config=model_config,
                        dataset_config=dataset_config,
                        benchmark_config=benchmark_config,
                    )
                    if loaded_model is None:
                        logger.info("Loading model...")
                        try:
                            loaded_model = load_model(
                                model_config=model_config,
                                dataset_config=dataset_config,
                                benchmark_config=benchmark_config,
                            )
                        except InvalidModel as e:
                            if benchmark_config.raise_errors:
                                raise
                            logger.info(e.message)
                            break
                    else:
                        loaded_model.dataset_config = dataset_config

                # Benchmark a single model on a single dataset
                benchmark_output_or_err = self._benchmark_single(
                    model=loaded_model,
                    model_config=model_config,
                    dataset_config=dataset_config,
                    benchmark_config=benchmark_config,
                )

                # Any returned error is re-raised when the user asked for errors to be
                # raised, so the branches below only run when errors are skipped
                if (
                    isinstance(benchmark_output_or_err, Exception)
                    and benchmark_config.raise_errors
                ):
                    raise benchmark_output_or_err

                # An invalid benchmark only affects this dataset, so move on to the
                # next dataset
                if isinstance(benchmark_output_or_err, InvalidBenchmark):
                    logger.info(
                        f"{m_id} could not be benchmarked on "
                        f"{dataset_config.pretty_name}. Skipping. The error message "
                        f"raised was {benchmark_output_or_err.message!r}."
                    )
                    continue

                # An invalid model affects all datasets, so move on to the next model
                if isinstance(benchmark_output_or_err, InvalidModel):
                    logger.info(benchmark_output_or_err.message)
                    break

                record = benchmark_output_or_err
                current_benchmark_results.append(record)
                if benchmark_config.save_results:
                    record.append_to_results(results_path=self.results_path)

            if benchmark_config.clear_model_cache:
                clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

        # This avoids the following warning at the end of the benchmarking:
        #   Warning: WARNING: process group has NOT been destroyed before we destruct
        #   ProcessGroupNCCL. On normal program exit, the application should call
        #   destroy_process_group to ensure that any pending NCCL operations have
        #   finished in this process. In rare cases this process can exit before this
        #   point and block the progress of another member of the process group. This
        #   constraint has always been present,  but this warning has only been added
        #   since PyTorch 2.4 (function operator())
        try:
            destroy_process_group()
        except AssertionError:
            # Deliberately ignored: the call is best-effort cleanup only
            pass

        return current_benchmark_results

    def _get_updated_benchmark_config(self, **kwargs) -> "BenchmarkConfig":
        """Get an updated benchmark configuration.

        Values that are None, or whose key is not a field of the default parameters,
        are ignored. Setting `task` clears `dataset` and vice versa, since the two are
        mutually exclusive.

        Args:
            **kwargs:
                The new parameters for the benchmark configuration.

        Returns:
            The updated benchmark configuration.
        """
        # Work on a copy so that per-call overrides never leak into the defaults
        benchmark_config_params = deepcopy(self.benchmark_config_default_params)
        for key, value in kwargs.items():
            if value is None or not hasattr(benchmark_config_params, key):
                continue
            setattr(benchmark_config_params, key, value)
            if key == "task":
                benchmark_config_params.dataset = None
            elif key == "dataset":
                benchmark_config_params.task = None
        return build_benchmark_config(**benchmark_config_params.model_dump())

    def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
        """Prepare the model ID(s) to be benchmarked.

        Trailing spaces and slashes are stripped from the model IDs, and the IDs
        that already appear in the benchmark results are moved to the front, keeping
        the relative order of the IDs otherwise unchanged.

        Args:
            model_id:
                The model ID(s) of the models to benchmark.

        Returns:
            The prepared list of model IDs.
        """
        raw_model_ids = [model_id] if isinstance(model_id, str) else model_id

        # Normalise before comparing with the stored results, as otherwise an ID with
        # a trailing slash would never be recognised as already benchmarked
        model_ids = [m_id.rstrip(" /") for m_id in raw_model_ids]

        # Stored model names can carry a parenthesised suffix, which is removed to
        # recover the bare model ID
        benchmarked_model_ids = {
            re.sub(r"\(.+\)", "", record.model).strip()
            for record in self.benchmark_results
        }

        # Reorder the `model_ids` list to include the ones present in the benchmark
        # results first. `sorted` is stable and False sorts before True
        return sorted(model_ids, key=lambda m_id: m_id not in benchmarked_model_ids)

    def _benchmark_single(
        self,
        model: "BenchmarkModule | None",
        model_config: "ModelConfig",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
        """Benchmark a single model on a single dataset.

        Args:
            model:
                The model to benchmark, or None if it has not been loaded yet.
            model_config:
                The configuration of the model we are evaluating.
            dataset_config:
                The configuration of the dataset we are evaluating on.
            benchmark_config:
                The general benchmark configuration.

        Returns:
            The benchmark result, or an error if the benchmark was unsuccessful.

        Raises:
            RuntimeError:
                If the benchmark failed because the `PYTORCH_ENABLE_MPS_FALLBACK`
                environment variable is not set.
            InvalidBenchmark, InvalidModel:
                If the benchmark failed and `benchmark_config.raise_errors` is set.
        """
        if model is None:
            initial_logging(
                model_config=model_config,
                dataset_config=dataset_config,
                benchmark_config=benchmark_config,
            )

        # Loop until the benchmark either succeeds or fails for a reason that a retry
        # cannot fix
        while True:
            try:
                # Set random seeds to enforce reproducibility of the randomly
                # initialised weights
                rng = enforce_reproducibility()

                if model is None or model_config.model_type != ModelType.GENERATIVE:
                    logger.info("Loading model...")
                    model = load_model(
                        model_config=model_config,
                        dataset_config=dataset_config,
                        benchmark_config=benchmark_config,
                    )
                assert model is not None

                # The per-call `benchmark_config` is used throughout, rather than
                # `self.benchmark_config`, so that overrides passed to `benchmark`
                # are respected for all model types
                if dataset_config.task == SPEED:
                    scores = benchmark_speed(
                        model=model, benchmark_config=benchmark_config
                    )
                else:
                    bootstrapped_datasets = load_data(
                        rng=rng,
                        dataset_config=dataset_config,
                        benchmark_config=benchmark_config,
                    )
                    prepared_datasets = model.prepare_datasets(
                        datasets=bootstrapped_datasets, task=dataset_config.task
                    )
                    if model_config.model_type == ModelType.GENERATIVE:
                        scores = generate(
                            model=model,
                            datasets=prepared_datasets,
                            model_config=model_config,
                            dataset_config=dataset_config,
                            benchmark_config=benchmark_config,
                        )
                    else:
                        scores = finetune(
                            model=model,
                            datasets=prepared_datasets,
                            model_config=model_config,
                            dataset_config=dataset_config,
                            benchmark_config=benchmark_config,
                        )

                results = log_scores(
                    dataset_name=dataset_config.pretty_name,
                    metric_configs=dataset_config.task.metrics,
                    scores=scores,
                    model_id=model_config.model_id,
                )

                record = BenchmarkResult(
                    dataset=dataset_config.name,
                    task=dataset_config.task.name,
                    dataset_languages=[
                        language.code for language in dataset_config.languages
                    ],
                    model=model_config.model_id,
                    results=results,
                    num_model_parameters=model.num_params,
                    max_sequence_length=model.model_max_length,
                    vocabulary_size=model.vocab_size,
                    merge=model_config.merge,
                    generative=model_config.model_type == ModelType.GENERATIVE,
                    generative_type=(
                        model.generative_type.value
                        if model.generative_type is not None
                        else None
                    ),
                    few_shot=benchmark_config.few_shot,
                    validation_split=not benchmark_config.evaluate_test_split,
                )
                logger.debug(f"Results:\n{results}")
                return record

            except (InvalidBenchmark, InvalidModel) as e:
                error_message = str(e)

                # If the model ID is not valid then raise an error
                model_err_msg = "does not exist on the Hugging Face Hub"
                if benchmark_config.raise_errors and model_err_msg in error_message:
                    raise

                # Otherwise, if the error is due to Hugging Face Hub being down, then
                # wait a bit and try again
                elif "The Hugging Face Hub seems to be down." in error_message:
                    wait_time = 30
                    logger.debug(
                        "The Hugging Face Hub seems to be down. Retrying in "
                        f"{wait_time} seconds."
                    )
                    sleep(wait_time)
                    continue

                # Otherwise, if the error is due to the MPS fallback not being enabled,
                # then raise an error asking the user to enable it
                elif "PYTORCH_ENABLE_MPS_FALLBACK" in error_message:
                    raise RuntimeError(
                        "The benchmark failed because the environment variable "
                        "`PYTORCH_ENABLE_MPS_FALLBACK` is not set. Please set this "
                        "environment variable to `1` and try again."
                    ) from e

                elif benchmark_config.raise_errors:
                    raise

                # Errors are being skipped, so hand the error back to the caller
                return e

    def __call__(self, *args, **kwargs) -> list[BenchmarkResult]:
        """Call the benchmarker. See `Benchmarker.benchmark`."""
        return self.benchmark(*args, **kwargs)


def model_has_been_benchmarked(
    model_id: str,
    dataset: str,
    few_shot: bool,
    validation_split: bool,
    benchmark_results: list[BenchmarkResult],
) -> bool:
    """Checks whether a model has already been benchmarked on a dataset.

    A stored record counts as a match if it has the same model, dataset and
    validation split setting. The few-shot setting must also match, unless the
    record is for a non-generative model, in which case it is disregarded.

    Args:
        model_id:
            The model ID.
        dataset:
            The dataset.
        few_shot:
            Whether the model was evaluated using few-shot evaluation.
        validation_split:
            Whether the model was evaluated on the validation split.
        benchmark_results:
            The benchmark results.

    Returns:
        Whether the model has already been evaluated on the dataset.
    """
    for record in benchmark_results:
        same_evaluation = record.model == model_id and record.dataset == dataset
        same_validation_split_setting = record.validation_split == validation_split
        same_few_shot_setting = record.few_shot == few_shot or not record.generative
        if same_evaluation and same_validation_split_setting and same_few_shot_setting:
            return True
    return False
def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
    """Set the level of the package logger according to the verbosity.

    If `sys` has the `_called_from_test` attribute, and `ignore_testing` is not set,
    the level is set to CRITICAL regardless of `verbose`. Otherwise the level is
    DEBUG when `verbose` is set and INFO when it is not.

    Args:
        verbose:
            Whether to output additional output.
        ignore_testing:
            Whether to ignore the testing flag.

    Returns:
        The logging level that was set.
    """
    silence_for_tests = hasattr(sys, "_called_from_test") and not ignore_testing
    if silence_for_tests:
        level = logging.CRITICAL
    else:
        level = logging.DEBUG if verbose else logging.INFO
    logger.setLevel(level)
    return level


def clear_model_cache_fn(cache_dir: str) -> None:
    """Clear the model cache.

    Every directory located two levels below `<cache_dir>/model_cache` is removed,
    whereas files at either level are left in place. The `model_cache` directory is
    created if it does not already exist.

    Note that this will not remove the stored completions.

    Args:
        cache_dir:
            The path to the cache directory.
    """
    cache_root = Path(cache_dir) / "model_cache"
    cache_root.mkdir(parents=True, exist_ok=True)

    for model_dir in cache_root.iterdir():
        if not model_dir.is_dir():
            continue
        # Only the sub-directories are removed; the model directory itself and any
        # files stored directly inside it are kept
        nested_dirs = [entry for entry in model_dir.iterdir() if entry.is_dir()]
        for nested_dir in nested_dirs:
            rmtree(nested_dir)
def prepare_dataset_configs(dataset_names: list[str]) -> list["DatasetConfig"]:
    """Prepare the dataset configuration(s) to be benchmarked.

    The configurations are returned in the order in which they appear among all the
    available dataset configurations, rather than in the order of `dataset_names`.

    Args:
        dataset_names:
            The dataset names to benchmark.

    Returns:
        The dataset configurations whose names are in `dataset_names`.
    """
    selected_configs: list["DatasetConfig"] = []
    for dataset_config in get_all_dataset_configs().values():
        if dataset_config.name in dataset_names:
            selected_configs.append(dataset_config)
    return selected_configs


def initial_logging(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> None:
    """Log the introductory messages for a benchmark of a model on a dataset.

    This logs which model is benchmarked on which split of which dataset, followed
    by a note if the dataset is unofficial and a note if debug mode is enabled.

    Args:
        model_config:
            The configuration of the model we are evaluating.
        dataset_config:
            The configuration of the dataset we are evaluating on.
        benchmark_config:
            The general benchmark configuration.
    """
    split_type = "test" if benchmark_config.evaluate_test_split else "validation"

    # The few-shot/zero-shot distinction only applies to generative models
    if model_config.task not in GENERATIVE_PIPELINE_TAGS:
        eval_type = "Benchmarking"
    elif benchmark_config.few_shot:
        eval_type = "Few-shot benchmarking"
    else:
        eval_type = "Zero-shot benchmarking"

    logger.info(
        f"{eval_type} {model_config.model_id} on the {split_type} split of "
        f"{dataset_config.pretty_name}"
    )

    if dataset_config.unofficial:
        logger.info(
            f"Note that the {dataset_config.name!r} dataset is unofficial, "
            "meaning that the resulting evaluation will not be included in the "
            "official leaderboard."
        )

    if benchmark_config.debug:
        logger.info(
            "Running in debug mode. This will output additional information, as "
            "well as store the model outputs in the current directory after each "
            "batch. For this reason, evaluation will be slower."
        )