"""Factory class for creating benchmark configurations."""

import importlib.util
import logging
import sys
import typing as t

import torch

from .data_models import BenchmarkConfig
from .dataset_configs import get_all_dataset_configs
from .enums import Device
from .exceptions import InvalidBenchmark
from .languages import get_all_languages
from .tasks import get_all_tasks
from .utils import log_once

if t.TYPE_CHECKING:
    from .data_models import Language, Task

logger = logging.getLogger("scandeval")


def build_benchmark_config(
progress_bar: bool,
save_results: bool,
task: str | list[str] | None,
dataset: str | list[str] | None,
language: str | list[str],
model_language: str | list[str] | None,
dataset_language: str | list[str] | None,
device: Device | None,
batch_size: int,
raise_errors: bool,
cache_dir: str,
api_key: str | None,
force: bool,
verbose: bool,
trust_remote_code: bool,
use_flash_attention: bool | None,
clear_model_cache: bool,
evaluate_test_split: bool,
few_shot: bool,
num_iterations: int,
api_base: str | None,
api_version: str | None,
debug: bool,
run_with_cli: bool,
only_allow_safetensors: bool,
first_time: bool = False,
) -> BenchmarkConfig:
    """Create a benchmark configuration.

    Args:
progress_bar:
Whether to show a progress bar when running the benchmark.
save_results:
Whether to save the benchmark results to a file.
        task:
            The tasks to include in the benchmark. If None then the datasets will not
            be filtered by task.
        dataset:
            The datasets to include in the benchmark. If None then all datasets will
            be included, limited by the `task` parameter.
language:
The language codes of the languages to include, both for models and
datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
to 'all' if all languages (also non-Scandinavian) should be considered.
model_language:
The language codes of the languages to include for models. If None then
the `language` parameter will be used.
dataset_language:
The language codes of the languages to include for datasets. If None then
the `language` parameter will be used.
device:
The device to use for running the models. If None then the device will be
set automatically.
batch_size:
The batch size to use for running the models.
raise_errors:
Whether to raise errors when running the benchmark.
cache_dir:
The directory to use for caching the models.
api_key:
The API key to use for a given inference server.
force:
Whether to force the benchmark to run even if the results are already
cached.
verbose:
Whether to print verbose output when running the benchmark. This is
automatically set if `debug` is True.
trust_remote_code:
Whether to trust remote code when running the benchmark.
use_flash_attention:
Whether to use Flash Attention for the models. If None then it will be used
if it is available.
clear_model_cache:
Whether to clear the model cache before running the benchmark.
evaluate_test_split:
Whether to use the test split for the datasets.
few_shot:
Whether to use few-shot learning for the models.
num_iterations:
The number of iterations each model should be evaluated for.
api_base:
The base URL for a given inference API. Only relevant if `model` refers to a
model on an inference API.
api_version:
The version of the API to use for a given inference API.
debug:
Whether to run the benchmark in debug mode.
run_with_cli:
Whether the benchmark is being run with the CLI.
only_allow_safetensors:
Whether to only allow evaluations of models stored as safetensors.
first_time:
Whether this is the first time the benchmark configuration is being created.
            Defaults to False.

    Returns:
The benchmark configuration.
"""
language_codes = get_correct_language_codes(language_codes=language)
model_languages = prepare_languages(
language_codes=model_language, default_language_codes=language_codes
)
dataset_languages = prepare_languages(
language_codes=dataset_language, default_language_codes=language_codes
)
tasks, datasets = prepare_tasks_and_datasets(
task=task, dataset=dataset, dataset_languages=dataset_languages
)
torch_device = prepare_device(device=device)
if use_flash_attention is None:
if torch_device.type != "cuda":
use_flash_attention = False
elif (
importlib.util.find_spec("flash_attn") is None
and importlib.util.find_spec("vllm_flash_attn") is None
):
use_flash_attention = False
if first_time and torch_device.type == "cuda":
message = (
"Flash attention has not been installed, so this will not be used. "
"To install it, run `pip install -U wheel && "
"FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn "
"--no-build-isolation`. Alternatively, you can disable this "
"message by setting "
)
if run_with_cli:
message += "the flag `--no-use-flash-attention`."
else:
message += (
"the argument `use_flash_attention=False` in the `Benchmarker`."
)
log_once(message=message, level=logging.INFO)
    # Only run a single iteration when executed from within the test suite
if hasattr(sys, "_called_from_test"):
num_iterations = 1
return BenchmarkConfig(
model_languages=model_languages,
dataset_languages=dataset_languages,
tasks=tasks,
datasets=datasets,
batch_size=batch_size,
raise_errors=raise_errors,
cache_dir=cache_dir,
api_key=api_key,
force=force,
progress_bar=progress_bar,
save_results=save_results,
verbose=verbose or debug,
device=torch_device,
trust_remote_code=trust_remote_code,
use_flash_attention=use_flash_attention,
clear_model_cache=clear_model_cache,
evaluate_test_split=evaluate_test_split,
few_shot=few_shot,
num_iterations=num_iterations,
api_base=api_base,
api_version=api_version,
debug=debug,
run_with_cli=run_with_cli,
only_allow_safetensors=only_allow_safetensors,
)
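

# Illustrative sketch of a call to `build_benchmark_config`, kept as a comment so the
# module stays side-effect free on import. The concrete values below (the
# "sentiment-classification" task name, Danish as the language, the cache directory
# and the batch size) are hypothetical examples, not defaults of this module:
#
#     benchmark_config = build_benchmark_config(
#         progress_bar=True,
#         save_results=True,
#         task="sentiment-classification",
#         dataset=None,
#         language="da",
#         model_language=None,
#         dataset_language=None,
#         device=None,
#         batch_size=32,
#         raise_errors=False,
#         cache_dir=".scandeval_cache",
#         api_key=None,
#         force=False,
#         verbose=False,
#         trust_remote_code=False,
#         use_flash_attention=None,
#         clear_model_cache=False,
#         evaluate_test_split=False,
#         few_shot=True,
#         num_iterations=10,
#         api_base=None,
#         api_version=None,
#         debug=False,
#         run_with_cli=False,
#         only_allow_safetensors=False,
#         first_time=True,
#     )
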
def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
    """Get correct language code(s).

    Args:
language_codes:
The language codes of the languages to include, both for models and
datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
            to 'all' if all languages (also non-Scandinavian) should be considered.

    Returns:
The correct language codes.
"""
# Create a dictionary that maps languages to their associated language objects
language_mapping = get_all_languages()
# Create the list `languages`
if "all" in language_codes:
languages = list(language_mapping.keys())
elif isinstance(language_codes, str):
languages = [language_codes]
else:
languages = language_codes
# If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
# either 'nb' or 'nn' are specified then also include 'no'.
if "no" in languages:
languages = list(set(languages) | {"nb", "nn"})
elif "nb" in languages or "nn" in languages:
languages = list(set(languages) | {"no"})
return languages
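

# Illustrative examples of the normalisation above, shown as comments rather than
# executed code. Note that a set is used internally, so the order of the returned
# codes is not guaranteed:
#
#     >>> get_correct_language_codes(language_codes="no")
#     ['no', 'nb', 'nn']    # 'no' pulls in both Bokmål ('nb') and Nynorsk ('nn')
#     >>> get_correct_language_codes(language_codes=["nb", "da"])
#     ['nb', 'da', 'no']    # 'nb' or 'nn' pulls in the umbrella code 'no'
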
def prepare_languages(
language_codes: str | list[str] | None, default_language_codes: list[str]
) -> list["Language"]:
    """Prepare language(s) for benchmarking.

    Args:
language_codes:
The language codes of the languages to include for models or datasets.
If specified then this overrides the `language` parameter for model or
dataset languages.
default_language_codes:
            The default language codes of the languages to include.

    Returns:
The prepared model or dataset languages.
"""
# Create a dictionary that maps languages to their associated language objects
language_mapping = get_all_languages()
# Create the list `languages_str` of language codes to use for models or datasets
languages_str: list[str]
if language_codes is None:
languages_str = default_language_codes
elif isinstance(language_codes, str):
languages_str = [language_codes]
else:
languages_str = language_codes
# Convert the model languages to language objects
if "all" in languages_str:
prepared_languages = list(language_mapping.values())
else:
prepared_languages = [language_mapping[language] for language in languages_str]
return prepared_languages
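

# Illustrative example (as a comment): with `language_codes=None` the default codes
# are used, so a call like
#
#     prepare_languages(language_codes=None, default_language_codes=["da", "sv"])
#
# would return the `Language` objects registered for Danish and Swedish in
# `get_all_languages`, while passing "all" returns every registered `Language`.
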
def prepare_tasks_and_datasets(
task: str | list[str] | None,
dataset_languages: list["Language"],
dataset: str | list[str] | None,
) -> tuple[list["Task"], list[str]]:
    """Prepare task(s) and dataset(s) for benchmarking.

    Args:
        task:
            The tasks to include in the benchmark. If None then the datasets will not
            be filtered by task.
dataset_languages:
The languages of the datasets in the benchmark.
        dataset:
            The datasets to include in the benchmark. If None then all datasets will
            be included, limited by the `task` and `dataset_languages` parameters.

    Returns:
        The prepared tasks and datasets.

    Raises:
InvalidBenchmark:
If the task or dataset is not found in the benchmark tasks or datasets.
"""
# Create a dictionary that maps benchmark tasks to their associated benchmark
# task objects, and a dictionary that maps dataset names to their associated
# dataset configuration objects
task_mapping = get_all_tasks()
all_dataset_configs = get_all_dataset_configs()
# Create the list of dataset tasks
try:
if task is None:
tasks = list(task_mapping.values())
elif isinstance(task, str):
tasks = [task_mapping[task]]
else:
tasks = [task_mapping[t] for t in task]
except KeyError as e:
raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e
all_official_datasets = [
dataset_name
for dataset_name, dataset_config in all_dataset_configs.items()
if not dataset_config.unofficial
]
if dataset is None:
dataset = all_official_datasets
elif isinstance(dataset, str):
dataset = [dataset]
all_datasets = list(all_dataset_configs.keys())
invalid_datasets = set(dataset) - set(all_datasets)
if invalid_datasets:
raise InvalidBenchmark(
f"Dataset(s) {', '.join(invalid_datasets)} not found in the benchmark "
"datasets."
)
datasets = [
dataset_name
for dataset_name, dataset_config in all_dataset_configs.items()
if dataset_name in dataset
and dataset_config.task in tasks
and set(dataset_config.languages).intersection(dataset_languages)
]
return tasks, datasets
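

# Illustrative example (as a comment), assuming "sentiment-classification" is one of
# the task names returned by `get_all_tasks`: the call below would select every
# official dataset whose task is sentiment classification and whose languages overlap
# with the given dataset languages.
#
#     tasks, datasets = prepare_tasks_and_datasets(
#         task="sentiment-classification",
#         dataset=None,
#         dataset_languages=prepare_languages(
#             language_codes="da", default_language_codes=["da"]
#         ),
#     )
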
def prepare_device(device: Device | None) -> torch.device:
    """Prepare device for benchmarking.

    Args:
device:
The device to use for running the models. If None then the device will be
            set automatically.

    Returns:
The prepared device.
"""
device_mapping = {
Device.CPU: torch.device("cpu"),
Device.CUDA: torch.device("cuda"),
Device.MPS: torch.device("mps"),
}
if isinstance(device, Device):
return device_mapping[device]
if torch.cuda.is_available():
return torch.device("cuda")
elif torch.backends.mps.is_available():
return torch.device("mps")
else:
return torch.device("cpu")
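

# Illustrative smoke test: resolve the device with the same auto-detection logic that
# `build_benchmark_config` relies on, and log the result.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    resolved_device = prepare_device(device=None)
    logger.info(f"Auto-detected torch device: {resolved_device}")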