"""Functions related to the loading of the data."""
import logging
import sys
import time
from datasets import Dataset, DatasetDict, load_dataset
from datasets.exceptions import DatasetsError
from huggingface_hub.utils import HfHubHTTPError
from numpy.random import Generator
from .data_models import BenchmarkConfig, DatasetConfig
from .exceptions import InvalidBenchmark
from .utils import unscramble
logger = logging.getLogger("scandeval")
def load_data(
    rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
) -> list[DatasetDict]:
    """Load the raw bootstrapped datasets.

    Args:
        rng:
            The random number generator to use.
        dataset_config:
            The configuration for the dataset.
        benchmark_config:
            The configuration for the benchmark.

    Returns:
        A list of bootstrapped datasets, one for each iteration.
    """
    num_attempts = 5
    for _ in range(num_attempts):
        try:
            dataset = load_dataset(
                path=dataset_config.huggingface_id,
                cache_dir=benchmark_config.cache_dir,
                token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
            )
            break
        except (FileNotFoundError, DatasetsError):
            logger.warning(
                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
            )
            time.sleep(1)
            continue
        except HfHubHTTPError:
            raise InvalidBenchmark("The Hugging Face Hub seems to be down.")
    else:
        raise InvalidBenchmark(
            f"Failed to load dataset {dataset_config.huggingface_id!r} after "
            f"{num_attempts} attempts."
        )

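    # `load_dataset` can return several container types, but we always expect a
    # DatasetDict here; the assertion narrows the type for the type checker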
    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
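    # Only keep the train, validation and test splits, dropping any other splits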
    dataset = DatasetDict({key: dataset[key] for key in ["train", "val", "test"]})

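    # If we should not evaluate on the test split, evaluate on the validation split
    # instead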
    if not benchmark_config.evaluate_test_split:
        dataset["test"] = dataset["val"]

    # Remove empty examples from the datasets
    for text_feature in ["tokens", "text"]:
        if text_feature in dataset["train"].features:
            dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)

    # If we are testing then truncate the test set
    if hasattr(sys, "_called_from_test"):
        dataset["test"] = dataset["test"].select(range(1))

    # Bootstrap the splits
    bootstrapped_splits: dict[str, list[Dataset]] = dict()
    for split in ["train", "val", "test"]:
        bootstrap_indices = rng.integers(
            0,
            len(dataset[split]),
            size=(benchmark_config.num_iterations, len(dataset[split])),
        )
        bootstrapped_splits[split] = [
            dataset[split].select(bootstrap_indices[idx])
            for idx in range(benchmark_config.num_iterations)
        ]

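    # Combine the bootstrapped splits into one DatasetDict per iteration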
    datasets = [
        DatasetDict(
            {
                split: bootstrapped_splits[split][idx]
                for split in ["train", "val", "test"]
            }
        )
        for idx in range(benchmark_config.num_iterations)
    ]
    return datasets