scandeval.model_cache

"""ModelCache class for caching model outputs."""

import hashlib
import json
import logging
import sys
import typing as t
from dataclasses import asdict

from tqdm.auto import tqdm

from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput

if t.TYPE_CHECKING:
    from pathlib import Path

    from datasets import Dataset


logger = logging.getLogger("scandeval")


class ModelCache:
    """A cache for model outputs.

    Attributes:
        model_cache_dir:
            The directory to store the cache in.
        cache_path:
            The path to the cache file.
        cache:
            The model output cache.
        max_generated_tokens:
            The maximum number of tokens to generate for each example.
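
    Example:
        A minimal usage sketch (the cache directory and cache name below are
        illustrative placeholders):

        >>> from pathlib import Path
        >>> cache = ModelCache(
        ...     model_cache_dir=Path(".scandeval_cache"),
        ...     cache_name="my-dataset.json",
        ...     max_generated_tokens=1,
        ... )
        >>> cache.load()
        >>> "some prompt" in cache
        False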
    """

    def __init__(
        self, model_cache_dir: "Path", cache_name: str, max_generated_tokens: int
    ):
        """Initialize the model output cache.

        Args:
            model_cache_dir:
                The directory to store the cache in.
            cache_name:
                The name of the cache file.
            max_generated_tokens:
                The maximum number of tokens to generate for each example.
        """
        self.model_cache_dir = model_cache_dir
        self.model_cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_path = self.model_cache_dir / cache_name.replace("/", "--")
        self.max_generated_tokens = max_generated_tokens

    def load(self) -> None:
        """Load the model output cache."""
        if not self.cache_path.exists():
            with self.cache_path.open("w") as f:
                json.dump(dict(), f)

        try:
            with self.cache_path.open() as f:
                json_cache = json.load(f)
        except json.JSONDecodeError:
            logger.warning(
                f"Failed to load the cache from {self.cache_path}. The cache will be "
                f"re-initialised."
            )
            json_cache = dict()
            with self.cache_path.open("w") as f:
                json.dump(dict(), f)

        cache: dict[str, SingleGenerativeModelOutput] = dict()
        for key in json_cache:
            cache[key] = SingleGenerativeModelOutput(**json_cache[key])

        self.cache = cache

    def save(self) -> None:
        """Save the model output cache to disk."""
        dumpable_cache: dict[str, dict] = {
            key: asdict(value) for key, value in self.cache.items()
        }

        try:
            with self.cache_path.open("w") as f:
                json.dump(dumpable_cache, f)
        except KeyError:
            logger.warning(
                f"Failed to load the cache from {self.cache_path}. The cache will be "
                f"re-initialised."
            )
            self.cache = dict()
            with self.cache_path.open("w") as f:
                json.dump(dict(), f)

    def _hash_key(self, key: str | list[dict[str, str]]) -> str:
        """Hash the key to use as an index in the cache.

        Args:
            key:
                The key to hash.

        Returns:
            The hashed key.
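
        Example:
            The hash is the MD5 hex digest of the stringified key, so it is
            deterministic across runs (assuming `cache` is a ModelCache
            instance):

            >>> cache._hash_key(key="hello")
            '5d41402abc4b2a76b9719d911017c592'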
        """
        return hashlib.md5(string=str(key).encode()).hexdigest()

    def __getitem__(
        self, key: str | list[dict[str, str]]
    ) -> SingleGenerativeModelOutput:
        """Get an item from the cache.

        Args:
            key:
                The key to use to index the cache.

        Returns:
            The model output.
        """
        hashed_key = self._hash_key(key=key)
        return self.cache[hashed_key]

    def __setitem__(self, key: t.Any, value: SingleGenerativeModelOutput) -> None:
        """Set an item in the cache.

        Args:
            key:
                The key to use to index the cache.
            value:
                The value to set in the cache.
        """
        hashed_key = self._hash_key(key=key)
        self.cache[hashed_key] = value

    def remove(self) -> None:
        """Remove the cache from memory and delete it from disk."""
        self.cache_path.unlink()
        del self.cache

    def __contains__(self, key: str | list[dict[str, str]]) -> bool:
        """Check if a key is in the cache.

        Args:
            key:
                The key to check.

        Returns:
            Whether the key is in the cache.
        """
        hashed_key = self._hash_key(key=key)
        return hashed_key in self.cache

    def add_to_cache(
        self, model_inputs: dict, model_output: GenerativeModelOutput
    ) -> None:
        """Add the model input/output to the cache.

        Args:
            model_inputs:
                The model inputs.
            model_output:
                The model output.
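
        Example:
            A sketch assuming plain-text inputs and one generated sequence per
            input (the prompt and output below are illustrative):

            >>> output = GenerativeModelOutput(sequences=["positive"])
            >>> cache.add_to_cache(
            ...     model_inputs={"text": ["Classify: great film!"]},
            ...     model_output=output,
            ... )
            >>> cache["Classify: great film!"].sequence
            'positive'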
        """
        input_column = "messages" if "messages" in model_inputs else "text"
        model_inputs = model_inputs[input_column]

        # Store the generated sequences in the cache, one by one
        with tqdm(
            iterable=model_inputs,
            desc="Caching model outputs",
            leave=False,
            disable=hasattr(sys, "_called_from_test"),
        ) as pbar:
            for sample_idx, model_input in enumerate(pbar):
                # Extract the scores from the model output, to be cached. We only store
                # the indices of the top scores, to save space. Further, we only store
                # the scores at all when very few tokens are generated per example
                # (fewer than eight), to keep the cache small.
                if model_output.scores is not None and self.max_generated_tokens < 8:
                    scores = model_output.scores[sample_idx]
                else:
                    scores = None
                self[model_input] = SingleGenerativeModelOutput(
                    sequence=model_output.sequences[sample_idx], scores=scores
                )


def split_dataset_into_cached_and_non_cached(
    dataset: "Dataset", cache: ModelCache
) -> tuple["Dataset", "Dataset"]:
    """Split a dataset into a cached and non-cached part.

    Args:
        dataset:
            The dataset to split.
        cache:
            The model output cache.

    Returns:
        The cached and non-cached parts of the dataset.
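
    Example:
        A sketch with a toy dataset containing a duplicate, assuming `cache`
        is an empty ModelCache:

        >>> from datasets import Dataset
        >>> dataset = Dataset.from_dict({"text": ["a", "b", "a"]})
        >>> cached, non_cached = split_dataset_into_cached_and_non_cached(
        ...     dataset=dataset, cache=cache
        ... )
        >>> non_cached["text"], cached["text"]
        (['a', 'b'], ['a'])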
    """
    # Get the sample indices of the non-cached examples, which are unique with
    # respect to the input column ("messages" or "text").
    input_column = "messages" if "messages" in dataset.column_names else "text"
    dataset_texts = dataset[input_column]
    unique_non_cached_ids = set()
    unique_texts = list()
    for idx, dataset_text in enumerate(dataset_texts):
        if dataset_text not in cache and dataset_text not in unique_texts:
            unique_non_cached_ids.add(idx)
            unique_texts.append(dataset_text)

    # The cached examples are the ones that are not in the non-cached examples. This
    # means that if the dataset has duplicates, only a single copy of the duplicate
    # will be put in the non-cached part, and the rest in the cached part.
    cached_ids = set(range(len(dataset))) - unique_non_cached_ids

    cached = dataset.select(cached_ids)
    non_cached = dataset.select(unique_non_cached_ids)
    return cached, non_cached


def load_cached_model_outputs(
    cached_dataset: "Dataset", cache: ModelCache
) -> GenerativeModelOutput:
    """Load the cached model outputs.

    Args:
        cached_dataset:
            The dataset containing the cached examples.
        cache:
            The model output cache.

    Returns:
        The model output containing the cached sequences.
    """
    input_column = "messages" if "messages" in cached_dataset.column_names else "text"
    cached_model_outputs: list[SingleGenerativeModelOutput] = [
        cache[prompt] for prompt in cached_dataset[input_column]
    ]

    cached_sequences = [model_output.sequence for model_output in cached_model_outputs]

    if cached_model_outputs[0].scores is None:
        return GenerativeModelOutput(sequences=cached_sequences)

    cached_scores = [model_output.scores or [] for model_output in cached_model_outputs]
    return GenerativeModelOutput(sequences=cached_sequences, scores=cached_scores)
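

The pieces above are designed to be used together: split the dataset, run the
model only on the non-cached part, cache the new outputs, and restore the
cached part without re-running the model. A minimal end-to-end sketch (the
cache directory, cache name and generated sequences below are illustrative,
not part of the module):

from pathlib import Path

from datasets import Dataset

from scandeval.data_models import GenerativeModelOutput
from scandeval.model_cache import (
    ModelCache,
    load_cached_model_outputs,
    split_dataset_into_cached_and_non_cached,
)

cache = ModelCache(
    model_cache_dir=Path(".scandeval_cache"),
    cache_name="example.json",
    max_generated_tokens=1,
)
cache.load()

dataset = Dataset.from_dict({"text": ["first prompt", "second prompt"]})
cached, non_cached = split_dataset_into_cached_and_non_cached(
    dataset=dataset, cache=cache
)

# Pretend these sequences came from a generative model, one per non-cached
# example.
model_output = GenerativeModelOutput(sequences=["output"] * len(non_cached))
cache.add_to_cache(
    model_inputs={"text": non_cached["text"]}, model_output=model_output
)
cache.save()

# On a second pass every example is cached, so the outputs can be restored
# without touching the model.
cached, _ = split_dataset_into_cached_and_non_cached(dataset=dataset, cache=cache)
restored = load_cached_model_outputs(cached_dataset=cached, cache=cache)
assert restored.sequences == ["output", "output"]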