scandeval.utils

"""Utility functions to be used in other scripts."""

import gc
import importlib
import importlib.util
import logging
import os
import random
import re
import sys
import typing as t
import warnings
from functools import cache
from pathlib import Path

import litellm
import numpy as np
import pkg_resources
import requests
import torch
from datasets.utils import disable_progress_bar
from requests.exceptions import RequestException
from transformers import PreTrainedTokenizer
from transformers import logging as tf_logging

from .exceptions import NaNValueInModelOutput

if importlib.util.find_spec("ray") is not None:
    import ray

if t.TYPE_CHECKING:
    from .types import Predictions


logger = logging.getLogger("scandeval")


def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
    """Create cache directory for a model.

    Args:
        cache_dir:
            The cache directory.
        model_id:
            The model ID.

    Returns:
        The path to the cache directory.
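
    Example:
        An illustrative sketch; the cache directory and model ID below are
        hypothetical, and the path separator depends on the operating system:

            >>> create_model_cache_dir(cache_dir=".scandeval_cache", model_id="org/model")
            '.scandeval_cache/model_cache/org--model'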
    """
    # Replace '/' in the model ID to avoid nested directories in the cache
    _model_id = model_id.replace("/", "--")
    cache_dir_path = Path(cache_dir) / "model_cache" / _model_id
    return str(cache_dir_path)


def clear_memory():
    """Clears the memory of unused items."""
    for gc_generation in range(3):
        gc.collect(generation=gc_generation)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()


def enforce_reproducibility(seed: int = 4242):
    """Ensures reproducibility of experiments.

    Args:
        seed:
            Seed for the random number generator.

    Returns:
        The seeded NumPy random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    rng = np.random.default_rng(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms(True, warn_only=True)
    return rng


def is_module_installed(module: str) -> bool:
    """Check if a module is installed.

    This is used when dealing with spaCy models, as these are installed as separate
    Python packages.

    Args:
        module:
            The name of the module.

    Returns:
        Whether the module is installed or not.
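
    Example:
        An illustrative sketch; the results naturally depend on which packages are
        installed in the current environment:

            >>> is_module_installed("numpy")
            True
            >>> is_module_installed("some-module-that-is-not-installed")
            False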
    """
    # Get list of all modules, including their versions
    installed_modules_with_versions = list(pkg_resources.working_set)

    # Strip the module versions from the list of modules. Also make the modules lower
    # case and replace dashes with underscores
    installed_modules = [
        re.sub("[0-9. ]", "", str(module)).lower().replace("-", "_")
        for module in installed_modules_with_versions
    ]

    # Check if the module is installed by checking if the module name is in the list
    return module.lower() in installed_modules


def block_terminal_output():
    """Blocks libraries from writing output to the terminal.

    This filters warnings from some libraries, raises the logging level of several
    libraries to CRITICAL, disables the Hugging Face progress bars, and disables most
    of the logging from the `transformers` library.
    """
    # Ignore miscellaneous warnings
    warnings.filterwarnings("ignore", category=UserWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings(
        "ignore",
        module="torch.nn.parallel*",
        message="Was asked to gather along dimension 0, but all input tensors were "
        "scalars; will instead unsqueeze and return a vector.",
    )
    warnings.filterwarnings("ignore", module="seqeval*")

    # Up the logging level, to disable outputs
    logging.getLogger("filelock").setLevel(logging.CRITICAL)
    logging.getLogger("absl").setLevel(logging.CRITICAL)
    logging.getLogger("datasets").setLevel(logging.CRITICAL)
    logging.getLogger("openai").setLevel(logging.CRITICAL)
    logging.getLogger("torch.distributed.distributed_c10d").setLevel(logging.CRITICAL)
    logging.getLogger("torch.nn.parallel.distributed").setLevel(logging.CRITICAL)
    logging.getLogger("vllm").setLevel(logging.CRITICAL)
    logging.getLogger("vllm.engine.llm_engine").setLevel(logging.CRITICAL)
    logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
    logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
    logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
    logging.getLogger("httpx").setLevel(logging.CRITICAL)
    logging.getLogger("ray._private.worker").setLevel(logging.CRITICAL)
    logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
    logging.getLogger("accelerate").setLevel(logging.CRITICAL)
    logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
    logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)

    # This suppresses vLLM logging
    os.environ["LOG_LEVEL"] = "CRITICAL"
    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

    if importlib.util.find_spec("ray") is not None:
        ray._private.worker._worker_logs_enabled = False

    # Disable the tokeniser progress bars
    disable_progress_bar()

    # Disable most of the `transformers` logging
    tf_logging._default_log_level = logging.CRITICAL
    tf_logging.set_verbosity(logging.CRITICAL)
    logging.getLogger("transformers.trainer").setLevel(logging.CRITICAL)

    # Disable logging from `litellm`
    litellm.suppress_debug_info = True


def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type | None:
    """Get a class by its name.

    Args:
        class_name:
            The name of the class, written in kebab-case. The corresponding class name
            must be the same, but written in PascalCase, and lying in a module with the
            same name, but written in snake_case. If a list of strings is passed, the
            first class that is found is returned.
        module_name:
            The name of the module where the class is located.

    Returns:
        The class. If the class is not found, None is returned.
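
    Example:
        An illustrative sketch; a kebab-case name resolves to the PascalCase class in
        the given module, and a missing class simply yields None:

            >>> print(get_class_by_name("non-existent-class", "scandeval.utils"))
            None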
    """
    if isinstance(class_name, str):
        class_name = [class_name]

    error_messages = list()
    for name in class_name:
        try:
            module = importlib.import_module(name=module_name)
            # Convert the kebab-case name to PascalCase before looking it up
            class_: t.Type = getattr(module, kebab_to_pascal(name))
            return class_
        except (ModuleNotFoundError, AttributeError) as e:
            error_messages.append(str(e))

    if error_messages:
        errors = "\n- " + "\n- ".join(error_messages)
        logger.debug(
            f"Could not find the class with the name(s) {', '.join(class_name)}. The "
            f"following error messages were raised: {errors}"
        )

    # If the class could not be found, return None
    return None


def kebab_to_pascal(kebab_string: str) -> str:
    """Converts a kebab-case string to PascalCase.

    Args:
        kebab_string:
            The kebab-case string.

    Returns:
        The PascalCase string.
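
    Example:
        A couple of illustrative conversions:

            >>> kebab_to_pascal("named-entity-recognition")
            'NamedEntityRecognition'
            >>> kebab_to_pascal("summarization")
            'Summarization'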
    """
    return "".join(word.title() for word in kebab_string.split("-"))


def internet_connection_available() -> bool:
    """Checks if internet connection is available by pinging google.com.

    Returns:
        Whether or not internet connection is available.
    """
    try:
        requests.get("https://www.google.com")
        return True
    except RequestException:
        return False


def get_special_token_metadata(tokenizer: "PreTrainedTokenizer") -> dict:
    """Get the special token metadata for a tokenizer.

    Args:
        tokenizer:
            The tokenizer.

    Returns:
        The special token metadata.
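
    Example:
        An illustrative sketch with a BERT-style tokenizer (the model ID is just an
        example, and the exact values depend on the tokenizer):

            >>> from transformers import AutoTokenizer
            >>> tok = AutoTokenizer.from_pretrained("bert-base-cased")
            >>> metadata = get_special_token_metadata(tok)
            >>> metadata["cls_token"], metadata["sep_token"]
            ('[CLS]', '[SEP]')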
    """
    # Create some test input IDs, to check if the tokenizer is adding special tokens
    test_input_ids = tokenizer("Test").input_ids

    # Extract the CLS token IDs from the tokenizer, if it's using them
    has_cls_token = True
    if tokenizer.cls_token_id in test_input_ids:
        cls_token_id = tokenizer.cls_token_id
        cls_token = tokenizer.cls_token
    elif tokenizer.bos_token_id in test_input_ids:
        cls_token_id = tokenizer.bos_token_id
        cls_token = tokenizer.bos_token
    elif tokenizer.cls_token is not None:
        cls_token_id = tokenizer.cls_token_id
        cls_token = tokenizer.cls_token
        has_cls_token = False
    else:
        cls_token_id = tokenizer.bos_token_id
        cls_token = tokenizer.bos_token
        has_cls_token = False

    # Extract the SEP token IDs from the tokenizer, if it's using them
    has_sep_token = True
    if tokenizer.sep_token_id in test_input_ids:
        sep_token = tokenizer.sep_token
    elif tokenizer.eos_token_id in test_input_ids:
        sep_token = tokenizer.eos_token
    elif tokenizer.sep_token is not None:
        sep_token = tokenizer.sep_token
        has_sep_token = False
    else:
        sep_token = tokenizer.eos_token
        has_sep_token = False

    return dict(
        cls_token_id=cls_token_id,
        cls_token=cls_token,
        sep_token=sep_token,
        has_cls_token=has_cls_token,
        has_sep_token=has_sep_token,
    )


class HiddenPrints:
    """Context manager which removes all terminal output."""

    def __enter__(self):
        """Enter the context manager."""
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        sys.stdout = open(os.devnull, "w")
        sys.stderr = open(os.devnull, "w")

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit the context manager."""
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr
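
# A minimal usage sketch of `HiddenPrints` (illustrative only, not executed by this
# module): everything printed to stdout or stderr inside the `with` block is
# discarded.
#
#     with HiddenPrints():
#         print("This will not appear in the terminal.")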


def raise_if_model_output_contains_nan_values(model_output: "Predictions") -> None:
    """Raise an exception if the model output contains NaN values.

    Args:
        model_output:
            The model output to check.

    Raises:
        NaNValueInModelOutput:
            If the model output contains NaN values.
    """
    if isinstance(model_output, np.ndarray):
        if model_output.dtype == np.float32 and np.isnan(model_output).any():
            raise NaNValueInModelOutput()
    elif len(model_output) > 0:
        if isinstance(model_output[0], str):
            if any(x != x for x in model_output):
                raise NaNValueInModelOutput()
        elif len(model_output[0]) > 0:
            if any(x != x for sublist in model_output for x in sublist):
                raise NaNValueInModelOutput()


def should_prompts_be_stripped(
    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
) -> bool:
    """Determine if we should strip the prompts for few-shot evaluation.

    This is the case if the tokenizer needs to include the space as part of the label
    token. The strategy is thus to tokenize a label with a preceding colon (as in the
    prompts), i.e., ": positive", and check if the tokenization starts with the tokens
    of ": ". If this is the case, then we should not strip the prompts, since the
    tokenizer produces the whitespace token separately.

    Args:
        labels_to_be_generated:
            The labels that are to be generated.
        tokenizer:
            The tokenizer used to tokenize the labels.

    Returns:
        Whether we should strip the prompts.
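
    Example:
        An illustrative sketch; the result depends entirely on the tokenizer. A
        GPT-2-style BPE tokenizer merges the leading space into the label token, so
        for it the prompts should be stripped:

            >>> from transformers import AutoTokenizer
            >>> tok = AutoTokenizer.from_pretrained("gpt2")
            >>> should_prompts_be_stripped(["positive", "negative"], tok)
            True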
    """
    strip_prompts = True
    for label in labels_to_be_generated:
        colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
        label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids

        if isinstance(colon_tokens, torch.Tensor):
            colon_tokens = list(colon_tokens.squeeze(0))
        if isinstance(label_tokens, torch.Tensor):
            label_tokens = list(label_tokens.squeeze(0))

        label_tokens_start_with_colon_tokens = (
            label_tokens[: len(colon_tokens)] == colon_tokens
        )
        if label_tokens_start_with_colon_tokens:
            strip_prompts = False

    return strip_prompts


# TODO: This is currently not used - maybe remove.
def should_prefix_space_be_added_to_labels(
    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
) -> bool:
    """Determine if we should add a prefix space to the labels.

    This is the case if the prompts are stripped and the tokenizer doesn't
    automatically add prefix whitespaces to the labels.

    Args:
        labels_to_be_generated:
            The labels that are to be generated.
        tokenizer:
            The tokenizer used to tokenize the labels.

    Returns:
        Whether we should add a prefix space to the labels.
    """
    if not should_prompts_be_stripped(
        labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
    ):
        return False

    whitespace_token = tokenizer.convert_ids_to_tokens(
        ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
    )[0]

    add_prefix_space = True
    for label in labels_to_be_generated:
        label_tokens = tokenizer(label, add_special_tokens=False).input_ids
        if isinstance(label_tokens, torch.Tensor):
            label_tokens = list(label_tokens.squeeze(0))
        first_label_token: int = int(label_tokens[0])
        first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
        has_prefix_space = first_character_of_label == whitespace_token
        if has_prefix_space:
            add_prefix_space = False
            break

    return add_prefix_space


def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
    """Get the end token ID for chat models.

    This is only relevant for tokenizers with a chat template.

    Args:
        tokenizer:
            The tokenizer.

    Returns:
        The token IDs used to end chats, or None if the tokenizer does not have a chat
        template.

    Raises:
        ValueError:
            If the end-of-chat token could not be located.
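
    Example:
        An illustrative sketch with a chat-tuned model (the model ID is just an
        example; the returned IDs depend entirely on the model's chat template):

            >>> from transformers import AutoTokenizer
            >>> tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
            >>> end_ids = get_end_of_chat_token_ids(tok)
            >>> isinstance(end_ids, list)  # the IDs that close the user turn
            True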
    """
    if tokenizer.chat_template is None:
        return None

    user_message: dict[t.Literal["role", "content"], str] = dict()
    user_message["role"] = "user"
    user_message["content"] = "X"
    token_ids = tokenizer.apply_chat_template(conversation=[user_message])
    assert isinstance(token_ids, list)

    for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
        token_id = tokenizer.convert_tokens_to_ids(token)
        assert isinstance(token_id, int)
        token = tokenizer.decode([token_id])
        if "X" in token:
            x_token_index = idx
            break
    else:
        raise ValueError("Could not locate the end-of-chat token for the model.")

    end_of_chat_tokens = token_ids[x_token_index + 1 :]
    if len(end_of_chat_tokens) == 0:
        return None
    return end_of_chat_tokens


def scramble(text: str) -> str:
    """Scramble a string in a bijective manner.

    Args:
        text:
            The string to scramble.

    Returns:
        The scrambled string.
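
    Example:
        A round-trip sketch showing that `unscramble` (defined below) inverts this
        function:

            >>> unscramble(scramble("Hello, world!")) == "Hello, world!"
            True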
    """
    rng = np.random.default_rng(seed=4242)
    permutation = rng.permutation(x=len(text))
    scrambled = "".join(text[i] for i in permutation)
    return scrambled


def unscramble(scrambled_text: str) -> str:
    """Unscramble a string in a bijective manner.

    Args:
        scrambled_text:
            The scrambled string to unscramble.

    Returns:
        The unscrambled string.
    """
    rng = np.random.default_rng(seed=4242)
    permutation = rng.permutation(x=len(scrambled_text))
    inverse_permutation = np.argsort(permutation)
    unscrambled = "".join(scrambled_text[i] for i in inverse_permutation)
    return unscrambled


@cache
def log_once(message: str, level: int = logging.INFO) -> None:
    """Log a message once.

    This is ensured by caching the input/output pairs of this function, using the
    `functools.cache` decorator.

    Args:
        message:
            The message to log.
        level:
            The logging level. Defaults to logging.INFO.
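
    Example:
        An illustrative sketch; the second call is a cache hit, so the message is
        only emitted once:

            >>> log_once("Benchmarking finished", level=logging.INFO)
            >>> log_once("Benchmarking finished", level=logging.INFO)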
    """
    match level:
        case logging.DEBUG:
            logger.debug(message)
        case logging.INFO:
            logger.info(message)
        case logging.WARNING:
            logger.warning(message)
        case logging.ERROR:
            logger.error(message)
        case logging.CRITICAL:
            logger.critical(message)
        case _:
            raise ValueError(f"Invalid logging level: {level}")