scandeval.benchmark_modules.vllm

docs module scandeval.benchmark_modules.vllm
"""Generative models using the vLLM inference framework."""importcollections.abcascimportimportlib.utilimportitertoolsasitimportjsonimportloggingimportosimportrandomimportreimportsysimporttypingastfromfunctoolsimportpartialfrompathlibimportPathfromtimeimportsleepfromtypesimportMethodTypeimporttorchfromdatasetsimportDatasetDictfromhuggingface_hubimportsnapshot_downloadfrompydanticimportconlist,create_modelfromtqdm.autoimporttqdmfromtransformersimportAutoConfig,AutoTokenizer,PreTrainedTokenizer,Trainerfromurllib3.exceptionsimportRequestErrorfrom..constantsimport(GENERATIVE_PIPELINE_TAGS,MAX_LOGPROBS,MERGE_TAGS,REASONING_MAX_TOKENS,TASK_GROUPS_USING_LOGPROBS,TASKS_USING_JSON,)from..data_modelsimport(BenchmarkConfig,DatasetConfig,GenerativeModelOutput,ModelConfig,Task,)from..enumsimport(BatchingPreference,GenerativeType,InferenceBackend,ModelType,TaskGroup,)from..exceptionsimport(InvalidBenchmark,InvalidModel,NeedsEnvironmentVariable,NeedsExtraInstalled,)from..languagesimportget_all_languagesfrom..task_utilsimport(question_answering,sequence_classification,text_to_text,token_classification,)from..typesimportExtractLabelsFunctionfrom..utilsimport(clear_memory,create_model_cache_dir,get_end_of_chat_token_ids,log_once,should_prompts_be_stripped,)from.hfimportHuggingFaceEncoderModel,get_model_repo_info,load_hf_model_configift.TYPE_CHECKINGorimportlib.util.find_spec("vllm")isnotNone:fromvllmimportLLM,RequestOutput,SamplingParamsfromvllm.lora.requestimportLoRARequestfromvllm.sampling_paramsimportGuidedDecodingParamstry:fromvllm.model_executor.parallel_utils.parallel_stateimport(destroy_model_parallel,)exceptImportError:fromvllm.distributed.parallel_stateimportdestroy_model_parallelift.TYPE_CHECKINGorimportlib.util.find_spec("ray")isnotNone:importraylogger=logging.getLogger("scandeval")classVLLMModel(HuggingFaceEncoderModel):docs
    """A generative model using the vLLM inference framework."""fresh_model=Falsebatching_preference=BatchingPreference.ALL_AT_ONCEhigh_priority=Truedef__init__(self,model_config:ModelConfig,dataset_config:DatasetConfig,benchmark_config:BenchmarkConfig,)->None:        """Initialise the vLLM model.        Args:            model_config:                The model configuration.            dataset_config:                The dataset configuration.            benchmark_config:                The benchmark configuration.        """if(importlib.util.find_spec("vllm")isNoneorimportlib.util.find_spec("ray")isNone):raiseNeedsExtraInstalled(extra="generative")output_scores=dataset_config.task.task_groupinTASK_GROUPS_USING_LOGPROBSmodel,tokenizer=load_model_and_tokenizer(model_config=model_config,benchmark_config=benchmark_config,output_scores=output_scores,)self._model:LLM=modelself._tokenizer:PreTrainedTokenizer=tokenizerself.end_of_reasoning_token_id=get_end_of_reasoning_token_id(model=self._model,tokenizer=self._tokenizer)# We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want# to call the `__init__` method of the `BenchmarkModule` class.super(HuggingFaceEncoderModel,self).__init__(model_config=model_config,dataset_config=dataset_config,benchmark_config=benchmark_config,)self.buffer["output_scores"]=output_scoresself.buffer["instruction_model"]=self._tokenizer.chat_templateisnotNoneifself.model_config.adapter_base_model_idisnotNone:adapter_path=snapshot_download(repo_id=self.model_config.model_id,cache_dir=Path(self.model_config.model_cache_dir),)self.buffer["lora_request"]=LoRARequest(lora_name="adapter",lora_int_id=1,lora_path=adapter_path)@propertydefgenerative_type(self)->GenerativeType|None:docs
        """Get the generative type of the model.        Returns:            The generative type of the model, or None if it has not been set yet.        """ifnothasattr(self,"_tokenizer"):returnNoneelifself.end_of_reasoning_token_idisnotNone:returnGenerativeType.REASONINGelifself._tokenizer.chat_templateisnotNone:returnGenerativeType.INSTRUCTION_TUNEDelse:returnGenerativeType.BASE@propertydefextract_labels_from_generation(self)->ExtractLabelsFunction:docs
        """The function used to extract the labels from the generated output.        Returns:            The function used to extract the labels from the generated output.        """matchself.dataset_config.task.task_group:case(TaskGroup.SEQUENCE_CLASSIFICATION|TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION):returnpartial(sequence_classification.extract_labels_from_generation,dataset_config=self.dataset_config,)caseTaskGroup.TEXT_TO_TEXT:returntext_to_text.extract_labels_from_generationcaseTaskGroup.TOKEN_CLASSIFICATION:returnpartial(token_classification.extract_labels_from_generation,dataset_config=self.dataset_config,)caseTaskGroup.QUESTION_ANSWERING:returnquestion_answering.extract_labels_from_generationcase _:raiseNotImplementedError(f"Unsupported task group: {self.dataset_config.task.task_group}.")defprepare_dataset(docs
self,dataset:DatasetDict,task:Task,itr_idx:int)->DatasetDict:        """Prepare the dataset for the model.        This includes things like tokenisation.        Args:            dataset:                The dataset to prepare.            task:                The task to prepare the dataset for.            itr_idx:                The index of the dataset in the iterator.        Returns:            The prepared dataset.        """iftask.task_group==TaskGroup.QUESTION_ANSWERING:dataset=dataset.map(lambdaexamples:dict(label=[dict(id=id,answers=dict(answer_start=answer_dct["answer_start"],text=[answer_text.lower()foranswer_textinanswer_dct["text"]],),)forid,answer_dctinzip(examples["id"],examples["answers"])]),batched=True,load_from_cache_file=False,keep_in_memory=True,)ifself.benchmark_config.few_shot:few_shot_examples=self._extract_few_shot_examples(dataset=dataset,task=task,itr_idx=itr_idx)else:few_shot_examples=list()dataset["test"]=dataset["test"].map(partial(self._apply_prompt,few_shot_examples=few_shot_examples,task=task),batched=True,load_from_cache_file=False,keep_in_memory=True,)returndatasetdefgenerate(self,inputs:dict)->GenerativeModelOutput:docs
        """Generate outputs from the model.        Args:            inputs:                A batch of inputs to pass through the model.        Returns:            The generated model outputs.        """# Define which tokens to use as stopping criteria. We want to use the padding# token, end-of-sentence token, and a double newline if the model isn't# instruction tuned (since these separate the few-shot examples in the input in# this case)stop_tokens:list[str]=list()ifself.buffer["instruction_model"]isFalse:stop_tokens.append("\n\n")ifself._tokenizer.pad_token_idisnotNone:stop_tokens.append(self._tokenizer.pad_token)ifself._tokenizer.eos_token_idisnotNone:stop_tokens.append(self._tokenizer.eos_token)ifself._tokenizer.pad_token_idisNone:self._tokenizer.pad_token_id=self._tokenizer.eos_token_idself._tokenizer.pad_token=self._tokenizer.eos_tokenif(self._tokenizer.bos_token_idisnotNoneandself._tokenizer.pad_token_idisNone):self._tokenizer.pad_token_id=self._tokenizer.bos_token_idself._tokenizer.pad_token=self._tokenizer.bos_tokenelif(self._tokenizer.eos_token_idisnotNoneandself._tokenizer.pad_token_idisNone):self._tokenizer.pad_token_id=self._tokenizer.eos_token_idself._tokenizer.pad_token=self._tokenizer.eos_tokenelifself._tokenizer.pad_token_idisNone:pad_token_candidates=["<pad>","[pad]","<|endoftext|>","<|im_end|>"]pad_token_candidates.extend([c.upper()forcinpad_token_candidates])forcandidateinpad_token_candidates:ifcandidateinself._tokenizer.get_vocab():pad_token_id=self._tokenizer.get_vocab()[candidate]self._tokenizer.pad_token=candidateself._tokenizer.pad_token_id=pad_token_idbreakelse:raiseInvalidModel("Could not find a suitable token to use as a padding token, since ""the model does not have a BOS, EOS, or padding token, and does "f"not have any of the following tokens in its vocabulary: "f"{pad_token_candidates}.")assertself._tokenizer.pad_token_idisnotNone# Add end of chat token as a stopping token, if it existsend_of_chat_token_ids=get_end_of_chat_token_ids(tokenizer=self._tokenizer)ifend_of_chat_token_idsisnotNone:end_of_chat_token=self._tokenizer.decode(end_of_chat_token_ids).strip()ifend_of_chat_token:stop_tokens.append(end_of_chat_token)ifself.dataset_config.taskinTASKS_USING_JSON:ner_tag_names=list(self.dataset_config.prompt_label_mapping.values())keys_and_their_types:dict[str,t.Any]={tag_name:(conlist(str,max_length=5),...)fortag_nameinner_tag_names}pydantic_class=create_model("AnswerFormat",**keys_and_their_types)schema=pydantic_class.model_json_schema()guided_decoding=GuidedDecodingParams(json=schema,backend="outlines",whitespace_pattern=r" ?")else:guided_decoding=None# Define the parameters used for vLLM generationmax_tokens:int=(REASONING_MAX_TOKENSifself.generative_type==GenerativeType.REASONINGelseself.dataset_config.max_generated_tokens)sampling_params=SamplingParams(max_tokens=max_tokens,logprobs=MAX_LOGPROBSifself.buffer["output_scores"]elseNone,temperature=0.0,stop=[stop_tokenforstop_tokeninstop_tokensifstop_token],guided_decoding=guided_decoding,)# If any of the prompts are empty then we need to replace them with a BOS token# so that the vLLM model can generate from themprompts:list[str]=inputs["text"]ifany(len(prompt)==0forpromptinprompts):logger.debug("Found empty prompts, replacing with BOS token.")prompts=[promptiflen(prompt)>0elsestr(self._tokenizer.bos_token)forpromptinprompts]# Strip the prompts if the model's tokeniser requires itlabels_to_be_generated=list(self.dataset_config.prompt_label_mapping.values())iflen(labels_to_be_generated)==0:labels_to_be_generated=["negative","positive"]ifnotself.buffer.get("instruction_model",False)andshould_prompts_be_stripped(labels_to_be_generated=labels_to_be_generated,tokenizer=self._tokenizer):log_once(message="Stripping prompts.",level=logging.DEBUG)prompts=[prompt.strip()forpromptinprompts]# Generate sequences using vLLMinput_is_a_test=len(prompts)==1andlen(set(prompts[0]))==1raw_outputs=self._model.generate(prompts=prompts,sampling_params=sampling_params,use_tqdm=(notinput_is_a_test),lora_request=self.buffer.get("lora_request"),)completion_ids:list[list[int]]=[output.outputs[0].token_idsforoutputinraw_outputs]ifself.end_of_reasoning_token_idincompletion_ids[0]:completion_ids=[token_ids[token_ids.index(self.end_of_reasoning_token_id)+2:]ifself.end_of_reasoning_token_idintoken_idselsetoken_idsfortoken_idsincompletion_ids]completions=self._tokenizer.batch_decode(sequences=[torch.LongTensor(completion_id)forcompletion_idincompletion_ids],skip_special_tokens=True,)completions=[completion.strip()forcompletionincompletions]# Add logprobs scores to the outputifself.buffer["output_scores"]:scores:list[list[list[tuple[str,float]]]]=[[[(obj.decoded_token,obj.logprob)forobjintoken_logprobs_dict.values()]fortoken_logprobs_dictinraw_output.outputs[0].logprobs]forraw_outputinraw_outputs]scores=[score_list[raw_output.outputs[0].token_ids.index(self.end_of_reasoning_token_id)+2:]ifself.end_of_reasoning_token_idinraw_output.outputs[0].token_idselsescore_listforraw_output,score_listinzip(raw_outputs,scores)]output=GenerativeModelOutput(sequences=completions,scores=scores)else:output=GenerativeModelOutput(sequences=completions)returnoutput@classmethoddefmodel_exists(docs
cls,model_id:str,benchmark_config:BenchmarkConfig)->bool|NeedsExtraInstalled|NeedsEnvironmentVariable:        """Check if a model exists.        Args:            model_id:                The model ID.            benchmark_config:                The benchmark configuration.        Returns:            Whether the model exists, or an error describing why we cannot check            whether the model exists.        """using_api=(benchmark_config.api_baseisnotNoneorbenchmark_config.api_versionisnotNone)ifusing_api:returnFalsemodel_id,revision=(model_id.split("@")if"@"inmodel_idelse(model_id,"main"))model_info=get_model_repo_info(model_id=model_id,revision=revision,benchmark_config=benchmark_config)return(model_infoisnotNoneandmodel_info.pipeline_taginGENERATIVE_PIPELINE_TAGS)@classmethoddefget_model_config(docs
cls,model_id:str,benchmark_config:BenchmarkConfig)->ModelConfig:        """Fetch the model configuration.        Args:            model_id:                The model ID.            benchmark_config:                The benchmark configuration.        Returns:            The model configuration.        """model_id,revision=(model_id.split("@")if"@"inmodel_idelse(model_id,"main"))model_info=get_model_repo_info(model_id=model_id,revision=revision,benchmark_config=benchmark_config)ifmodel_infoisNone:raiseInvalidModel(f"The model {model_id!r} could not be found.")language_mapping=get_all_languages()language_codes=list(language_mapping.keys())model_config=ModelConfig(model_id=model_id,revision=revision,task=model_info.pipeline_tag,languages=[language_mapping[tag]fortaginmodel_info.tagsiftaginlanguage_codes],merge=any(taginmodel_info.tagsfortaginMERGE_TAGS),inference_backend=InferenceBackend.VLLM,model_type=ModelType.GENERATIVE,fresh=False,model_cache_dir=create_model_cache_dir(cache_dir=benchmark_config.cache_dir,model_id=model_id),adapter_base_model_id=model_info.adapter_base_model_id,)returnmodel_configdef_extract_few_shot_examples(self,dataset:DatasetDict,task:Task,itr_idx:int)->list[dict[str,t.Any]]:        """Extract few-shot examples from a dataset.        This will always extract the examples from the training split.        We ensure that the few-shot examples are unique by picking them one at a time.        Args:            dataset:                The dataset to extract the few-shot examples from.            task:                The task that is being benchmarked.            itr_idx:                The index of the dataset in the iterator.        Returns:            The few-shot examples.        """random_seed=4242+itr_idxnum_few_shots=self.dataset_config.num_few_shot_examplesfew_shot_examples:list[dict[str,t.Any]]=list()shuffled_train=dataset["train"].shuffle(seed=random_seed)matchtask.task_group:case(TaskGroup.SEQUENCE_CLASSIFICATION|TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION):labels=it.cycle(self.dataset_config.labels)while(len(few_shot_examples)<num_few_shotsandlen(shuffled_train)>0):label=next(labels)possible_examples=shuffled_train.filter(lambdax:x["label"].lower()==label.lower())iflen(possible_examples)==0:continueexample=possible_examples.select(range(1))[0]few_shot_examples.append(example)shuffled_train=shuffled_train.filter(lambdax:x["text"]!=example["text"])caseTaskGroup.TEXT_TO_TEXT:while(len(few_shot_examples)<num_few_shotsandlen(shuffled_train)>0):example=shuffled_train.select(range(1))[0]few_shot_examples.append(example)shuffled_train=shuffled_train.filter(lambdax:x["text"]!=example["text"])caseTaskGroup.TOKEN_CLASSIFICATION:labels=it.cycle([label.lower()forlabelinself.dataset_config.labelsiflabel.lower().startswith("b-")])while(len(few_shot_examples)<num_few_shotsandlen(shuffled_train)>0):label=next(labels)possible_examples=shuffled_train.filter(lambdax:labelin[tag.lower()fortaginx["labels"]])iflen(possible_examples)==0:continueexample=possible_examples.select(range(1))[0]few_shot_examples.append(example)shuffled_train=shuffled_train.filter(lambdax:x["tokens"]!=example["tokens"])caseTaskGroup.QUESTION_ANSWERING:# Locate the maximum number of tokens that constitutes a short exampleformax_num_tokensin[512,1024,2048,4096,8192]:train_with_short_examples=dataset["train"].filter(lambdaexample:len(example["context"])<max_num_tokens)num_short_examples=len(train_with_short_examples)ifnum_short_examples>=self.dataset_config.num_few_shot_examples:breakelse:raiseInvalidBenchmark("Could not find enough short examples for few-shot learning.")shuffled_train=train_with_short_examples.shuffle(seed=random_seed)while(len(few_shot_examples)<num_few_shotsandlen(shuffled_train)>0):example=shuffled_train.select(range(1))[0]few_shot_examples.append(example)shuffled_train=shuffled_train.filter(lambdax:x["context"]!=example["context"])case _:raiseNotImplementedError(f"Unsupported task group: {task.task_group}.")random.seed(random_seed)random.shuffle(few_shot_examples)returnfew_shot_examplesdef_apply_prompt(self,examples:dict[str,t.Any],few_shot_examples:list[dict[str,t.Any]],task:Task,)->dict[str,t.Any]:        """Apply prompt template to an example, potentially with few-shot examples.        Args:            examples:                The examples to apply the few-shot examples to.            few_shot_examples:                The few-shot examples to apply.            task:                The task that is being benchmarked.        Returns:            The example with the few-shot examples applied.        """defcreate_prompt(**kwargs)->tuple[str,str]:            """Create a prompt from the given keyword arguments.            Args:                kwargs:                    The keyword arguments to use in the prompt.            Returns:                A pair (prompt, label), where "label" is an empty string if the model is                not instruction tuned (as in this case it is included in the prompt).            """label_key="label"if"label"inkwargselse"target_text"label=kwargs.pop(label_key)assert(labelisnotNone),f"Found a None label for the prompt: {kwargs}. This should not happen."label_mapping=self.dataset_config.prompt_label_mappinglabel=label_mapping.get(label,label)ifself.buffer["instruction_model"]:prompt=self.dataset_config.instruction_prompt.format(**kwargs)returnprompt,labelelse:kwargs[label_key]=labelreturnself.dataset_config.prompt_template.format(**kwargs),""matchtask.task_group:case(TaskGroup.SEQUENCE_CLASSIFICATION|TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION):few_shot_sections=[create_prompt(text=example["text"].replace("\n"," ").strip(),label=example["label"].replace("\n"," ").strip(),)forexampleinfew_shot_examples]new_sections=[create_prompt(text=text.replace("\n"," ").strip(),label="")fortextinexamples["text"]]caseTaskGroup.TEXT_TO_TEXT:few_shot_sections=[create_prompt(text=example["text"].replace("\n"," ").strip(),target_text=example["target_text"].replace("\n"," ").strip(),)forexampleinfew_shot_examples]new_sections=[create_prompt(text=text.replace("\n"," ").strip(),target_text="")fortextinexamples["text"]]caseTaskGroup.TOKEN_CLASSIFICATION:defcreate_label(example:dict)->str:prompt_labels=self.dataset_config.prompt_label_mapping.values()labels:dict[str,list[str]]={prompt_label:list()forprompt_labelinprompt_labels}fortoken,labelinzip(example["tokens"],example["labels"]):label=label.lower()iflabel=="o":continueprompt_label=self.dataset_config.prompt_label_mapping[label]iflabel.startswith("b-"):labels[prompt_label].append(token)eliflabel.startswith("i-"):labels[prompt_label][-1]+=" "+tokenreturnjson.dumps(labels,ensure_ascii=False)few_shot_sections=[create_prompt(text=" ".join(example["tokens"]).replace("\n"," ").strip(),label=create_label(example=example),)forexampleinfew_shot_examples]new_sections=[create_prompt(text=" ".join(tokens).replace("\n"," ").strip(),label="")fortokensinexamples["tokens"]]caseTaskGroup.QUESTION_ANSWERING:few_shot_sections=[create_prompt(text=example["context"].replace("\n"," ").strip(),question=example["question"].replace("\n"," ").strip(),label=example["answers"]["text"][0].replace("\n"," "),)forexampleinfew_shot_examples]new_sections=[create_prompt(text=context.replace("\n"," ").strip(),question=question.replace("\n"," ").strip(),label="",)forcontext,questioninzip(examples["context"],examples["question"])]case _:raiseNotImplementedError(f"Unsupported task group: {task.task_group}.")ifself.buffer["instruction_model"]:few_shot_messages=[dict(role=role,content=content)forprompt,labelinfew_shot_sectionsforrole,contentin[("user",prompt),("assistant",label)]]messages_list=[few_shot_messages+[dict(role="user",content=prompt)]forprompt,_innew_sections]# Pick the chat template that matches the language of the dataset, if such a# template existschat_template:str|None=Noneifisinstance(self._tokenizer.chat_template,dict):language_codes=[language.codeforlanguageinself.dataset_config.languages]forname,candidate_templateinself._tokenizer.chat_template.items():ifname.lower()inlanguage_codes:chat_template=candidate_templatelog_once(f"Using the {name!r} chat template for the tokenizer.",level=logging.DEBUG,)breaktexts=[self._tokenizer.apply_chat_template(conversation=messages,tokenize=False,add_generation_prompt=True,chat_template=chat_template,)formessagesinmessages_list]examples["text"]=textselse:prompt_prefix=""ifself.dataset_config.prompt_prefix:prompt_prefix=self.dataset_config.prompt_prefix+"\n\n"few_shot_prompt="\n\n".join([promptforprompt,_infew_shot_sections])iffew_shot_prompt:few_shot_prompt+="\n\n"examples["text"]=[prompt_prefix+few_shot_prompt+new_promptfornew_prompt,_innew_sections]returnexamples@propertydocs
defdata_collator(self)->c.Callable[[list[t.Any]],dict[str,t.Any]]:        """The data collator used to prepare samples during finetuning.        Returns:            The data collator.        """raiseNotImplementedError("The `data_collator` property has not been implemented for vLLM models.")@propertydeftrainer_class(self)->t.Type["Trainer"]:docs
        """The Trainer class to use for finetuning.        Returns:            The Trainer class.        """raiseNotImplementedError("The `trainer_class` property has not been implemented for vLLM models.")defload_model_and_tokenizer(docs
model_config:ModelConfig,benchmark_config:BenchmarkConfig,output_scores:bool)->"tuple[LLM, PreTrainedTokenizer]":    """Load the model and tokenizer.    Args:        model_config:            The model configuration.        benchmark_config:            The benchmark configuration.        output_scores:            Whether to output scores.    Returns:        The loaded model and tokenizer.    """# Prefer base model ID if the model is an adapter - the adapter will be added on# during inference in this casemodel_id=model_config.adapter_base_model_idormodel_config.model_idhf_model_config=load_hf_model_config(model_id=model_id,num_labels=0,id2label=dict(),label2id=dict(),revision=model_config.revision,model_cache_dir=model_config.model_cache_dir,api_key=benchmark_config.api_key,trust_remote_code=benchmark_config.trust_remote_code,run_with_cli=benchmark_config.run_with_cli,)quantization=Noneifhasattr(hf_model_config,"quantization_config"):quantization=hf_model_config.quantization_config.get("quant_method")# The quantised models require extra dependenciesifquantization=="gptq"and(importlib.util.find_spec("auto_gptq")isNoneorimportlib.util.find_spec("optimum")isNone):raiseNeedsExtraInstalled(extra="quantization")ifquantization=="awq"andimportlib.util.find_spec("awq")isNone:raiseNeedsExtraInstalled(extra="quantization")dtype:str|torch.dtype="auto"ifquantizationisnotNoneandhf_model_config.torch_dtype!=torch.float16:logger.info("You are loading a quantized model with dtype "f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting ""dtype to float16 instead.")dtype=torch.float16ifmodel_config.adapter_base_model_idisnotNone:download_dir=str(Path(model_config.model_cache_dir)/"base_model")else:download_dir=str(model_config.model_cache_dir)potential_max_model_length_config_names=["max_position_embeddings","max_sequence_length","model_max_length","sliding_window","sliding_window_size","n_positions",]true_max_model_len_candidates:list[int]=list()forconfig_nameinpotential_max_model_length_config_names:ifhasattr(hf_model_config,config_name):model_len=getattr(hf_model_config,config_name)ifmodel_lenisnotNone:true_max_model_len_candidates.append(model_len)iflen(true_max_model_len_candidates)>0:true_max_model_len=min(true_max_model_len_candidates)else:true_max_model_len=5_000clear_vllm()executor_backend="ray"iftorch.cuda.device_count()>1else"mp"try:model=LLM(model=model_id,tokenizer=model_id,gpu_memory_utilization=0.95,max_model_len=min(true_max_model_len,5_000),download_dir=download_dir,trust_remote_code=benchmark_config.trust_remote_code,revision=model_config.revision,seed=4242,distributed_executor_backend=executor_backend,tensor_parallel_size=torch.cuda.device_count(),disable_custom_all_reduce=True,quantization=quantization,dtype=dtype,enforce_eager=True,max_logprobs=MAX_LOGPROBSifoutput_scoreselseNone,# TEMP: Prefix caching isn't supported with sliding window in vLLM yet,# so we disable it for nowenable_prefix_caching=False,enable_lora=model_config.adapter_base_model_idisnotNone,max_lora_rank=256,)except(ValueError,OSError)ase:if"awaiting a review from the repo authors"instr(e):raiseInvalidModel(f"The model {model_id!r} is awaiting a review from the repository ""authors. Please try again later.")elif"trust_remote_code"instr(e):raiseInvalidModel(f"Loading the model {model_id!r} needs to trust remote code. ""If you trust the suppliers of this model, then you can enable ""this by setting the `--trust-remote-code` flag.")raiseInvalidModel(f"The model {model_id!r} could not be loaded. The error was {e!r}.")model._run_engine=MethodType(_run_engine_with_fixed_progress_bars,model)model.config=hf_model_configtokenizer=load_tokenizer(model_id=model_config.model_id,revision=model_config.revision,adapter_base_model_id=model_config.adapter_base_model_id,trust_remote_code=benchmark_config.trust_remote_code,model_max_length=true_max_model_len,model_cache_dir=model_config.model_cache_dir,token=benchmark_config.api_keyoros.getenv("HUGGINGFACE_API_KEY")orTrue,)returnmodel,tokenizerdefload_tokenizer(docs
model_id:str,revision:str,adapter_base_model_id:str|None,trust_remote_code:bool,model_max_length:int,model_cache_dir:str,token:str|bool,)->"PreTrainedTokenizer":    """Load the tokenizer.    Args:        model_id:            The model identifier.        revision:            The revision of the model.        adapter_base_model_id:            The base model ID for the adapter model. Can be None if the model is not an            adapter model.        trust_remote_code:            Whether to trust remote code.        model_max_length:            The maximum length of the model.        model_cache_dir:            The cache directory for the model.        token:            The Hugging Face API token.    Returns:        The loaded tokenizer.    """config=AutoConfig.from_pretrained(adapter_base_model_idormodel_id,revision=revision,cache_dir=model_cache_dir,token=token,trust_remote_code=trust_remote_code,)num_retries=5for_inrange(num_retries):try:tokenizer=AutoTokenizer.from_pretrained(model_id,use_fast=True,verbose=False,trust_remote_code=trust_remote_code,padding_side="left",truncation_side="left",model_max_length=model_max_length,config=config,token=token,)breakexcept(json.JSONDecodeError,OSError,TypeError)ase:ifadapter_base_model_idisNoneormodel_id==adapter_base_model_id:raiseInvalidModel(f"Could not load tokenizer for model {model_id!r}. The error was "f"{str(e)}.")logger.debug(f"Could not load tokenizer for {model_id!r}. Falling back to "f"{adapter_base_model_id!r}.")model_id=adapter_base_model_idexcept(TimeoutError,RequestError):logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")sleep(5)continueelse:raiseInvalidModel(f"Could not load tokenizer for model {model_id!r} after {num_retries} ""attempts.")iftokenizer.pad_token_idisNone:tokenizer.pad_token=tokenizer.eos_tokenreturntokenizerdef_run_engine_with_fixed_progress_bars(self:"LLM",use_tqdm:bool)->list["RequestOutput"]:ifuse_tqdm:num_requests=self.llm_engine.get_num_unfinished_requests()pbar=tqdm(total=num_requests,leave=False,disable=hasattr(sys,"_called_from_test"))else:pbar=None# Run the engine.outputs:list["RequestOutput"]=list()whileself.llm_engine.has_unfinished_requests():step_outputs=self.llm_engine.step()foroutputinstep_outputs:ifoutput.finished:outputs.append(output)ifpbarisnotNone:pbar.update(1)ifpbarisnotNone:pbar.close()# Sort the outputs by request ID. This is necessary because some requests may be# finished earlier than its previous requests.outputs=sorted(outputs,key=lambdax:int(x.request_id))returnoutputsdefclear_vllm()->None:docs
    """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""try:destroy_model_parallel()exceptImportError:passclear_memory()ifray.is_initialized():ray.shutdown()defget_end_of_reasoning_token_id(docs
model:"LLM",tokenizer:"PreTrainedTokenizer")->int|None:    """Get the end of reasoning token ID for a generative model.    This assumes that the reasoning token is of the form <X> and that the end of    reasoning token is </X> (for X being any string without spaces).    Args:        model:            The vLLM model.        tokenizer:            The tokenizer.    Returns:        The end of reasoning token ID, or None if it could not be found.    """iftokenizer.chat_templateisNone:prompt="What is your name?"else:prompt=tokenizer.apply_chat_template(conversation=[dict(role="user",content="What is your name?")],add_generation_prompt=True,tokenize=False,)# Generate a completion and remove the BOS token from it, to not confuse it with the# potential reasoning tokencompletion=(model.generate(prompts=[prompt],sampling_params=SamplingParams(max_tokens=3,temperature=0.0),use_tqdm=False,)[0].outputs[0].text)iftokenizer.bos_tokenisnotNone:completion=completion.replace(tokenizer.bos_token,"").strip()# If it doesn't contain a reasoning token, we can't find the end of reasoning tokenmatch=re.search(pattern=r"<\w+>",string=completion)ifmatchisNone:log_once(message=("Could not find a reasoning token, so assuming the model is not a ""reasoning model."),level=logging.DEBUG,)returnNone# Check that the found reasoning token and its associated end-of-reasoning tokens# are both special tokensreasoning_token=match.group()end_of_reasoning_token=f"</{reasoning_token[1:-1]}>"special_tokens=[decoder_token.contentfordecoder_tokenintokenizer.added_tokens_decoder.values()]special_tokens.extend([encoder_tokenforencoder_tokenintokenizer.added_tokens_encoder.keys()])special_tokens.extend(tokenizer.all_special_tokens)if(reasoning_tokennotinspecial_tokensorend_of_reasoning_tokennotinspecial_tokens):log_once(message=(f"Detected reasoning token {reasoning_token!r} and end of reasoning "f"token {end_of_reasoning_token!r}, but one of them is not registered ""as a special token, so assuming it is not a real reasoning token."),level=logging.DEBUG,)returnNonelog_once(message=f"Detected reasoning token {reasoning_token!r}.",level=logging.DEBUG)# Encode the end of reasoning token and return its IDend_of_reasoning_token_id=tokenizer.encode(text=end_of_reasoning_token,add_special_tokens=False)[0]returnend_of_reasoning_token_id