# scandeval.human_evaluation
#
# Source listing of the `scandeval.human_evaluation` module.
"""Gradio app for conducting human evaluation of the tasks."""importimportlib.utilimportjsonimportloggingfromcollectionsimportdefaultdictfromfunctoolsimportpartialfrompathlibimportPathimportclickfromdatasetsimportDatasetfrom.benchmark_config_factoryimportbuild_benchmark_configfrom.data_loadingimportload_datafrom.data_modelsimportBenchmarkResult,GenerativeModelOutputfrom.dataset_configsimportSPEED_CONFIG,get_all_dataset_configsfrom.enumsimportGenerativeType,TaskGroupfrom.exceptionsimportNeedsExtraInstalledfrom.scoresimportaggregate_scoresfrom.task_utilsimport(question_answering,sequence_classification,text_to_text,token_classification,)from.tasksimportNERfrom.typesimportComputeMetricsFunction,ExtractLabelsFunction,ScoreDictfrom.utilsimportenforce_reproducibilityifimportlib.util.find_spec("gradio")isnotNone:importgradioasgrlogger=logging.getLogger("scandeval")classHumanEvaluator:docs
    """An app for evaluating human performance on the ScandEval benchmark."""def__init__(self,annotator_id:int,title:str,description:str,dummy_model_id:str="mistralai/Mistral-7B-v0.1",)->None:        """Initialize the HumanEvaluator.        Args:            annotator_id:                The annotator ID for the evaluation.            title:                The title of the app.            description:                The description of the app.            dummy_model_id:                The model ID to use for generating prompts.        """self.annotator_id=annotator_idself.title=titleself.description=descriptionself.dummy_model_id=dummy_model_idself.sample_idx:intself.active_dataset:Datasetself.dataset_configs={name:cfgforname,cfginget_all_dataset_configs().items()ifnotcfg.unofficial}self.tasks=sorted({cfg.task.name.replace("-"," ").title()forcfginself.dataset_configs.values()ifcfg!=SPEED_CONFIG})self.languages=sorted({language.nameforcfginself.dataset_configs.values()ifcfg!=SPEED_CONFIGforlanguageincfg.languagesiflanguage.namenotin{"Norwegian Bokmål","Norwegian Nynorsk"}})self.extract_labels_from_generation:ExtractLabelsFunctionself.compute_metrics:ComputeMetricsFunctiondefcreate_app(self)->"gr.Blocks":docs
        """Create the Gradio app for human evaluation.        Returns:            The Gradio app for human evaluation.        """withgr.Blocks(title=self.title,theme=gr.themes.Monochrome())asapp:gr.components.HTML(f"<center><h1>{self.title}</h1></center>")gr.components.Markdown(self.description)withgr.Row(variant="panel"):language_dropdown=gr.Dropdown(label="Language",choices=self.languages)task_dropdown=gr.Dropdown(label="Task",choices=self.tasks)dataset_dropdown=gr.Dropdown(label="Dataset",choices=[""])withgr.Row(variant="panel"):withgr.Column():task_examples=gr.Markdown("Task Examples",visible=False)withgr.Column():question=gr.Markdown(label="Question",visible=False)withgr.Row():ner_tag_dropdown=gr.Dropdown(label="Entity type",choices=[""],interactive=True,visible=False,scale=0.5,# type: ignore[arg-type])ner_tag_answer=gr.Textbox(label="Entity",interactive=True,visible=False,scale=1)withgr.Column(scale=0.2):# type: ignore[arg-type]ner_tag_add_button=gr.Button("Add entity",visible=False)ner_tag_reset_button=gr.Button("Reset 
entities",visible=False)answer=gr.Textbox(label="Answer",visible=False)submit_button=gr.Button("Submit",visible=False)language_dropdown.change(fn=self.update_dataset_choices,inputs=[language_dropdown,task_dropdown],outputs=dataset_dropdown,)task_dropdown.change(fn=self.update_dataset_choices,inputs=[language_dropdown,task_dropdown],outputs=dataset_dropdown,)dataset_dropdown.change(fn=partial(self.update_dataset,iteration=self.annotator_id),inputs=dataset_dropdown,outputs=[task_examples,question,ner_tag_dropdown,ner_tag_answer,ner_tag_add_button,ner_tag_reset_button,answer,submit_button,],)ner_tag_add_button.click(fn=self.add_entity_to_answer,inputs=[question,ner_tag_dropdown,ner_tag_answer,answer],outputs=[ner_tag_answer,answer],)ner_tag_answer.submit(fn=self.add_entity_to_answer,inputs=[question,ner_tag_dropdown,ner_tag_answer,answer],outputs=[ner_tag_answer,answer],)ner_tag_reset_button.click(fn=self.reset_entities,outputs=answer)submit_button.click(fn=partial(self.submit_answer,annotator_id=self.annotator_id),inputs=[dataset_dropdown,question,answer],outputs=[question,answer],)answer.submit(fn=partial(self.submit_answer,annotator_id=self.annotator_id),inputs=[dataset_dropdown,question,answer],outputs=[question,answer],)returnappdefupdate_dataset_choices(docs
self,language:str|None,task:str|None)->"gr.Dropdown":        """Update the dataset choices based on the selected language and task.        Args:            language:                The language selected by the user.            task:                The task selected by the user.        Returns:            A list of dataset names that match the selected language and task.        """iflanguageisNoneortaskisNone:returngr.Dropdown(choices=[])dataset_configs=[cfgforcfginget_all_dataset_configs().values()iflanguagein{language.nameforlanguageincfg.languages}andtask.lower().replace(" ","-")==cfg.task.nameandnotcfg.unofficial]assertlen(dataset_configs)>0choices=sorted([cfg.nameforcfgindataset_configs])logger.info(f"User selected {language} and {task}, which resulted in the datasets "f"{choices}, with {choices[0]!r} being chosen by default.")returngr.Dropdown(choices=choices,value=choices[0])defupdate_dataset(docs
self,dataset_name:str,iteration:int)->"tuple[gr.Markdown, gr.Markdown, gr.Dropdown, gr.Textbox, gr.Button, gr.Button, gr.Textbox, gr.Button]":        """Update the dataset based on a selected dataset name.        Args:            dataset_name:                The dataset name selected by the user.            iteration:                The iteration index of the datasets to evaluate.        Returns:            A tuple (task_examples, question, entity_type, entity, entity_add_button,            entity_reset_button, answer, submit_button) for the selected dataset.        """blank_answer=(gr.Markdown("",visible=False),gr.Markdown("",visible=False),gr.Dropdown(visible=False),gr.Textbox(visible=False),gr.Button(visible=False),gr.Button(visible=False),gr.Textbox("",visible=False),gr.Button(visible=False),)ifnotdataset_name:returnblank_answerlogger.info(f"User selected dataset {dataset_name} - loading dataset...")gr.Info(f"Loading dataset {dataset_name}...")benchmark_config=build_benchmark_config(progress_bar=False,save_results=True,task=None,dataset=None,language=[language.codeforcfginget_all_dataset_configs().values()forlanguageincfg.languagesifnotcfg.unofficial],model_language=None,dataset_language=None,device=None,batch_size=1,raise_errors=False,cache_dir=".scandeval_cache",api_key=None,force=False,verbose=False,trust_remote_code=False,use_flash_attention=None,clear_model_cache=False,evaluate_test_split=False,few_shot=True,num_iterations=iteration+1,api_base=None,api_version=None,debug=False,run_with_cli=True,only_allow_safetensors=False,)self.dataset_config=get_all_dataset_configs()[dataset_name]# TODO: Is this needed?# model_id = f"human-{iteration}"# model_config = ModelConfig(#     model_id=model_id,#     revision="main",#     task="text-generation",#     languages=dataset_config.languages,#     model_type=ModelType.HUMAN,#     model_cache_dir=create_model_cache_dir(#         cache_dir=benchmark_config.cache_dir, model_id=model_id#     ),#     
adapter_base_model_id=None,# )self.sample_idx=0dataset_path=(Path(".scandeval_cache")/"human-evaluation"/dataset_name/f"human-{iteration}.csv")ifdataset_path.exists():active_dataset=Dataset.from_csv(str(dataset_path))assertisinstance(active_dataset,Dataset)self.active_dataset=active_datasettry:whileself.active_dataset["answer"][self.sample_idx]isnotNone:self.sample_idx+=1exceptIndexError:self.compute_and_log_scores()returnblank_answerelse:rng=enforce_reproducibility()datasets=load_data(rng=rng,dataset_config=self.dataset_config,benchmark_config=benchmark_config,)# TODO: Prepare data?self.active_dataset=(datasets[iteration]["test"].remove_columns(column_names=["input_ids","attention_mask"],new_fingerprint=datasets[iteration]["test"]._fingerprint,).add_column(name="answer",column=[None]*len(datasets[iteration]["test"]),new_fingerprint=datasets[iteration]["test"]._fingerprint,))ifself.dataset_config.task==NER:labels_in_train:set[str]={tagfortag_listinself.active_dataset["labels"]fortagintag_list}self.has_misc_tags=("B-MISC"inlabels_in_trainor"I-MISC"inlabels_in_train)matchself.dataset_config.task.task_group:caseTaskGroup.SEQUENCE_CLASSIFICATION:self.compute_metrics=partial(sequence_classification.compute_metrics,dataset_config=self.dataset_config,benchmark_config=benchmark_config,)self.extract_labels_from_generation=partial(sequence_classification.extract_labels_from_generation,dataset_config=self.dataset_config,)caseTaskGroup.TEXT_TO_TEXT:self.compute_metrics=partial(text_to_text.compute_metrics,dataset_config=self.dataset_config,benchmark_config=benchmark_config,)self.extract_labels_from_generation=(text_to_text.extract_labels_from_generation)caseTaskGroup.TOKEN_CLASSIFICATION:self.compute_metrics=partial(token_classification.compute_metrics,has_misc_tags=self.has_misc_tags,dataset_config=self.dataset_config,benchmark_config=benchmark_config,)self.extract_labels_from_generation=partial(token_classification.extract_labels_from_generation,dataset_config=self.dataset_co
nfig,)caseTaskGroup.QUESTION_ANSWERING:self.compute_metrics=partial(question_answering.compute_metrics,dataset_config=self.dataset_config,benchmark_config=benchmark_config,)self.extract_labels_from_generation=(question_answering.extract_labels_from_generation)caseTaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:raiseNotImplementedErrorcase _:raiseNotImplementedError(f"Task group {self.dataset_config.task.task_group} is not ""supported.")task_examples,question=self.example_to_markdown(example=self.active_dataset[self.sample_idx])logger.info(f"Loaded dataset {dataset_name}, with the following task examples:\n\n"f"{task_examples}")ifself.dataset_config.task==NER:ner_tags=list()forner_taginself.dataset_config.prompt_label_mapping.values():ifner_tagnotinner_tags:ner_tags.append(ner_tag)return(gr.Markdown(task_examples,visible=True),gr.Markdown(question,visible=True),gr.Dropdown(label="Entity type",choices=ner_tags,value=ner_tags[0],visible=True,),gr.Textbox(label="Entity",interactive=True,visible=True),gr.Button("Add entity",visible=True),gr.Button("Reset entities",visible=True),gr.Textbox(json.dumps({ner_tag:[]forner_taginner_tags}),interactive=False,visible=True,),gr.Button("Submit",visible=True),)else:return(gr.Markdown(task_examples,visible=True),gr.Markdown(question,visible=True),gr.Dropdown(label="Entity type",choices=[],visible=False),gr.Textbox(label="Entity",interactive=True,visible=False),gr.Button("Add entity",visible=False),gr.Button("Reset entities",visible=False),gr.Textbox("",interactive=True,visible=True),gr.Button("Submit",visible=True),)defadd_entity_to_answer(docs
self,question:str,entity_type:str,entity:str,answer:str)->"tuple[gr.Textbox, gr.Textbox]":        """Add an entity to the answer.        Args:            question:                The current question.            entity_type:                The entity type selected by the user.            entity:                The entity provided by the user.            answer:                The current answer.        Returns:            A tuple (entity, answer) with a (blank) entity and answer.        """ifnotentity_typeornotentity:returngr.Textbox(""),gr.Textbox("")ifentitynotinquestion:gr.Warning(f"The entity {entity!r} is not present in the question. Please ""write it *exactly* as it appears in the question.")returngr.Textbox(entity),gr.Textbox(answer)current_answer_obj=json.loads(answer)ifentitynotincurrent_answer_obj[entity_type]:current_answer_obj[entity_type].append(entity)answer=json.dumps(current_answer_obj)returngr.Textbox(""),gr.Textbox(answer)defreset_entities(self)->"gr.Textbox":docs
        """Reset the entities in the answer.        Returns:            A blank answer.        """ner_tags=list()forner_taginself.dataset_config.prompt_label_mapping.values():ifner_tagnotinner_tags:ner_tags.append(ner_tag)returngr.Textbox(json.dumps({ner_tag:[]forner_taginner_tags}))defsubmit_answer(docs
self,dataset_name:str,question:str,answer:str,annotator_id:int)->tuple[str,str]:        """Submit an answer to the dataset.        Args:            dataset_name:                The name of the dataset.            question:                The question for the dataset.            answer:                The answer to the question.            annotator_id:                The annotator ID for the evaluation.        Returns:            A tuple (question, answer), with `question` being the next question, and            `answer` being an empty string.        """ifnotanswer:gr.Warning("Please provide an answer before submitting.")logger.info("User tried to submit without providing an answer.")returnquestion,answer# Custom NER validationifself.dataset_config.task==NER:try:json.loads(answer)exceptjson.JSONDecodeError:gr.Warning("Please provide a valid JSON object as an answer.")logger.info("User tried to submit an invalid JSON object as an answer.")returnquestion,answerifnotisinstance(json.loads(answer),dict):gr.Warning("Please provide a JSON object with a dictionary as an answer.")logger.info("User tried to submit a JSON object without a dictionary as an answer.")returnquestion,answerner_tags=list(self.dataset_config.prompt_label_mapping.values())forner_taginner_tags:ifner_tagnotinjson.loads(answer).keys():gr.Warning(f"Please provide a JSON object with the key {ner_tag!r}.")logger.info("User tried to submit a JSON object without the key "f"{ner_tag!r}.")returnquestion,answersamples_left=len(self.active_dataset)-self.sample_idx-1ifsamples_left:gr.Info(f"Submitted - {samples_left} to go!")# Store the user's answeranswers=self.active_dataset["answer"]answers[self.sample_idx]=answerself.active_dataset=self.active_dataset.remove_columns(column_names=["answer"],new_fingerprint=self.active_dataset._fingerprint).add_column(name="answer",column=answers,new_fingerprint=self.active_dataset._fingerprint,)logger.info(f"User submitted the answer {answer!r} to the question {question!r}, 
with "f"sample index {self.sample_idx}.")dataset_path=(Path(".scandeval_cache")/"human-evaluation"/dataset_name/f"human-{annotator_id}.csv")dataset_path.parent.mkdir(parents=True,exist_ok=True)self.active_dataset.to_csv(dataset_path)# Attempt to get the next questiontry:self.sample_idx+=1_,question=self.example_to_markdown(example=self.active_dataset[self.sample_idx])ifself.dataset_config.task==NER:ner_tags=list()forner_taginself.dataset_config.prompt_label_mapping.values():ifner_tagnotinner_tags:ner_tags.append(ner_tag)answer=json.dumps({ner_tag:[]forner_taginner_tags})else:answer=""# If we fail to get the next question it means that the user has finished# annotating the dataset, so we compute and log the scoresexceptIndexError:self.compute_and_log_scores()question=""answer=""returnquestion,answerdefexample_to_markdown(self,example:dict)->tuple[str,str]:docs
        """Convert an example to a Markdown string.        Args:            example:                The example to convert.        Returns:            A tuple (task_examples, question) for the example.        """task_examples:str|list[str]=[sample.replace("\n","\n\n")forsampleinexample["text"].split("\n\n")[:-1]]task_examples="\n\n**Example**\n\n".join(task_examples)question="**Question**\n\n"question+="\n\n".join(example["text"].split("\n\n")[-1].split("\n")[:-1])question+="\n\n"+example["text"].split("\n\n")[-1].split("\n")[-1]returntask_examples,questiondefcompute_and_log_scores(self)->None:docs
        """Computes and logs the scores for the dataset."""model_output=GenerativeModelOutput(sequences=self.active_dataset["answer"])active_dataset_dict=self.active_dataset.to_dict()assertisinstance(active_dataset_dict,dict)all_preds=self.extract_labels_from_generation(input_batch=active_dataset_dict,model_output=model_output)ground_truth=self.active_dataset["label"]itr_scores:dict[str,float]=self.compute_metrics(model_outputs_and_labels=(all_preds,ground_truth))# We reverse the order, as the Info messages are printed in reverse orderscores=list(itr_scores.items())scores.reverse()gr.Info("If you want to evaluate another dataset then please select a new ""one from the menus.")formetric_name,scoreinscores:gr.Info(f"\n\n{metric_name}: {score:.2%}")gr.Info("You have completed this dataset! Here are your scores:")logger.info(f"User completed the dataset {self.dataset_config.name!r}"f", with the following scores: {itr_scores}")# Load previous human results, if any. We do this since the human evaluation is# only a single iteration, so the results from the current annotation should be# added to the previous results.results_path=Path.cwd()/"scandeval_benchmark_results.jsonl"results:ScoreDict=defaultdict(list)ifresults_path.exists():all_results=[json.loads(line.strip())forlineinresults_path.read_text().strip().split("\n")ifline.strip()]human_result_candidates=[resultforresultinall_resultsifresult["model"]=="human"andresult["dataset"]==self.dataset_config.name]ifhuman_result_candidates:results=human_result_candidates[0]["results"]# Append to resultsresults["raw"].append(# type: ignore[union-attr]{f"test_{metric_name}":scoreformetric_name,scoreinitr_scores.items()})# Aggregate scorestotal_dict:dict[str,float]=dict()formetric_cfginself.dataset_config.task.metrics:test_score,test_se=aggregate_scores(scores=results["raw"],# type: 
ignore[arg-type]metric_config=metric_cfg,)test_score,_=metric_cfg.postprocessing_fn(test_score)test_se,_=metric_cfg.postprocessing_fn(test_se)total_dict[f"test_{metric_cfg.name}"]=test_scoretotal_dict[f"test_{metric_cfg.name}_se"]=test_seresults["total"]=total_dictbenchmark_result=BenchmarkResult(dataset=self.dataset_config.name,task=self.dataset_config.task.name,dataset_languages=[language.codeforlanguageinself.dataset_config.languages],model="human",results=results,num_model_parameters=-1,max_sequence_length=-1,vocabulary_size=-1,merge=False,generative=True,generative_type=GenerativeType.INSTRUCTION_TUNED,few_shot=True,validation_split=True,)benchmark_result.append_to_results(results_path=results_path)@click.command()@click.option("--annotator-id","-id",type=int,required=True,help="""The annotator ID to use for the evaluation. Needs to be between 0 and 10,    inclusive.""",)defmain(annotator_id:int)->None:docs
    """Start the Gradio app for human evaluation."""ifimportlib.util.find_spec("gradio")isNone:raiseNeedsExtraInstalled(extra="human_evaluation")evaluator=HumanEvaluator(annotator_id=annotator_id,title="ScandEval Human Evaluation",description="""        In this app we will evaluate your performance on a variety of tasks, with the        goal of comparing human performance to language model performance.        When you select a language and a task then you will be given a brief        description of the task, as well as examples of how to solve it. Please read        through these examples before proceeding with the task.        Please do not use any additional aids (such as search engines) when completing        these tasks.        Note that several examples appear more than once - this is intentional, as it        allows us to compare your performance across multiple examples.        Note that the Enter key will also submit your answer!        """,)evaluator.create_app().queue().launch()if__name__=="__main__":main()