"""Factory class for creating benchmark configurations."""

import importlib.util
import logging
import sys
import typing as t

import torch

from .data_models import BenchmarkConfig
from .dataset_configs import get_all_dataset_configs
from .enums import Device
from .exceptions import InvalidBenchmark
from .languages import get_all_languages
from .tasks import get_all_tasks
from .utils import log_once

if t.TYPE_CHECKING:
    from .data_models import Language, Task

logger = logging.getLogger("scandeval")


def build_benchmark_config(
progress_bar: bool,
save_results: bool,
task: str | list[str] | None,
dataset: str | list[str] | None,
language: str | list[str],
model_language: str | list[str] | None,
dataset_language: str | list[str] | None,
device: Device | None,
batch_size: int,
raise_errors: bool,
cache_dir: str,
api_key: str | None,
force: bool,
verbose: bool,
trust_remote_code: bool,
use_flash_attention: bool | None,
clear_model_cache: bool,
evaluate_test_split: bool,
few_shot: bool,
num_iterations: int,
api_base: str | None,
api_version: str | None,
debug: bool,
run_with_cli: bool,
only_allow_safetensors: bool,
first_time: bool = False,
) -> BenchmarkConfig:
    """Create a benchmark configuration.

    Args:
progress_bar:
Whether to show a progress bar when running the benchmark.
save_results:
Whether to save the benchmark results to a file.
        task:
            The tasks to include in the benchmark. If None then the datasets will not
            be filtered by task.
        dataset:
            The datasets to include in the benchmark. If None then all datasets will
            be included, limited by the `task` parameter.
language:
The language codes of the languages to include, both for models and
datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
to 'all' if all languages (also non-Scandinavian) should be considered.
model_language:
The language codes of the languages to include for models. If None then
the `language` parameter will be used.
dataset_language:
The language codes of the languages to include for datasets. If None then
the `language` parameter will be used.
device:
The device to use for running the models. If None then the device will be
set automatically.
batch_size:
The batch size to use for running the models.
raise_errors:
Whether to raise errors when running the benchmark.
cache_dir:
The directory to use for caching the models.
api_key:
The API key to use for a given inference server.
force:
Whether to force the benchmark to run even if the results are already
cached.
verbose:
Whether to print verbose output when running the benchmark. This is
automatically set if `debug` is True.
trust_remote_code:
Whether to trust remote code when running the benchmark.
use_flash_attention:
Whether to use Flash Attention for the models. If None then it will be used
if it is available.
clear_model_cache:
Whether to clear the model cache before running the benchmark.
evaluate_test_split:
Whether to use the test split for the datasets.
few_shot:
Whether to use few-shot learning for the models.
num_iterations:
The number of iterations each model should be evaluated for.
api_base:
The base URL for a given inference API. Only relevant if `model` refers to a
model on an inference API.
api_version:
The version of the API to use for a given inference API.
debug:
Whether to run the benchmark in debug mode.
run_with_cli:
Whether the benchmark is being run with the CLI.
only_allow_safetensors:
Whether to only allow evaluations of models stored as safetensors.
first_time:
Whether this is the first time the benchmark configuration is being created.
            Defaults to False.

    Returns:
The benchmark configuration.
"""
language_codes = get_correct_language_codes(language_codes=language)
model_languages = prepare_languages(
language_codes=model_language, default_language_codes=language_codes
)
dataset_languages = prepare_languages(
language_codes=dataset_language, default_language_codes=language_codes
)
tasks, datasets = prepare_tasks_and_datasets(
task=task, dataset=dataset, dataset_languages=dataset_languages
)
torch_device = prepare_device(device=device)
if use_flash_attention is None:
if torch_device.type != "cuda":
use_flash_attention = False
elif (
importlib.util.find_spec("flash_attn") is None
and importlib.util.find_spec("vllm_flash_attn") is None
):
use_flash_attention = False
if first_time and torch_device.type == "cuda":
message = (
"Flash attention has not been installed, so this will not be used. "
"To install it, run `pip install -U wheel && "
"FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn "
"--no-build-isolation`. Alternatively, you can disable this "
"message by setting "
)
if run_with_cli:
message += "the flag `--no-use-flash-attention`."
else:
message += (
"the argument `use_flash_attention=False` in the `Benchmarker`."
)
log_once(message=message, level=logging.INFO)
    # Only run a single iteration when executed from within the test suite
if hasattr(sys, "_called_from_test"):
num_iterations = 1
return BenchmarkConfig(
model_languages=model_languages,
dataset_languages=dataset_languages,
tasks=tasks,
datasets=datasets,
batch_size=batch_size,
raise_errors=raise_errors,
cache_dir=cache_dir,
api_key=api_key,
force=force,
progress_bar=progress_bar,
save_results=save_results,
verbose=verbose or debug,
device=torch_device,
trust_remote_code=trust_remote_code,
use_flash_attention=use_flash_attention,
clear_model_cache=clear_model_cache,
evaluate_test_split=evaluate_test_split,
few_shot=few_shot,
num_iterations=num_iterations,
api_base=api_base,
api_version=api_version,
debug=debug,
run_with_cli=run_with_cli,
only_allow_safetensors=only_allow_safetensors,
)
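

# Illustrative sketch of a call to `build_benchmark_config`, kept as a comment so the
# module stays side-effect free on import. The concrete values below (the
# "sentiment-classification" task name, Danish as the language, the cache directory
# and the batch size) are hypothetical examples, not defaults of this module:
#
#     benchmark_config = build_benchmark_config(
#         progress_bar=True,
#         save_results=True,
#         task="sentiment-classification",
#         dataset=None,
#         language="da",
#         model_language=None,
#         dataset_language=None,
#         device=None,
#         batch_size=32,
#         raise_errors=False,
#         cache_dir=".scandeval_cache",
#         api_key=None,
#         force=False,
#         verbose=False,
#         trust_remote_code=False,
#         use_flash_attention=None,
#         clear_model_cache=False,
#         evaluate_test_split=False,
#         few_shot=True,
#         num_iterations=10,
#         api_base=None,
#         api_version=None,
#         debug=False,
#         run_with_cli=False,
#         only_allow_safetensors=False,
#         first_time=True,
#     )
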
def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
    """Get correct language code(s).

    Args:
language_codes:
The language codes of the languages to include, both for models and
datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
            to 'all' if all languages (also non-Scandinavian) should be considered.

    Returns:
The correct language codes.
"""
# Create a dictionary that maps languages to their associated language objects
language_mapping = get_all_languages()
# Create the list `languages`
if "all" in language_codes:
languages = list(language_mapping.keys())
elif isinstance(language_codes, str):
languages = [language_codes]
else:
languages = language_codes
# If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
# either 'nb' or 'nn' are specified then also include 'no'.
if "no" in languages:
languages = list(set(languages) | {"nb", "nn"})
elif "nb" in languages or "nn" in languages:
languages = list(set(languages) | {"no"})
return languages
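

# Illustrative examples of the normalisation above, shown as comments rather than
# executed code. Note that a set is used internally, so the order of the returned
# codes is not guaranteed:
#
#     >>> get_correct_language_codes(language_codes="no")
#     ['no', 'nb', 'nn']    # 'no' pulls in both Bokmål ('nb') and Nynorsk ('nn')
#     >>> get_correct_language_codes(language_codes=["nb", "da"])
#     ['nb', 'da', 'no']    # 'nb' or 'nn' pulls in the umbrella code 'no'
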
def prepare_languages(
language_codes: str | list[str] | None, default_language_codes: list[str]
) -> list["Language"]:
    """Prepare language(s) for benchmarking.

    Args:
language_codes:
The language codes of the languages to include for models or datasets.
If specified then this overrides the `language` parameter for model or
dataset languages.
default_language_codes:
            The default language codes of the languages to include.

    Returns:
The prepared model or dataset languages.
"""
# Create a dictionary that maps languages to their associated language objects
language_mapping = get_all_languages()
# Create the list `languages_str` of language codes to use for models or datasets
languages_str: list[str]
if language_codes is None:
languages_str = default_language_codes
elif isinstance(language_codes, str):
languages_str = [language_codes]
else:
languages_str = language_codes
# Convert the model languages to language objects
if "all" in languages_str:
prepared_languages = list(language_mapping.values())
else:
prepared_languages = [language_mapping[language] for language in languages_str]
return prepared_languages
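

# Illustrative example (as a comment): with `language_codes=None` the default codes
# are used, so a call like
#
#     prepare_languages(language_codes=None, default_language_codes=["da", "sv"])
#
# would return the `Language` objects registered for Danish and Swedish in
# `get_all_languages`, while passing "all" returns every registered `Language`.
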
def prepare_tasks_and_datasets(
task: str | list[str] | None,
dataset_languages: list["Language"],
dataset: str | list[str] | None,
) -> tuple[list["Task"], list[str]]:
    """Prepare task(s) and dataset(s) for benchmarking.

    Args:
        task:
            The tasks to include in the benchmark. If None then the datasets will not
            be filtered by task.
dataset_languages:
The languages of the datasets in the benchmark.
        dataset:
            The datasets to include in the benchmark. If None then all datasets will
            be included, limited by the `task` and `dataset_languages` parameters.

    Returns:
        The prepared tasks and datasets.

    Raises:
InvalidBenchmark:
If the task or dataset is not found in the benchmark tasks or datasets.
"""
# Create a dictionary that maps benchmark tasks to their associated benchmark
# task objects, and a dictionary that maps dataset names to their associated
# dataset configuration objects
task_mapping = get_all_tasks()
all_dataset_configs = get_all_dataset_configs()
# Create the list of dataset tasks
try:
if task is None:
tasks = list(task_mapping.values())
elif isinstance(task, str):
tasks = [task_mapping[task]]
else:
tasks = [task_mapping[t] for t in task]
except KeyError as e:
raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e
all_official_datasets = [
dataset_name
for dataset_name, dataset_config in all_dataset_configs.items()
if not dataset_config.unofficial
]
if dataset is None:
dataset = all_official_datasets
elif isinstance(dataset, str):
dataset = [dataset]
all_datasets = list(all_dataset_configs.keys())
invalid_datasets = set(dataset) - set(all_datasets)
if invalid_datasets:
raise InvalidBenchmark(
f"Dataset(s) {', '.join(invalid_datasets)} not found in the benchmark "
"datasets."
)
datasets = [
dataset_name
for dataset_name, dataset_config in all_dataset_configs.items()
if dataset_name in dataset
and dataset_config.task in tasks
and set(dataset_config.languages).intersection(dataset_languages)
]
return tasks, datasets
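

# Illustrative example (as a comment), assuming "sentiment-classification" is one of
# the task names returned by `get_all_tasks`: the call below would select every
# official dataset whose task is sentiment classification and whose languages overlap
# with the given dataset languages.
#
#     tasks, datasets = prepare_tasks_and_datasets(
#         task="sentiment-classification",
#         dataset=None,
#         dataset_languages=prepare_languages(
#             language_codes="da", default_language_codes=["da"]
#         ),
#     )
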
def prepare_device(device: Device | None) -> torch.device:
    """Prepare device for benchmarking.

    Args:
device:
The device to use for running the models. If None then the device will be
            set automatically.

    Returns:
The prepared device.
"""
device_mapping = {
Device.CPU: torch.device("cpu"),
Device.CUDA: torch.device("cuda"),
Device.MPS: torch.device("mps"),
}
if isinstance(device, Device):
return device_mapping[device]
if torch.cuda.is_available():
return torch.device("cuda")
elif torch.backends.mps.is_available():
return torch.device("mps")
else:
return torch.device("cpu")
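

# Illustrative smoke test: resolve the device with the same auto-detection logic that
# `build_benchmark_config` relies on, and log the result.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    resolved_device = prepare_device(device=None)
    logger.info(f"Auto-detected torch device: {resolved_device}")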