# hub-docs/eval_results.yaml (huggingface/hub-docs, main branch)

# Annotated template for one evaluation result entry.
# Text in {braces} is a placeholder to be replaced with a real value.
- dataset:
    id: cais/hle                  # Required. A valid dataset ID from the Hub; the dataset should have a "Benchmark" tag.
                                  # In short, this is where the leaderboard lives.
    task_id: {task_id}            # Required. ID of the task, as defined in the dataset's eval.yaml.
                                  # A single dataset can define several tasks or leaderboards.
                                  # Example: GPQA defines gpqa_diamond, gpqa_main, gpqa_extended.
    revision: {dataset_revision}  # Optional. The dataset revision. Example: 5503434ddd753f426f4b38109466949a1217c2bb

  value: {metric_value}           # Required. The metric value. Example: 20.90

  verifyToken: {verify_token}     # Optional. If present, this is a signature that can be used to prove
                                  # that the evaluation is auditable and reproducible
                                  # (for example, that it was run in a HF Job using inspect-ai or lighteval).

  date: "{date}"                  # Optional. When this eval was run (ISO-8601 date or datetime).
                                  # Format this as a string. If not provided, it can default to this
                                  # file's creation time in git.

  source:                         # Optional. Attribution for this result, for instance a repo containing
                                  # output traces, or a paper.
    url: {source_url}             # Required if source is provided. A link to the source.
                                  # Example: https://huggingface.co/spaces/SaylorTwift/smollm3-mmlu-pro
    name: {source_name}           # Optional. The name of the source. Example: Eval Logs.
    user: {username}              # Optional. An HF username.
    org: {orgname}                # Optional. An HF org name.

  notes: "{notes}"                # Optional. Details about the evaluation setup (e.g., "tools", "no-tools", "chain-of-thought").

# Or, a minimal entry with only the required attributes:

- dataset:
    id: Idavidrein/gpqa           # Required. Dataset ID on the Hub.
    task_id: gpqa_diamond         # Required. One of the tasks defined by this dataset.
  value: 0.412                    # Required. The metric value.