# leaderboard/tests/test_datasets.py
# hotchpotch's picture
# Deploy Docker leaderboard viewer
# e8aa13a verified
from __future__ import annotations
from pathlib import Path
import pytest
from hakari_bench.datasets import (
DatasetRegistry,
NanoDatasetSpec,
REFERENCE_SOURCE_CONFIDENCE_LABELS,
validate_builtin_metadata,
resolve_dataset_revision,
resolve_dataset_splits,
resolve_eval_tasks,
)
def test_builtin_registry_contains_requested_benchmarks() -> None:
    """Check the built-in registry exposes each requested benchmark.

    Every listed dataset name must resolve to a spec whose ``dataset_id`` is
    the same name under the ``hakari-bench/`` namespace. The ``MNanoBEIR``
    collection must list 14 datasets, and looking up ``NanoMTEB_Family`` as a
    collection must raise ``KeyError``.
    """
    builtin = DatasetRegistry.load_builtin()
    benchmark_names = (
        "NanoBEIR-en",
        "NanoMIRACL",
        "NanoMLDR",
        "NanoJMTEB-v2",
        "NanoRTEB",
        "NanoMTEB-v2",
        "NanoMMTEB-v2",
        "NanoCMTEB",
        "NanoLongEmbed",
        "NanoCoIR",
        "NanoIFIR",
        "NanoLaw",
        "NanoMedical",
        "NanoRARb",
        "NanoBRIGHT",
        "NanoCodeRAG",
        "NanoChemTEB",
        "NanoR2MED",
        "NanoBuiltBench",
        "NanoBIRCO",
        "NanoDAPFAM",
        "NanoFaMTEB-v2",
        "NanoIndicQA",
        "NanoMuPLeR",
        "NanoMTEB-Dutch",
        "NanoMTEB-Misc",
        "NanoMTEB-Polish",
    )
    for benchmark_name in benchmark_names:
        spec = builtin.get_dataset(benchmark_name)
        assert spec.dataset_id == f"hakari-bench/{benchmark_name}"

    mnanobeir = builtin.get_collection("MNanoBEIR")
    assert len(mnanobeir.datasets) == 14

    with pytest.raises(KeyError):
        builtin.get_collection("NanoMTEB_Family")
def test_builtin_config_lives_in_repo_config() -> None:
    """Check the expected YAML files exist under the relative ``config`` directory.

    The paths are relative, so they are resolved against the current working
    directory at test time.
    """
    root = Path("config")
    datasets_dir = root / "datasets"
    collections_dir = root / "dataset_collections"

    assert (datasets_dir / "nanobeir_en.yaml").is_file()
    assert (collections_dir / "mnanobeir.yaml").is_file()
    # This collection file is expected to be absent.
    assert not (collections_dir / "nanomteb_family.yaml").exists()
    assert (root / "viewer" / "benchmarks.yaml").is_file()
def test_resolve_eval_tasks_for_builtin_nanomteb_uses_declared_splits() -> None:
    """Resolving ``NanoMTEB-v2`` without a split filter yields the expected splits in order."""
    builtin = DatasetRegistry.load_builtin()
    wanted_splits = (
        "argu_ana",
        "climate_fever",
        "cqadupstack_gaming",
        "cqadupstack_unix",
        "fever",
        "fi_qa2018",
        "hotpot_qa",
        "scidocs",
        "touche2020_v3",
        "treccovid",
    )
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["NanoMTEB-v2"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoMTEB-v2", split) for split in wanted_splits]
def test_resolve_eval_tasks_for_builtin_nanommteb_uses_declared_splits() -> None:
    """Resolving ``NanoMMTEB-v2`` without a split filter yields the expected splits in order."""
    builtin = DatasetRegistry.load_builtin()
    wanted_splits = (
        "ailastatutes",
        "argu_ana",
        "belebele",
        "covid",
        "hagrid",
        "legal_bench_corporate_lobbying",
        "lembpasskey",
        "miracl",
        "mlqa",
        "scidocs",
        "spart_qa",
        "stack_overflow_qa",
        "statcan_dialogue_dataset",
        "temp_reason_l1",
        "treccovid",
        "twitter_hjerne",
        "wikipedia_multilingual",
        "wino_grande",
    )
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["NanoMMTEB-v2"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoMMTEB-v2", split) for split in wanted_splits]
def test_resolve_eval_tasks_for_builtin_nanomteb_chinese_uses_declared_splits() -> None:
    """Resolving ``NanoCMTEB`` without a split filter yields the expected splits in order."""
    builtin = DatasetRegistry.load_builtin()
    wanted_splits = (
        "cmedqa",
        "covid",
        "du",
        "ecom",
        "medical",
        "mmarco",
        "t2",
        "video",
    )
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["NanoCMTEB"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoCMTEB", split) for split in wanted_splits]
def test_resolve_eval_tasks_for_builtin_nanorteb_uses_declared_splits() -> None:
    """Resolving ``NanoRTEB`` without a split filter yields the expected splits in order."""
    builtin = DatasetRegistry.load_builtin()
    wanted_splits = (
        "NanoAILACasedocs",
        "NanoAILAStatutes",
        "NanoLegalSummarization",
        "NanoFinanceBench",
        "NanoHC3Finance",
        "NanoFinQA",
        "NanoApps",
        "NanoDS1000",
        "NanoHumanEval",
        "NanoMBPP",
        "NanoWikiSQL",
        "NanoFreshStack",
        "NanoChatDoctor",
        "NanoCUREv1",
    )
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["NanoRTEB"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoRTEB", split) for split in wanted_splits]
def test_resolve_eval_tasks_for_builtin_nanomteb_japanese_uses_declared_splits() -> None:
    """Resolving ``NanoJMTEB-v2`` without a split filter yields the expected splits in order."""
    builtin = DatasetRegistry.load_builtin()
    wanted_splits = (
        "ja_cwir",
        "ja_gov_faqs",
        "jaqket",
        "mintaka_ja",
        "miracl_ja",
        "mr_tidy_japanese",
        "multi_long_doc_ja",
        "nlpjournal_abs_article",
        "nlpjournal_abs_intro",
        "nlpjournal_title_abs",
        "nlpjournal_title_intro",
    )
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["NanoJMTEB-v2"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoJMTEB-v2", split) for split in wanted_splits]
def test_resolve_eval_tasks_for_builtin_nanomldr_uses_declared_splits() -> None:
    """Resolving ``NanoMLDR`` without a split filter yields the expected splits in order."""
    builtin = DatasetRegistry.load_builtin()
    wanted_splits = (
        "ar",
        "de",
        "en",
        "es",
        "fr",
        "hi",
        "it",
        "ja",
        "ko",
        "pt",
        "ru",
        "th",
        "zh",
    )
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["NanoMLDR"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoMLDR", split) for split in wanted_splits]
def test_resolve_eval_tasks_for_builtin_nanolongembed_uses_declared_splits() -> None:
    """Resolving ``NanoLongEmbed`` without a split filter yields the expected splits in order."""
    builtin = DatasetRegistry.load_builtin()
    wanted_splits = (
        "NanoNarrativeQA",
        "NanoSummScreenFD",
        "NanoQMSum",
        "Nano2WikiMultihopQA",
        "NanoPasskey",
        "NanoNeedle",
    )
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["NanoLongEmbed"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoLongEmbed", split) for split in wanted_splits]
def test_resolve_eval_tasks_for_builtin_nanocoir_uses_declared_splits() -> None:
    """Resolving ``NanoCoIR`` without a split filter yields the expected splits in order."""
    builtin = DatasetRegistry.load_builtin()
    wanted_splits = (
        "NanoApps",
        "NanoCodeFeedbackMT",
        "NanoCodeFeedbackST",
        "NanoCodeTransOceanContest",
        "NanoCodeTransOceanDL",
        "NanoCosQA",
        "NanoStackOverflowQA",
        "NanoSyntheticText2SQL",
        "NanoCodeSearchNet",
        "NanoCodeSearchNetCCR",
    )
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["NanoCoIR"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoCoIR", split) for split in wanted_splits]
def test_resolve_eval_tasks_for_new_builtin_nano_datasets_use_declared_splits() -> None:
    """Check several built-in datasets each resolve to their expected ordered splits.

    For every dataset name in the table below, resolving it with no collection
    or split filter must produce one task per listed split, in the listed order.
    """
    builtin = DatasetRegistry.load_builtin()
    splits_by_dataset = {
        "NanoIFIR": [
            "NanoIFIRAila",
            "NanoIFIRCds",
            "NanoIFIRFiQA",
            "NanoIFIRFire",
            "NanoIFIRNFCorpus",
            "NanoIFIRPm",
            "NanoIFIRScifact",
        ],
        "NanoLaw": [
            "NanoAILACasedocs",
            "NanoAILAStatutes",
            "NanoGerDaLIRSmall",
            "NanoLeCaRDv2",
            "NanoLegalBenchConsumerContractsQA",
            "NanoLegalBenchCorporateLobbying",
            "NanoLegalQuAD",
            "NanoLegalSummarization",
        ],
        "NanoMedical": [
            "NanoCMedQAv2reranking",
            "NanoCUREv1",
            "NanoCmedqa",
            "NanoMedicalQA",
            "NanoNFCorpus",
            "NanoPublicHealthQA",
            "NanoSciFact",
            "NanoSciFactPL",
            "NanoTRECCOVID",
            "NanoTRECCOVIDPL",
        ],
        "NanoRARb": [
            "NanoARCChallenge",
            "NanoAlphaNLI",
            "NanoHellaSwag",
            "NanoPIQA",
            "NanoQuail",
            "NanoRARbCode",
            "NanoRARbMath",
            "NanoSIQA",
            "NanoSpartQA",
            "NanoTempReasonL1",
            "NanoTempReasonL2Context",
            "NanoTempReasonL2Fact",
            "NanoTempReasonL2Pure",
            "NanoTempReasonL3Context",
            "NanoTempReasonL3Fact",
            "NanoTempReasonL3Pure",
            "NanoWinoGrande",
        ],
        "NanoBRIGHT": [
            "NanoBrightAops",
            "NanoBrightBiology",
            "NanoBrightBiologyLong",
            "NanoBrightEarthScience",
            "NanoBrightEarthScienceLong",
            "NanoBrightEconomics",
            "NanoBrightEconomicsLong",
            "NanoBrightLeetcode",
            "NanoBrightPony",
            "NanoBrightPonyLong",
            "NanoBrightPsychology",
            "NanoBrightPsychologyLong",
            "NanoBrightRobotics",
            "NanoBrightRoboticsLong",
            "NanoBrightStackoverflow",
            "NanoBrightStackoverflowLong",
            "NanoBrightSustainableLiving",
            "NanoBrightSustainableLivingLong",
            "NanoBrightTheoremQAQuestions",
            "NanoBrightTheoremQATheorems",
        ],
        "NanoCodeRAG": [
            "NanoCodeRAGLibraryDocumentationSolutions",
            "NanoCodeRAGOnlineTutorials",
            "NanoCodeRAGProgrammingSolutions",
            "NanoCodeRAGStackoverflowPosts",
        ],
        "NanoChemTEB": ["NanoChemHotpotQA", "NanoChemNQ", "NanoChemRxiv"],
        "NanoR2MED": [
            "NanoR2MEDBioinformatics",
            "NanoR2MEDBiology",
            "NanoR2MEDIIYiClinical",
            "NanoR2MEDMedQADiag",
            "NanoR2MEDMedXpertQAExam",
            "NanoR2MEDMedicalSciences",
            "NanoR2MEDPMCClinical",
            "NanoR2MEDPMCTreatment",
        ],
        "NanoBuiltBench": ["NanoBuiltBench", "NanoBuiltBenchReranking"],
    }

    for dataset_name, wanted_splits in splits_by_dataset.items():
        resolved = resolve_eval_tasks(
            registry=builtin,
            dataset_values=[dataset_name],
            collection_values=[],
            split_values=[],
        )
        observed = [(task.dataset_name, task.split_name) for task in resolved]
        wanted = [(dataset_name, split) for split in wanted_splits]
        assert observed == wanted
def test_resolve_eval_tasks_expands_mnanobeir_collection() -> None:
    """Resolving the ``MNanoBEIR`` collection filtered to ``msmarco`` yields 14 tasks.

    The first task must target ``hakari-bench/NanoBEIR-en`` with split
    ``NanoMSMARCO`` and task name ``msmarco``.
    """
    builtin = DatasetRegistry.load_builtin()
    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=[],
        collection_values=["MNanoBEIR"],
        split_values=["msmarco"],
    )
    assert len(resolved) == 14

    first_task = resolved[0]
    assert first_task.dataset_id == "hakari-bench/NanoBEIR-en"
    assert first_task.split_name == "NanoMSMARCO"
    assert first_task.task_name == "msmarco"
def test_resolve_eval_tasks_accepts_direct_dataset_id(monkeypatch: pytest.MonkeyPatch) -> None:
    """A raw ``owner/name`` dataset id that is not a registry name still resolves to tasks.

    ``get_dataset_split_names`` is stubbed to return ``["ja", "en"]``; the
    resulting tasks must be named ``NanoToy`` and keep that split order.
    """

    def stub_split_names(dataset_id, subset):
        # Fixed answer regardless of which dataset or subset is asked for.
        return ["ja", "en"]

    builtin = DatasetRegistry.load_builtin()
    monkeypatch.setattr("hakari_bench.datasets.get_dataset_split_names", stub_split_names)

    resolved = resolve_eval_tasks(
        registry=builtin,
        dataset_values=["example/NanoToy"],
        collection_values=[],
        split_values=[],
    )
    observed = [(task.dataset_name, task.split_name) for task in resolved]
    assert observed == [("NanoToy", "ja"), ("NanoToy", "en")]
def test_resolve_dataset_splits_uses_yaml_splits_without_network() -> None:
    """A spec constructed with explicit ``splits`` resolves to exactly those splits."""
    spec_fields = {
        "name": "Toy",
        "dataset_id": "local/toy",
        "corpus_config": "corpus",
        "queries_config": "queries",
        "qrels_config": "qrels",
        "candidate_config": "bm25",
        "splits": ["a", "b"],
    }
    toy_spec = NanoDatasetSpec(**spec_fields)
    assert resolve_dataset_splits(toy_spec) == ["a", "b"]
def test_resolve_dataset_revision_uses_huggingface_hub_sha(monkeypatch: pytest.MonkeyPatch) -> None:
    """The resolved revision is the ``sha`` reported by the (faked) ``HfApi``.

    ``HfApi`` is replaced with a fake whose ``dataset_info`` checks the
    arguments it receives and returns an object with ``sha = "abc123"``.
    """

    class FakeDatasetInfo:
        sha = "abc123"

    class FakeHfApi:
        def dataset_info(self, repo_id: str, revision: str | None = None) -> FakeDatasetInfo:
            assert repo_id == "owner/dataset"
            assert revision == "main"
            return FakeDatasetInfo()

    # Start from an empty cache so an earlier result cannot mask the fake.
    resolve_dataset_revision.cache_clear()
    monkeypatch.setattr("hakari_bench.datasets.HfApi", FakeHfApi)
    try:
        assert resolve_dataset_revision("owner/dataset", requested_revision="main") == {
            "requested": "main",
            "resolved": "abc123",
            "source": "huggingface_hub",
        }
    finally:
        # monkeypatch restores HfApi but not the function's cache; drop the
        # entry produced by the fake so it cannot leak into later tests.
        resolve_dataset_revision.cache_clear()
def test_resolve_dataset_revision_returns_unknown_when_hub_fails(monkeypatch: pytest.MonkeyPatch) -> None:
    """A failing hub lookup yields an unresolved revision carrying the error text.

    ``HfApi`` is replaced with a fake whose ``dataset_info`` always raises
    ``RuntimeError``. The result must have ``requested`` and ``resolved`` set
    to ``None``, ``source`` set to ``"huggingface_hub"``, and an ``error``
    string starting with ``"RuntimeError:"``.
    """

    class FakeHfApi:
        def dataset_info(self, repo_id: str, revision: str | None = None) -> object:
            raise RuntimeError(f"cannot resolve {repo_id}@{revision}")

    # Start from an empty cache so an earlier result cannot mask the fake.
    resolve_dataset_revision.cache_clear()
    monkeypatch.setattr("hakari_bench.datasets.HfApi", FakeHfApi)
    try:
        revision = resolve_dataset_revision("local/dataset", requested_revision=None)
        assert revision["requested"] is None
        assert revision["resolved"] is None
        assert revision["source"] == "huggingface_hub"
        assert revision["error"].startswith("RuntimeError:")
    finally:
        # monkeypatch restores HfApi but not the function's cache; clear it so
        # nothing computed against the fake can leak into later tests.
        resolve_dataset_revision.cache_clear()
def test_registry_loads_yaml_files(tmp_path: Path) -> None:
    """A registry loaded from a directory picks up dataset and collection YAML files.

    One dataset file and one collection file are written under ``tmp_path``;
    both must then be retrievable by name from the loaded registry.
    """
    datasets_dir = tmp_path / "datasets"
    collections_dir = tmp_path / "dataset_collections"
    datasets_dir.mkdir()
    collections_dir.mkdir()

    toy_dataset_yaml = """
name: Toy
dataset_id: local/toy
corpus_config: corpus
queries_config: queries
qrels_config: qrels
candidate_config: bm25
splits: [a]
""".strip()
    (datasets_dir / "toy.yaml").write_text(toy_dataset_yaml, encoding="utf-8")

    toy_collection_yaml = """
name: ToyCollection
datasets:
- Toy
""".strip()
    (collections_dir / "toy_collection.yaml").write_text(toy_collection_yaml, encoding="utf-8")

    loaded = DatasetRegistry.load_from_root(tmp_path)
    assert loaded.get_dataset("Toy").dataset_id == "local/toy"
    assert loaded.get_collection("ToyCollection").datasets == ["Toy"]
def test_registry_rejects_unknown_dataset_config_keys(tmp_path: Path) -> None:
    """Loading a dataset YAML with an unrecognised key raises ``ValueError`` naming that key."""
    datasets_dir = tmp_path / "datasets"
    datasets_dir.mkdir()

    bad_dataset_yaml = """
name: Toy
dataset_id: local/toy
unknown_key: value
""".strip()
    (datasets_dir / "toy.yaml").write_text(bad_dataset_yaml, encoding="utf-8")

    with pytest.raises(ValueError, match="unknown_key"):
        DatasetRegistry.load_from_root(tmp_path)
def test_registry_preserves_dataset_and_task_metadata(tmp_path: Path) -> None:
    """Check that metadata written in YAML is still present after loading.

    Writes one dataset file and one collection file (which also carries an
    inline dataset definition) under ``tmp_path``, loads a registry from that
    root, and asserts that dataset, task, inline-dataset and collection
    metadata are all exposed, and that a resolved task carries its split's
    metadata.
    """
    (tmp_path / "datasets").mkdir()
    (tmp_path / "dataset_collections").mkdir()
    # NOTE(review): the YAML literals in this test appear with no leading
    # whitespace in this view, but the assertions below read nested keys
    # (e.g. metadata["citation_keys"], task_metadata["a"]["query_text_stats"]).
    # Presumably the nested keys are indented in the real file — confirm
    # before editing these literals.
    (tmp_path / "datasets" / "toy.yaml").write_text(
        """
name: Toy
dataset_id: local/toy
metadata:
language: en
category: natural_language
short_description: Toy retrieval group.
description: Toy dataset used to verify metadata loading.
references:
- title: Toy Paper
authors: [Ada Example]
year: 2024
url: https://example.com/toy
citation_keys: [toy2024]
bibtex: |
@misc{toy2024,
title = {Toy Paper},
year = {2024}
}
splits: [a]
task_metadata:
a:
language: en
category: natural_language
short_description: Toy split.
description: Toy split metadata survives loading and task resolution.
query_text_stats:
count: 2
min_chars: 3
max_chars: 5
mean_chars: 4.0
median_chars: 4.0
document_text_stats:
count: 3
min_chars: 8
max_chars: 10
mean_chars: 9.0
median_chars: 9.0
""".strip(),
        encoding="utf-8",
    )
    # The collection lists one dataset by name ("Toy") and defines a second
    # one ("InlineToy") inline.
    (tmp_path / "dataset_collections" / "toy_collection.yaml").write_text(
        """
name: ToyCollection
metadata:
language: en
category: natural_language
short_description: Toy collection.
description: Toy collection metadata survives loading.
datasets:
- Toy
- name: InlineToy
dataset_id: local/inline-toy
metadata:
language: en
category: natural_language
short_description: Inline toy.
description: Inline dataset metadata survives collection loading.
splits: [inline]
task_metadata:
inline:
language: en
category: natural_language
short_description: Inline toy split.
description: Inline toy task metadata survives loading.
""".strip(),
        encoding="utf-8",
    )
    registry = DatasetRegistry.load_from_root(tmp_path)
    toy = registry.get_dataset("Toy")
    # The inline dataset must be retrievable from the registry by name as well.
    inline = registry.get_dataset("InlineToy")
    collection = registry.get_collection("ToyCollection")
    tasks = resolve_eval_tasks(registry=registry, dataset_values=["Toy"], collection_values=[], split_values=[])
    # Presence checks first, so a missing mapping fails here rather than on
    # the subscript lookups below.
    assert toy.metadata is not None
    assert toy.task_metadata is not None
    assert inline.metadata is not None
    assert collection.metadata is not None
    assert toy.metadata["citation_keys"] == ["toy2024"]
    assert toy.task_metadata["a"]["query_text_stats"]["median_chars"] == 4.0
    assert inline.metadata["short_description"] == "Inline toy."
    assert collection.metadata["short_description"] == "Toy collection."
    # The resolved task exposes the metadata of its split ("a").
    assert tasks[0].metadata["short_description"] == "Toy split."
def test_builtin_metadata_is_complete_and_valid() -> None:
    """Validating the built-in metadata must report no errors."""
    assert validate_builtin_metadata() == []
def test_metadata_validation_rejects_unknown_category() -> None:
    """A metadata ``category`` of ``"other"`` produces exactly one validation error."""
    bad_metadata = {
        "language": "en",
        "category": "other",
        "short_description": "Toy.",
        "description": "Toy metadata with an invalid category.",
    }
    toy_spec = NanoDatasetSpec(name="Toy", dataset_id="local/toy", metadata=bad_metadata)

    assert toy_spec.validate_metadata() == ["Toy metadata has invalid category 'other'."]
def test_metadata_validation_accepts_language_detection_fields() -> None:
    """Metadata carrying ``languages`` and a ``language_detection`` section as built here validates cleanly."""
    detection = {
        "detector": "fast-langdetect",
        "min_language_percent": 0.5,
        "main_language_percent": 10.0,
        "query": {"sample_count": 10, "languages": {"ja": 80.0, "en": 20.0}},
        "document": {"sample_count": 100, "languages": {"en": 81.0, "ja": 19.0}},
    }
    good_metadata = {
        "language": "unknown",
        "languages": ["en", "ja"],
        "category": "natural_language",
        "short_description": "Toy metadata.",
        "description": "Toy metadata with detected language distributions.",
        "language_detection": detection,
    }
    toy_spec = NanoDatasetSpec(name="Toy", dataset_id="local/toy", metadata=good_metadata)

    assert toy_spec.validate_metadata() == []
def test_metadata_validation_rejects_invalid_language_detection_fields() -> None:
    """Each malformed language field is reported, in the exact order asserted below."""
    broken_detection = {
        "detector": 1,
        "min_language_percent": "0.5",
        "main_language_percent": 10.0,
        "query": {"sample_count": 10, "languages": {"engl": 100.0}},
        "document": {"sample_count": "100", "languages": {"en": "100"}},
    }
    bad_metadata = {
        "language": "en",
        "languages": ["english"],
        "category": "natural_language",
        "short_description": "Toy metadata.",
        "description": "Toy metadata with invalid language detection.",
        "language_detection": broken_detection,
    }
    toy_spec = NanoDatasetSpec(name="Toy", dataset_id="local/toy", metadata=bad_metadata)

    wanted_errors = [
        "Toy metadata has invalid languages[0] 'english'.",
        "Toy metadata language_detection.detector must be string.",
        "Toy metadata language_detection.min_language_percent must be numeric.",
        "Toy metadata has invalid language_detection.query.languages key 'engl'.",
        "Toy metadata language_detection.document.sample_count must be integer.",
        "Toy metadata language_detection.document.languages['en'] must be numeric.",
    ]
    assert toy_spec.validate_metadata() == wanted_errors
def test_metadata_validation_requires_reference_is_paper_boolean() -> None:
    """A reference without ``is_paper``, or with a non-boolean one, is reported.

    The first reference omits ``is_paper``; the second sets it to the string
    ``"no"``. Each must yield its own error, in reference order.
    """
    reference_without_flag = {
        "title": "Toy Paper",
        "authors": ["A. Author"],
        "year": 2024,
        "url": "https://example.com/paper",
        "source_confidence": "probably_correct",
    }
    reference_with_string_flag = {
        "title": "Toy Blog",
        "authors": ["B. Author"],
        "year": 2024,
        "url": "https://example.com/blog",
        "is_paper": "no",
        "source_confidence": "probably_correct",
    }
    toy_spec = NanoDatasetSpec(
        name="Toy",
        dataset_id="local/toy",
        metadata={
            "language": "en",
            "category": "natural_language",
            "short_description": "Toy.",
            "description": "Toy metadata with references.",
            "references": [reference_without_flag, reference_with_string_flag],
        },
    )

    assert toy_spec.validate_metadata() == [
        "Toy metadata references[0] is missing is_paper.",
        "Toy metadata references[1].is_paper must be boolean.",
    ]
def test_metadata_validation_requires_reference_source_confidence_label() -> None:
    """A reference without ``source_confidence``, or with the label ``"unchecked"``, is reported.

    The first reference omits the field; the second uses ``"unchecked"``,
    which the validator is expected to reject. Each must yield its own error,
    in reference order.
    """
    reference_without_label = {
        "title": "Toy Paper",
        "authors": ["A. Author"],
        "year": 2024,
        "url": "https://example.com/paper",
        "is_paper": True,
    }
    reference_with_bad_label = {
        "title": "Toy Blog",
        "authors": ["B. Author"],
        "year": 2024,
        "url": "https://example.com/blog",
        "is_paper": False,
        "source_confidence": "unchecked",
    }
    toy_spec = NanoDatasetSpec(
        name="Toy",
        dataset_id="local/toy",
        metadata={
            "language": "en",
            "category": "natural_language",
            "short_description": "Toy.",
            "description": "Toy metadata with references.",
            "references": [reference_without_label, reference_with_bad_label],
        },
    )

    assert toy_spec.validate_metadata() == [
        "Toy metadata references[0] is missing source_confidence.",
        "Toy metadata references[1].source_confidence has invalid label 'unchecked'.",
    ]
def test_reference_source_confidence_labels_are_documented() -> None:
    """The label table has exactly the four expected keys, and ``human_verified`` warns AI agents off."""
    wanted_labels = {
        "source_uncertain",
        "probably_correct",
        "definitive_paper_link",
        "human_verified",
    }
    assert set(REFERENCE_SOURCE_CONFIDENCE_LABELS) == wanted_labels

    human_verified_text = REFERENCE_SOURCE_CONFIDENCE_LABELS["human_verified"]
    assert "AI agents must not assign this label" in human_verified_text