"""Tests for dataset registry loading, eval-task resolution, revision lookup and metadata validation."""

from __future__ import annotations

from pathlib import Path

import pytest

from hakari_bench.datasets import (
    DatasetRegistry,
    NanoDatasetSpec,
    REFERENCE_SOURCE_CONFIDENCE_LABELS,
    validate_builtin_metadata,
    resolve_dataset_revision,
    resolve_dataset_splits,
    resolve_eval_tasks,
)


def _assert_declared_splits(
    registry: DatasetRegistry,
    dataset_name: str,
    expected_splits: list[str],
) -> None:
    """Assert that resolving *dataset_name* alone yields exactly *expected_splits*, in order.

    No collection or split filter is passed to ``resolve_eval_tasks``; the
    resolved tasks are compared as ``(dataset_name, split_name)`` pairs.
    """
    tasks = resolve_eval_tasks(
        registry=registry,
        dataset_values=[dataset_name],
        collection_values=[],
        split_values=[],
    )
    assert [(task.dataset_name, task.split_name) for task in tasks] == [
        (dataset_name, split_name) for split_name in expected_splits
    ]


def test_builtin_registry_contains_requested_benchmarks() -> None:
    """Every requested benchmark is registered under the ``hakari-bench/<name>`` dataset id."""
    registry = DatasetRegistry.load_builtin()
    dataset_names = (
        "NanoBEIR-en",
        "NanoMIRACL",
        "NanoMLDR",
        "NanoJMTEB-v2",
        "NanoRTEB",
        "NanoMTEB-v2",
        "NanoMMTEB-v2",
        "NanoCMTEB",
        "NanoLongEmbed",
        "NanoCoIR",
        "NanoIFIR",
        "NanoLaw",
        "NanoMedical",
        "NanoRARb",
        "NanoBRIGHT",
        "NanoCodeRAG",
        "NanoChemTEB",
        "NanoR2MED",
        "NanoBuiltBench",
        "NanoBIRCO",
        "NanoDAPFAM",
        "NanoFaMTEB-v2",
        "NanoIndicQA",
        "NanoMuPLeR",
        "NanoMTEB-Dutch",
        "NanoMTEB-Misc",
        "NanoMTEB-Polish",
    )
    for dataset_name in dataset_names:
        assert registry.get_dataset(dataset_name).dataset_id == f"hakari-bench/{dataset_name}"

    assert len(registry.get_collection("MNanoBEIR").datasets) == 14
    with pytest.raises(KeyError):
        registry.get_collection("NanoMTEB_Family")


def test_builtin_config_lives_in_repo_config() -> None:
    """The expected config files exist under ``config/`` and the removed collection file does not."""
    # NOTE(review): the path is relative, so this relies on the working directory
    # containing ``config/`` (presumably the repository root) - confirm.
    config_root = Path("config")
    assert config_root.joinpath("datasets", "nanobeir_en.yaml").is_file()
    assert config_root.joinpath("dataset_collections", "mnanobeir.yaml").is_file()
    assert not config_root.joinpath("dataset_collections", "nanomteb_family.yaml").exists()
    assert config_root.joinpath("viewer", "benchmarks.yaml").is_file()


def test_resolve_eval_tasks_for_builtin_nanomteb_uses_declared_splits() -> None:
    """NanoMTEB-v2 resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    _assert_declared_splits(
        registry,
        "NanoMTEB-v2",
        [
            "argu_ana",
            "climate_fever",
            "cqadupstack_gaming",
            "cqadupstack_unix",
            "fever",
            "fi_qa2018",
            "hotpot_qa",
            "scidocs",
            "touche2020_v3",
            "treccovid",
        ],
    )


def test_resolve_eval_tasks_for_builtin_nanommteb_uses_declared_splits() -> None:
    """NanoMMTEB-v2 resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    _assert_declared_splits(
        registry,
        "NanoMMTEB-v2",
        [
            "ailastatutes",
            "argu_ana",
            "belebele",
            "covid",
            "hagrid",
            "legal_bench_corporate_lobbying",
            "lembpasskey",
            "miracl",
            "mlqa",
            "scidocs",
            "spart_qa",
            "stack_overflow_qa",
            "statcan_dialogue_dataset",
            "temp_reason_l1",
            "treccovid",
            "twitter_hjerne",
            "wikipedia_multilingual",
            "wino_grande",
        ],
    )


def test_resolve_eval_tasks_for_builtin_nanomteb_chinese_uses_declared_splits() -> None:
    """NanoCMTEB resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    _assert_declared_splits(
        registry,
        "NanoCMTEB",
        ["cmedqa", "covid", "du", "ecom", "medical", "mmarco", "t2", "video"],
    )


def test_resolve_eval_tasks_for_builtin_nanorteb_uses_declared_splits() -> None:
    """NanoRTEB resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    _assert_declared_splits(
        registry,
        "NanoRTEB",
        [
            "NanoAILACasedocs",
            "NanoAILAStatutes",
            "NanoLegalSummarization",
            "NanoFinanceBench",
            "NanoHC3Finance",
            "NanoFinQA",
            "NanoApps",
            "NanoDS1000",
            "NanoHumanEval",
            "NanoMBPP",
            "NanoWikiSQL",
            "NanoFreshStack",
            "NanoChatDoctor",
            "NanoCUREv1",
        ],
    )


def test_resolve_eval_tasks_for_builtin_nanomteb_japanese_uses_declared_splits() -> None:
    """NanoJMTEB-v2 resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    _assert_declared_splits(
        registry,
        "NanoJMTEB-v2",
        [
            "ja_cwir",
            "ja_gov_faqs",
            "jaqket",
            "mintaka_ja",
            "miracl_ja",
            "mr_tidy_japanese",
            "multi_long_doc_ja",
            "nlpjournal_abs_article",
            "nlpjournal_abs_intro",
            "nlpjournal_title_abs",
            "nlpjournal_title_intro",
        ],
    )


def test_resolve_eval_tasks_for_builtin_nanomldr_uses_declared_splits() -> None:
    """NanoMLDR resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    _assert_declared_splits(
        registry,
        "NanoMLDR",
        ["ar", "de", "en", "es", "fr", "hi", "it", "ja", "ko", "pt", "ru", "th", "zh"],
    )


def test_resolve_eval_tasks_for_builtin_nanolongembed_uses_declared_splits() -> None:
    """NanoLongEmbed resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    _assert_declared_splits(
        registry,
        "NanoLongEmbed",
        [
            "NanoNarrativeQA",
            "NanoSummScreenFD",
            "NanoQMSum",
            "Nano2WikiMultihopQA",
            "NanoPasskey",
            "NanoNeedle",
        ],
    )


def test_resolve_eval_tasks_for_builtin_nanocoir_uses_declared_splits() -> None:
    """NanoCoIR resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    _assert_declared_splits(
        registry,
        "NanoCoIR",
        [
            "NanoApps",
            "NanoCodeFeedbackMT",
            "NanoCodeFeedbackST",
            "NanoCodeTransOceanContest",
            "NanoCodeTransOceanDL",
            "NanoCosQA",
            "NanoStackOverflowQA",
            "NanoSyntheticText2SQL",
            "NanoCodeSearchNet",
            "NanoCodeSearchNetCCR",
        ],
    )


def test_resolve_eval_tasks_for_new_builtin_nano_datasets_use_declared_splits() -> None:
    """Each newer built-in dataset resolves to its declared splits, in order."""
    registry = DatasetRegistry.load_builtin()
    expected = {
        "NanoIFIR": [
            "NanoIFIRAila",
            "NanoIFIRCds",
            "NanoIFIRFiQA",
            "NanoIFIRFire",
            "NanoIFIRNFCorpus",
            "NanoIFIRPm",
            "NanoIFIRScifact",
        ],
        "NanoLaw": [
            "NanoAILACasedocs",
            "NanoAILAStatutes",
            "NanoGerDaLIRSmall",
            "NanoLeCaRDv2",
            "NanoLegalBenchConsumerContractsQA",
            "NanoLegalBenchCorporateLobbying",
            "NanoLegalQuAD",
            "NanoLegalSummarization",
        ],
        "NanoMedical": [
            "NanoCMedQAv2reranking",
            "NanoCUREv1",
            "NanoCmedqa",
            "NanoMedicalQA",
            "NanoNFCorpus",
            "NanoPublicHealthQA",
            "NanoSciFact",
            "NanoSciFactPL",
            "NanoTRECCOVID",
            "NanoTRECCOVIDPL",
        ],
        "NanoRARb": [
            "NanoARCChallenge",
            "NanoAlphaNLI",
            "NanoHellaSwag",
            "NanoPIQA",
            "NanoQuail",
            "NanoRARbCode",
            "NanoRARbMath",
            "NanoSIQA",
            "NanoSpartQA",
            "NanoTempReasonL1",
            "NanoTempReasonL2Context",
            "NanoTempReasonL2Fact",
            "NanoTempReasonL2Pure",
            "NanoTempReasonL3Context",
            "NanoTempReasonL3Fact",
            "NanoTempReasonL3Pure",
            "NanoWinoGrande",
        ],
        "NanoBRIGHT": [
            "NanoBrightAops",
            "NanoBrightBiology",
            "NanoBrightBiologyLong",
            "NanoBrightEarthScience",
            "NanoBrightEarthScienceLong",
            "NanoBrightEconomics",
            "NanoBrightEconomicsLong",
            "NanoBrightLeetcode",
            "NanoBrightPony",
            "NanoBrightPonyLong",
            "NanoBrightPsychology",
            "NanoBrightPsychologyLong",
            "NanoBrightRobotics",
            "NanoBrightRoboticsLong",
            "NanoBrightStackoverflow",
            "NanoBrightStackoverflowLong",
            "NanoBrightSustainableLiving",
            "NanoBrightSustainableLivingLong",
            "NanoBrightTheoremQAQuestions",
            "NanoBrightTheoremQATheorems",
        ],
        "NanoCodeRAG": [
            "NanoCodeRAGLibraryDocumentationSolutions",
            "NanoCodeRAGOnlineTutorials",
            "NanoCodeRAGProgrammingSolutions",
            "NanoCodeRAGStackoverflowPosts",
        ],
        "NanoChemTEB": ["NanoChemHotpotQA", "NanoChemNQ", "NanoChemRxiv"],
        "NanoR2MED": [
            "NanoR2MEDBioinformatics",
            "NanoR2MEDBiology",
            "NanoR2MEDIIYiClinical",
            "NanoR2MEDMedQADiag",
            "NanoR2MEDMedXpertQAExam",
            "NanoR2MEDMedicalSciences",
            "NanoR2MEDPMCClinical",
            "NanoR2MEDPMCTreatment",
        ],
        "NanoBuiltBench": ["NanoBuiltBench", "NanoBuiltBenchReranking"],
    }
    for dataset_name, split_names in expected.items():
        _assert_declared_splits(registry, dataset_name, split_names)


def test_resolve_eval_tasks_expands_mnanobeir_collection() -> None:
    """Selecting the MNanoBEIR collection with the ``msmarco`` split yields one task per member dataset."""
    registry = DatasetRegistry.load_builtin()
    tasks = resolve_eval_tasks(
        registry=registry,
        dataset_values=[],
        collection_values=["MNanoBEIR"],
        split_values=["msmarco"],
    )
    assert len(tasks) == 14
    assert tasks[0].dataset_id == "hakari-bench/NanoBEIR-en"
    assert tasks[0].split_name == "NanoMSMARCO"
    assert tasks[0].task_name == "msmarco"


def test_resolve_eval_tasks_accepts_direct_dataset_id(monkeypatch: pytest.MonkeyPatch) -> None:
    """A raw ``owner/name`` dataset id is accepted and its splits come from the split-name lookup."""
    registry = DatasetRegistry.load_builtin()
    # Replace the split-name lookup with a fixed answer so the test controls the splits.
    monkeypatch.setattr(
        "hakari_bench.datasets.get_dataset_split_names",
        lambda dataset_id, subset: ["ja", "en"],
    )
    tasks = resolve_eval_tasks(
        registry=registry,
        dataset_values=["example/NanoToy"],
        collection_values=[],
        split_values=[],
    )
    assert [(task.dataset_name, task.split_name) for task in tasks] == [
        ("NanoToy", "ja"),
        ("NanoToy", "en"),
    ]


def test_resolve_dataset_splits_uses_yaml_splits_without_network() -> None:
    """Splits declared on the spec are returned as-is."""
    spec = NanoDatasetSpec(
        name="Toy",
        dataset_id="local/toy",
        corpus_config="corpus",
        queries_config="queries",
        qrels_config="qrels",
        candidate_config="bm25",
        splits=["a", "b"],
    )
    assert resolve_dataset_splits(spec) == ["a", "b"]


def test_resolve_dataset_revision_uses_huggingface_hub_sha(monkeypatch: pytest.MonkeyPatch) -> None:
    """The resolved revision is the ``sha`` reported by the (faked) hub API."""

    class FakeDatasetInfo:
        sha = "abc123"

    class FakeHfApi:
        def dataset_info(self, repo_id: str, revision: str | None = None) -> FakeDatasetInfo:
            assert repo_id == "owner/dataset"
            assert revision == "main"
            return FakeDatasetInfo()

    resolve_dataset_revision.cache_clear()
    monkeypatch.setattr("hakari_bench.datasets.HfApi", FakeHfApi)
    try:
        assert resolve_dataset_revision("owner/dataset", requested_revision="main") == {
            "requested": "main",
            "resolved": "abc123",
            "source": "huggingface_hub",
        }
    finally:
        # Drop the entry produced by the fake so it cannot be served to later tests.
        resolve_dataset_revision.cache_clear()


def test_resolve_dataset_revision_returns_unknown_when_hub_fails(monkeypatch: pytest.MonkeyPatch) -> None:
    """A hub failure is reported in the result instead of being raised."""

    class FakeHfApi:
        def dataset_info(self, repo_id: str, revision: str | None = None) -> object:
            raise RuntimeError(f"cannot resolve {repo_id}@{revision}")

    resolve_dataset_revision.cache_clear()
    monkeypatch.setattr("hakari_bench.datasets.HfApi", FakeHfApi)
    try:
        revision = resolve_dataset_revision("local/dataset", requested_revision=None)
        assert revision["requested"] is None
        assert revision["resolved"] is None
        assert revision["source"] == "huggingface_hub"
        assert revision["error"].startswith("RuntimeError:")
    finally:
        # Drop the entry produced by the fake so it cannot be served to later tests.
        resolve_dataset_revision.cache_clear()


def test_registry_loads_yaml_files(tmp_path: Path) -> None:
    """Datasets and collections written as YAML under a root directory are loaded by name."""
    (tmp_path / "datasets").mkdir()
    (tmp_path / "dataset_collections").mkdir()
    (tmp_path / "datasets" / "toy.yaml").write_text(
        """
name: Toy
dataset_id: local/toy
corpus_config: corpus
queries_config: queries
qrels_config: qrels
candidate_config: bm25
splits: [a]
""".strip(),
        encoding="utf-8",
    )
    (tmp_path / "dataset_collections" / "toy_collection.yaml").write_text(
        """
name: ToyCollection
datasets:
  - Toy
""".strip(),
        encoding="utf-8",
    )

    registry = DatasetRegistry.load_from_root(tmp_path)

    assert registry.get_dataset("Toy").dataset_id == "local/toy"
    assert registry.get_collection("ToyCollection").datasets == ["Toy"]


def test_registry_rejects_unknown_dataset_config_keys(tmp_path: Path) -> None:
    """An unrecognised key in a dataset YAML raises ``ValueError`` naming the key."""
    (tmp_path / "datasets").mkdir()
    (tmp_path / "datasets" / "toy.yaml").write_text(
        """
name: Toy
dataset_id: local/toy
unknown_key: value
""".strip(),
        encoding="utf-8",
    )

    with pytest.raises(ValueError, match="unknown_key"):
        DatasetRegistry.load_from_root(tmp_path)


def test_registry_preserves_dataset_and_task_metadata(tmp_path: Path) -> None:
    """Dataset, task, collection and inline-dataset metadata survive loading and task resolution."""
    (tmp_path / "datasets").mkdir()
    (tmp_path / "dataset_collections").mkdir()
    (tmp_path / "datasets" / "toy.yaml").write_text(
        """
name: Toy
dataset_id: local/toy
metadata:
  language: en
  category: natural_language
  short_description: Toy retrieval group.
  description: Toy dataset used to verify metadata loading.
  references:
    - title: Toy Paper
      authors: [Ada Example]
      year: 2024
      url: https://example.com/toy
  citation_keys: [toy2024]
  bibtex: |
    @misc{toy2024,
      title = {Toy Paper},
      year = {2024}
    }
splits: [a]
task_metadata:
  a:
    language: en
    category: natural_language
    short_description: Toy split.
    description: Toy split metadata survives loading and task resolution.
    query_text_stats:
      count: 2
      min_chars: 3
      max_chars: 5
      mean_chars: 4.0
      median_chars: 4.0
    document_text_stats:
      count: 3
      min_chars: 8
      max_chars: 10
      mean_chars: 9.0
      median_chars: 9.0
""".strip(),
        encoding="utf-8",
    )
    (tmp_path / "dataset_collections" / "toy_collection.yaml").write_text(
        """
name: ToyCollection
metadata:
  language: en
  category: natural_language
  short_description: Toy collection.
  description: Toy collection metadata survives loading.
datasets:
  - Toy
  - name: InlineToy
    dataset_id: local/inline-toy
    metadata:
      language: en
      category: natural_language
      short_description: Inline toy.
      description: Inline dataset metadata survives collection loading.
    splits: [inline]
    task_metadata:
      inline:
        language: en
        category: natural_language
        short_description: Inline toy split.
        description: Inline toy task metadata survives loading.
""".strip(),
        encoding="utf-8",
    )

    registry = DatasetRegistry.load_from_root(tmp_path)
    toy = registry.get_dataset("Toy")
    inline = registry.get_dataset("InlineToy")
    collection = registry.get_collection("ToyCollection")
    tasks = resolve_eval_tasks(
        registry=registry,
        dataset_values=["Toy"],
        collection_values=[],
        split_values=[],
    )

    assert toy.metadata is not None
    assert toy.task_metadata is not None
    assert inline.metadata is not None
    assert collection.metadata is not None
    assert toy.metadata["citation_keys"] == ["toy2024"]
    assert toy.task_metadata["a"]["query_text_stats"]["median_chars"] == 4.0
    assert inline.metadata["short_description"] == "Inline toy."
    assert collection.metadata["short_description"] == "Toy collection."
    assert tasks[0].metadata["short_description"] == "Toy split."


def test_builtin_metadata_is_complete_and_valid() -> None:
    """Validating the built-in metadata reports no errors."""
    errors = validate_builtin_metadata()
    assert errors == []


def test_metadata_validation_rejects_unknown_category() -> None:
    """A category outside the accepted set is reported as a validation error."""
    spec = NanoDatasetSpec(
        name="Toy",
        dataset_id="local/toy",
        metadata={
            "language": "en",
            "category": "other",
            "short_description": "Toy.",
            "description": "Toy metadata with an invalid category.",
        },
    )

    errors = spec.validate_metadata()

    assert errors == ["Toy metadata has invalid category 'other'."]


def test_metadata_validation_accepts_language_detection_fields() -> None:
    """Well-formed ``languages`` and ``language_detection`` fields validate cleanly."""
    spec = NanoDatasetSpec(
        name="Toy",
        dataset_id="local/toy",
        metadata={
            "language": "unknown",
            "languages": ["en", "ja"],
            "category": "natural_language",
            "short_description": "Toy metadata.",
            "description": "Toy metadata with detected language distributions.",
            "language_detection": {
                "detector": "fast-langdetect",
                "min_language_percent": 0.5,
                "main_language_percent": 10.0,
                "query": {"sample_count": 10, "languages": {"ja": 80.0, "en": 20.0}},
                "document": {"sample_count": 100, "languages": {"en": 81.0, "ja": 19.0}},
            },
        },
    )

    assert spec.validate_metadata() == []


def test_metadata_validation_rejects_invalid_language_detection_fields() -> None:
    """Each malformed language-detection field produces its own error, in a fixed order."""
    spec = NanoDatasetSpec(
        name="Toy",
        dataset_id="local/toy",
        metadata={
            "language": "en",
            "languages": ["english"],
            "category": "natural_language",
            "short_description": "Toy metadata.",
            "description": "Toy metadata with invalid language detection.",
            "language_detection": {
                "detector": 1,
                "min_language_percent": "0.5",
                "main_language_percent": 10.0,
                "query": {"sample_count": 10, "languages": {"engl": 100.0}},
                "document": {"sample_count": "100", "languages": {"en": "100"}},
            },
        },
    )

    assert spec.validate_metadata() == [
        "Toy metadata has invalid languages[0] 'english'.",
        "Toy metadata language_detection.detector must be string.",
        "Toy metadata language_detection.min_language_percent must be numeric.",
        "Toy metadata has invalid language_detection.query.languages key 'engl'.",
        "Toy metadata language_detection.document.sample_count must be integer.",
        "Toy metadata language_detection.document.languages['en'] must be numeric.",
    ]


def test_metadata_validation_requires_reference_is_paper_boolean() -> None:
    """A reference must carry ``is_paper``, and its value must be a boolean."""
    spec = NanoDatasetSpec(
        name="Toy",
        dataset_id="local/toy",
        metadata={
            "language": "en",
            "category": "natural_language",
            "short_description": "Toy.",
            "description": "Toy metadata with references.",
            "references": [
                {
                    "title": "Toy Paper",
                    "authors": ["A. Author"],
                    "year": 2024,
                    "url": "https://example.com/paper",
                    "source_confidence": "probably_correct",
                },
                {
                    "title": "Toy Blog",
                    "authors": ["B. Author"],
                    "year": 2024,
                    "url": "https://example.com/blog",
                    "is_paper": "no",
                    "source_confidence": "probably_correct",
                },
            ],
        },
    )

    errors = spec.validate_metadata()

    assert errors == [
        "Toy metadata references[0] is missing is_paper.",
        "Toy metadata references[1].is_paper must be boolean.",
    ]


def test_metadata_validation_requires_reference_source_confidence_label() -> None:
    """A reference must carry ``source_confidence``, and its value must be a known label."""
    spec = NanoDatasetSpec(
        name="Toy",
        dataset_id="local/toy",
        metadata={
            "language": "en",
            "category": "natural_language",
            "short_description": "Toy.",
            "description": "Toy metadata with references.",
            "references": [
                {
                    "title": "Toy Paper",
                    "authors": ["A. Author"],
                    "year": 2024,
                    "url": "https://example.com/paper",
                    "is_paper": True,
                },
                {
                    "title": "Toy Blog",
                    "authors": ["B. Author"],
                    "year": 2024,
                    "url": "https://example.com/blog",
                    "is_paper": False,
                    "source_confidence": "unchecked",
                },
            ],
        },
    )

    errors = spec.validate_metadata()

    assert errors == [
        "Toy metadata references[0] is missing source_confidence.",
        "Toy metadata references[1].source_confidence has invalid label 'unchecked'.",
    ]


def test_reference_source_confidence_labels_are_documented() -> None:
    """The label table has exactly the four expected labels and warns agents off ``human_verified``."""
    assert set(REFERENCE_SOURCE_CONFIDENCE_LABELS) == {
        "source_uncertain",
        "probably_correct",
        "definitive_paper_link",
        "human_verified",
    }
    assert "AI agents must not assign this label" in REFERENCE_SOURCE_CONFIDENCE_LABELS["human_verified"]