Mirror rafmacalaba/gliner2-datause-large-v1-deval-synth-v2 -> production
README.md CHANGED
@@ -45,8 +45,8 @@ from huggingface_hub import snapshot_download
 # Install the patched GLiNER2 library:
 # pip install git+https://github.com/rafmacalaba/GLiNER2.git@feat/main-mirror
 
-BASE_MODEL
-ADAPTER_ID
+BASE_MODEL = "fastino/gliner2-large-v1"
+ADAPTER_ID = "ai4data/datause-extraction"
 
 extractor = GLiNER2.from_pretrained(BASE_MODEL)
 extractor.load_adapter(snapshot_download(ADAPTER_ID))
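
Assembled, the updated setup is a short runnable script. A minimal sketch follows, assuming the patched fork installs as the `gliner2` package with `from gliner2 import GLiNER2`; the import line is not shown in the diff context, so treat it as an assumption:

```python
# Minimal sketch of the updated load sequence.
# Assumption: the patched GLiNER2 fork is importable as `gliner2`.
# pip install git+https://github.com/rafmacalaba/GLiNER2.git@feat/main-mirror
from huggingface_hub import snapshot_download
from gliner2 import GLiNER2  # assumed import path

BASE_MODEL = "fastino/gliner2-large-v1"
ADAPTER_ID = "ai4data/datause-extraction"

# Load the base model, then attach the data-use adapter from the Hub.
extractor = GLiNER2.from_pretrained(BASE_MODEL)
extractor.load_adapter(snapshot_download(ADAPTER_ID))
```
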
@@ -62,60 +62,59 @@ CLASSIFICATION_TASKS = {
     "usage_context": ["primary", "supporting", "background"],
 }
 
-
-
-# Pass 1 — extract entity spans
-entity_result = extractor.extract_entities(
-    text, ["data_mention"], threshold=0.3, include_confidence=True
-)
-spans = entity_result.get("entities", {}).get("data_mention", [])
-
-# Pass 2 — classify each span using its context window
-CONTEXT = 150
-results = []
-for span in spans:
-    mention = span.get("text", "")
-    start = text.find(mention)
-    ctx = text[max(0, start - CONTEXT) : start + len(mention) + CONTEXT]
-    context_str = f"Mention: {mention} | Context: {ctx}"
-
-    classes = extractor.classify_text(context_str, CLASSIFICATION_TASKS, threshold=0.3)
-    results.append({
-        "mention_name": mention,
-        "confidence": span.get("confidence", 0),
-        "specificity_tag": classes.get("specificity_tag", ("", 0))[0],
-        "typology_tag": classes.get("typology_tag", ("", 0))[0],
-        "is_used": classes.get("is_used", ("", 0))[0],
-        "usage_context": classes.get("usage_context", ("", 0))[0],
-    })
-
-print(results)
-```
+# texts: list of passage strings to run extraction on
+texts = ["We use the Demographic and Health Survey (DHS) 2020 as our primary data source."]
 
-
+BZ = 8  # batch size
 
-
-
-
-
-
+# Pass 1: batched entity extraction
+all_res_ent = []
+for i in range(0, len(texts), BZ):
+    batch = texts[i : i + BZ]
+    res = extractor.batch_extract_entities(
+        batch, ["data_mention"],
+        threshold=0.3,
+        batch_size=BZ,
+        include_confidence=True,
+    )
+    all_res_ent.extend(res)
 
-# Build
+# Build classification queue — one entry per valid extracted span
 classification_queue = []
 for idx, (res_ent, text) in enumerate(zip(all_res_ent, texts)):
-
-
-
-
-
-
-
-
-
-
-
-
-)
+    spans = (
+        res_ent.get("entities", {}).get("data_mention", [])
+        if isinstance(res_ent, dict)
+        else res_ent
+    )
+    for span_data in spans:
+        span_text = span_data.get("text", "") if isinstance(span_data, dict) else str(span_data)
+        span_conf = span_data.get("confidence", 0.0) if isinstance(span_data, dict) else 1.0
+        if len(span_text) < 3:
+            continue
+        start = text.find(span_text)
+        ctx_start = max(0, start - 150) if start != -1 else 0
+        ctx_end = min(len(text), start + len(span_text) + 150) if start != -1 else len(text)
+        context_str = f"Mention: {span_text} | Context: {text[ctx_start:ctx_end]}"
+        classification_queue.append((idx, span_text, span_conf, context_str))
+
+# Pass 2: batched zero-shot classification on context windows
+all_classes = []
+for i in range(0, len(classification_queue), BZ):
+    batch_ctx = [q[3] for q in classification_queue[i : i + BZ]]
+    res = extractor.batch_classify_text(
+        batch_ctx, CLASSIFICATION_TASKS, threshold=0.3, batch_size=BZ
+    )
+    all_classes.extend(res)
+
+# Assemble results grouped by source chunk index
+chunk_results = {i: [] for i in range(len(texts))}
+for q_item, classes in zip(classification_queue, all_classes):
+    idx, span_text, conf, _ = q_item
+    mention = {"mention_name": span_text, "confidence": conf}
+    for task, out in classes.items():
+        mention[task] = out[0] if isinstance(out, tuple) and len(out) == 2 else out
+    chunk_results[idx].append(mention)
 ```
 
 ## Training Details
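
The new snippet leaves its output in `chunk_results`, a dict mapping each input text's index to its list of classified mentions. A minimal sketch of consuming it, using only names defined in the code above (the print format here is illustrative, not part of the README):

```python
# Illustrative only: walk the grouped results produced by the snippet above.
# Each mention dict holds "mention_name", "confidence", and one predicted
# label per classification task (specificity_tag, typology_tag, is_used,
# usage_context).
for idx, mentions in chunk_results.items():
    print(f"[{idx}] {texts[idx]}")
    for m in mentions:
        print(
            f"  - {m['mention_name']} (conf={m['confidence']:.2f}) "
            f"specificity={m.get('specificity_tag', '?')} "
            f"used={m.get('is_used', '?')}"
        )
```
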
|