rafmacalaba commited on
Commit
9f17956
·
verified ·
1 Parent(s): dcddeb3

Mirror rafmacalaba/gliner2-datause-large-v1-deval-synth-v2 -> production

Browse files
Files changed (1) hide show
  1. README.md +28 -79
README.md CHANGED
@@ -62,85 +62,34 @@ CLASSIFICATION_TASKS = {
62
  "usage_context": ["primary", "supporting", "background"],
63
  }
64
 
65
- # texts: list of passage strings to run extraction on
66
- texts = ["We use the Demographic and Health Survey (DHS) 2020 as our primary data source."]
67
-
68
- BZ = 8 # batch size
69
-
70
- # Pass 1: batched entity extraction
71
- all_res_ent = []
72
- for i in range(0, len(texts), BZ):
73
- batch = texts[i : i + BZ]
74
- res = extractor.batch_extract_entities(
75
- batch, ["data_mention"],
76
- threshold=0.3,
77
- batch_size=BZ,
78
- include_confidence=True,
79
- )
80
- all_res_ent.extend(res)
81
-
82
- # Build classification queue — one entry per valid extracted span
83
- classification_queue = []
84
- for idx, (res_ent, text) in enumerate(zip(all_res_ent, texts)):
85
- spans = (
86
- res_ent.get("entities", {}).get("data_mention", [])
87
- if isinstance(res_ent, dict)
88
- else res_ent
89
- )
90
- for span_data in spans:
91
- span_text = span_data.get("text", "") if isinstance(span_data, dict) else str(span_data)
92
- span_conf = span_data.get("confidence", 0.0) if isinstance(span_data, dict) else 1.0
93
- if len(span_text) < 3:
94
- continue
95
- start = text.find(span_text)
96
- ctx_start = max(0, start - 150) if start != -1 else 0
97
- ctx_end = min(len(text), start + len(span_text) + 150) if start != -1 else len(text)
98
- context_str = f"Mention: {span_text} | Context: {text[ctx_start:ctx_end]}"
99
- classification_queue.append((idx, span_text, span_conf, context_str))
100
-
101
- # Pass 2: batched zero-shot classification on context windows
102
- all_classes = []
103
- for i in range(0, len(classification_queue), BZ):
104
- batch_ctx = [q[3] for q in classification_queue[i : i + BZ]]
105
- res = extractor.batch_classify_text(
106
- batch_ctx, CLASSIFICATION_TASKS, threshold=0.3, batch_size=BZ
107
- )
108
- all_classes.extend(res)
109
-
110
- # Assemble results grouped by source chunk index
111
- chunk_results = {i: [] for i in range(len(texts))}
112
- for q_item, classes in zip(classification_queue, all_classes):
113
- idx, span_text, conf, _ = q_item
114
- mention = {"mention_name": span_text, "confidence": conf}
115
  for task, out in classes.items():
116
  mention[task] = out[0] if isinstance(out, tuple) and len(out) == 2 else out
117
- chunk_results[idx].append(mention)
118
- ```
119
-
120
- ## Training Details
121
-
122
- | Property | Value |
123
- |---|---|
124
- | Base model | `fastino/gliner2-large-v1` |
125
- | Method | LoRA (r=16, alpha=32.0) |
126
- | Target modules | `encoder`, `span_rep`, `classifier`, `count_embed`, `count_pred` |
127
- | Training examples | 8,791 |
128
- | Validation examples | 651 |
129
- | Best val loss | 439.45 |
130
- | GLiNER2 branch | `rafmacalaba/GLiNER2@feat/main-mirror` |
131
- | Training dataset | [ai4data/datause-train](https://huggingface.co/datasets/ai4data/datause-train) |
132
-
133
- ## Evaluation
134
 
135
- Evaluated on a 630-chunk human-annotated holdout set using Jaccard similarity
136
- matching (threshold 0.5) at confidence threshold 0.30:
137
-
138
- | Metric | Score |
139
- |---|---|
140
- | F1 | see [DataUse Evaluation Hub](https://github.com/rafmacalaba/monitoring_of_datause) |
141
- | Precision | — |
142
- | Recall | — |
143
-
144
- ## Citation
145
-
146
- If you use this model, please cite the monitoring_of_datause project.
 
62
  "usage_context": ["primary", "supporting", "background"],
63
  }
64
 
65
+ text = "We use the Demographic and Health Survey (DHS) 2020 as our primary data source."
66
+
67
+ # Pass 1: entity extraction
68
+ res_ent = extractor.extract_entities(text, ["data_mention"], threshold=0.3, include_confidence=True)
69
+ spans = (
70
+ res_ent.get("entities", {}).get("data_mention", [])
71
+ if isinstance(res_ent, dict)
72
+ else res_ent
73
+ )
74
+
75
+ # Build classification inputs for each valid span
76
+ results = []
77
+ for span_data in spans:
78
+ span_text = span_data.get("text", "") if isinstance(span_data, dict) else str(span_data)
79
+ span_conf = span_data.get("confidence", 0.0) if isinstance(span_data, dict) else 1.0
80
+ if len(span_text) < 3:
81
+ continue
82
+ start = text.find(span_text)
83
+ ctx_start = max(0, start - 150) if start != -1 else 0
84
+ ctx_end = min(len(text), start + len(span_text) + 150) if start != -1 else len(text)
85
+ context_str = f"Mention: {span_text} | Context: {text[ctx_start:ctx_end]}"
86
+
87
+ # Pass 2: classify the span's context window
88
+ classes = extractor.classify_text(context_str, CLASSIFICATION_TASKS, threshold=0.3)
89
+ mention = {"mention_name": span_text, "confidence": span_conf}

90
  for task, out in classes.items():
91
  mention[task] = out[0] if isinstance(out, tuple) and len(out) == 2 else out
92
+ results.append(mention)

93
 
94
+ print(results)
95
+ ```