rafmacalaba commited on
Commit
9f17956
·
verified ·
1 Parent(s): dcddeb3

Mirror rafmacalaba/gliner2-datause-large-v1-deval-synth-v2 -> production

Browse files
Files changed (1) hide show
  1. README.md +28 -79
README.md CHANGED
@@ -62,85 +62,34 @@ CLASSIFICATION_TASKS = {
62
  "usage_context": ["primary", "supporting", "background"],
63
  }
64
 
65
- # texts: list of passage strings to run extraction on
66
- texts = ["We use the Demographic and Health Survey (DHS) 2020 as our primary data source."]
67
-
68
- BZ = 8 # batch size
69
-
70
- # Pass 1: batched entity extraction
71
- all_res_ent = []
72
- for i in range(0, len(texts), BZ):
73
- batch = texts[i : i + BZ]
74
- res = extractor.batch_extract_entities(
75
- batch, ["data_mention"],
76
- threshold=0.3,
77
- batch_size=BZ,
78
- include_confidence=True,
79
- )
80
- all_res_ent.extend(res)
81
-
82
- # Build classification queue — one entry per valid extracted span
83
- classification_queue = []
84
- for idx, (res_ent, text) in enumerate(zip(all_res_ent, texts)):
85
- spans = (
86
- res_ent.get("entities", {}).get("data_mention", [])
87
- if isinstance(res_ent, dict)
88
- else res_ent
89
- )
90
- for span_data in spans:
91
- span_text = span_data.get("text", "") if isinstance(span_data, dict) else str(span_data)
92
- span_conf = span_data.get("confidence", 0.0) if isinstance(span_data, dict) else 1.0
93
- if len(span_text) < 3:
94
- continue
95
- start = text.find(span_text)
96
- ctx_start = max(0, start - 150) if start != -1 else 0
97
- ctx_end = min(len(text), start + len(span_text) + 150) if start != -1 else len(text)
98
- context_str = f"Mention: {span_text} | Context: {text[ctx_start:ctx_end]}"
99
- classification_queue.append((idx, span_text, span_conf, context_str))
100
-
101
- # Pass 2: batched zero-shot classification on context windows
102
- all_classes = []
103
- for i in range(0, len(classification_queue), BZ):
104
- batch_ctx = [q[3] for q in classification_queue[i : i + BZ]]
105
- res = extractor.batch_classify_text(
106
- batch_ctx, CLASSIFICATION_TASKS, threshold=0.3, batch_size=BZ
107
- )
108
- all_classes.extend(res)
109
-
110
- # Assemble results grouped by source chunk index
111
- chunk_results = {i: [] for i in range(len(texts))}
112
- for q_item, classes in zip(classification_queue, all_classes):
113
- idx, span_text, conf, _ = q_item
114
- mention = {"mention_name": span_text, "confidence": conf}
115
  for task, out in classes.items():
116
  mention[task] = out[0] if isinstance(out, tuple) and len(out) == 2 else out
117
- chunk_results[idx].append(mention)
118
- ```
119
-
120
- ## Training Details
121
-
122
- | Property | Value |
123
- |---|---|
124
- | Base model | `fastino/gliner2-large-v1` |
125
- | Method | LoRA (r=16, alpha=32.0) |
126
- | Target modules | `encoder`, `span_rep`, `classifier`, `count_embed`, `count_pred` |
127
- | Training examples | 8,791 |
128
- | Validation examples | 651 |
129
- | Best val loss | 439.45 |
130
- | GLiNER2 branch | `rafmacalaba/GLiNER2@feat/main-mirror` |
131
- | Training dataset | [ai4data/datause-train](https://huggingface.co/datasets/ai4data/datause-train) |
132
-
133
- ## Evaluation
134
 
135
- Evaluated on a 630-chunk human-annotated holdout set using Jaccard similarity
136
- matching (threshold 0.5) at confidence threshold 0.30:
137
-
138
- | Metric | Score |
139
- |---|---|
140
- | F1 | see [DataUse Evaluation Hub](https://github.com/rafmacalaba/monitoring_of_datause) |
141
- | Precision | — |
142
- | Recall | — |
143
-
144
- ## Citation
145
-
146
- If you use this model, please cite the monitoring_of_datause project.
 
62
  "usage_context": ["primary", "supporting", "background"],
63
  }
64
 
65
+ text = "We use the Demographic and Health Survey (DHS) 2020 as our primary data source."
66
+
67
+ # Pass 1: entity extraction
68
+ res_ent = extractor.extract_entities(text, ["data_mention"], threshold=0.3, include_confidence=True)
69
+ spans = (
70
+ res_ent.get("entities", {}).get("data_mention", [])
71
+ if isinstance(res_ent, dict)
72
+ else res_ent
73
+ )
74
+
75
+ # Build classification inputs for each valid span
76
+ results = []
77
+ for span_data in spans:
78
+ span_text = span_data.get("text", "") if isinstance(span_data, dict) else str(span_data)
79
+ span_conf = span_data.get("confidence", 0.0) if isinstance(span_data, dict) else 1.0
80
+ if len(span_text) < 3:
81
+ continue
82
+ start = text.find(span_text)
83
+ ctx_start = max(0, start - 150) if start != -1 else 0
84
+ ctx_end = min(len(text), start + len(span_text) + 150) if start != -1 else len(text)
85
+ context_str = f"Mention: {span_text} | Context: {text[ctx_start:ctx_end]}"
86
+
87
+ # Pass 2: classify the span's context window
88
+ classes = extractor.classify_text(context_str, CLASSIFICATION_TASKS, threshold=0.3)
89
+ mention = {"mention_name": span_text, "confidence": span_conf}

90
  for task, out in classes.items():
91
  mention[task] = out[0] if isinstance(out, tuple) and len(out) == 2 else out
92
+ results.append(mention)

93
 
94
+ print(results)
95
+ ```