import gradio as gr
import pandas as pd
import tempfile
import ast
from grewtse.pipeline import GrewTSEPipe
from grewtse.evaluators import GrewTSEvaluator
# Single module-level pipeline instance; shared mutable state for all
# Gradio callbacks below (parse -> mask/prompt -> minimal pairs -> evaluate).
grewtse = GrewTSEPipe()
def parse_treebank(path: str, treebank_selection: str) -> pd.DataFrame:
    """Parse an uploaded treebank file or one of the bundled sample treebanks.

    Args:
        path: Filepath of an uploaded .conllu file; only used when
            ``treebank_selection`` is the literal string "None".
        treebank_selection: Relative name of a bundled treebank under
            ``./datasets``, or "None" to use the uploaded file instead.

    Returns:
        The tail of the parsed treebank's morphological-feature table,
        shown in the UI as a quick sanity check that parsing succeeded.
    """
    if treebank_selection == "None":
        grewtse.parse_treebank(path)
    else:
        grewtse.parse_treebank(f"./datasets/{treebank_selection}")
    return grewtse.get_morphological_features().tail()
def to_masked_dataset(query, node) -> pd.DataFrame:
    """Build a masked-token dataset from sentences matching the GREW query."""
    return grewtse.generate_masked_dataset(query, node)
def to_prompt_dataset(query, node) -> pd.DataFrame:
    """Build a prompt-based dataset (sentences cut off at the target word)."""
    return grewtse.generate_prompt_dataset(query, node)
def safe_str_to_dict(s):
    """Safely parse a Python-literal string into a dictionary.

    Args:
        s: User-supplied text expected to contain a dict literal,
            e.g. ``'{"case": "Dat"}'``.

    Returns:
        The parsed dict, or ``None`` when the string is not a valid Python
        literal *or* does not evaluate to a dict. Callers immediately call
        ``.keys()`` on the result, so anything non-dict must be rejected
        here rather than crash later.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return None
    # literal_eval also accepts lists, tuples, numbers, strings, etc.
    return parsed if isinstance(parsed, dict) else None
def truncate_text(text, max_len=50):
    """
    Truncate a string to max_len characters and append '...' if it was longer.
    """
    if not isinstance(text, str):
        # Non-string cells (NaN, numbers, ...) pass through untouched.
        return text
    if len(text) <= max_len:
        return text
    return text[:max_len] + "..."
def generate_minimal_pairs(query: str, node: str, alt_features: str, task_type: str):
    """Run the GREW query and build a minimal-pair test dataset for the target.

    Args:
        query: GREW query isolating the syntactic phenomenon.
        node: Variable name in the query marking the target word.
        alt_features: Dict literal (as a string) with the feature values that
            form the ungrammatical member of each minimal pair.
        task_type: "Masked" for encoder (MLM) tests, "Prompt" for decoder tests.

    Returns:
        A tuple of (preview DataFrame with truncated text, path to a CSV
        file containing the full dataset) for the Gradio outputs.

    Raises:
        ValueError: If no treebank has been parsed yet.
        Exception: If the feature dict is invalid or the task type is unknown.
    """
    if not grewtse.is_treebank_parsed():
        raise ValueError("Please parse a treebank first.")

    # determine whether an alternative LI should be found
    alt_features_as_dict = safe_str_to_dict(alt_features)
    # BUG FIX: the None check must run before touching the dict, otherwise
    # invalid input crashed with AttributeError instead of this message.
    if alt_features_as_dict is None:
        raise Exception("Invalid features provided.")

    # a hack just to convert features to lowercase (for now)
    # TODO: Improve
    alt_features_as_dict = {k.lower(): v for k, v in alt_features_as_dict.items()}

    has_leading_whitespace = False
    is_encoder = False
    masked_or_prompt_df = None
    if task_type.lower() == "masked":
        # mask the target word in the sentence
        masked_or_prompt_df = to_masked_dataset(query, node)
        has_leading_whitespace = False
        is_encoder = True
    elif task_type.lower() == "prompt":
        # create prompts from each sentence (i.e. cut them off right at the target word)
        masked_or_prompt_df = to_prompt_dataset(query, node)
        has_leading_whitespace = True
    else:
        raise Exception("Invalid task type.")

    full_dataset = grewtse.generate_minimal_pair_dataset(
        alt_features_as_dict,
        ood_pairs=None,
        has_leading_whitespace=has_leading_whitespace,
    )

    # save to a temporary CSV file (full, untruncated dataset)
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    full_dataset.to_csv(temp_file.name, index=False)

    # .copy() so truncating the preview column operates on an independent
    # frame instead of a view (avoids pandas' SettingWithCopyWarning and
    # any mutation of full_dataset).
    if is_encoder:
        dataset_for_vis = full_dataset[
            ["masked_text", "form_grammatical", "form_ungrammatical"]
        ].copy()
        dataset_for_vis["masked_text"] = dataset_for_vis["masked_text"].apply(
            truncate_text
        )
    else:
        dataset_for_vis = full_dataset[
            ["prompt_text", "form_grammatical", "form_ungrammatical"]
        ].copy()
        dataset_for_vis["prompt_text"] = dataset_for_vis["prompt_text"].apply(
            truncate_text
        )

    num_exceptions = grewtse.get_num_exceptions()
    num_targets_parsed = len(masked_or_prompt_df)
    num_success = len(full_dataset)
    # BUG FIX: message was a broken multi-line f-string; build it with
    # implicit concatenation instead.
    exceptions_info = (
        f"{num_targets_parsed + num_exceptions} targets identified and turned into "
        f"masks/prompts. {num_exceptions} of these could not be used due to "
        f"treebank structure issues. After searching for minimal pairs, a total of "
        f"{num_success} minimal-pair syntactic tests were successfully generated."
    )
    gr.Info(exceptions_info, duration=60, title="Grew-TSE Results")
    return dataset_for_vis, temp_file.name
def evaluate_model(model_repo: str, task_type: str, probability_calculation: str):
    """Evaluate a Hugging Face model on the previously generated minimal pairs.

    Args:
        model_repo: Hugging Face repository id of the model to evaluate.
        task_type: "Masked" (mapped to "mlm") or anything else (mapped to "ntp").
        probability_calculation: Token- or sentence-level probability mode,
            passed straight through to the evaluator.

    Returns:
        A tuple of (single-row accuracy DataFrame, path to a CSV file with
        the per-pair evaluation results).

    Raises:
        ValueError: If minimal pairs have not been generated yet.
    """
    if not grewtse.are_minimal_pairs_generated():
        raise ValueError(
            "Please parse a treebank, mask a dataset and generate minimal pairs first."
        )

    evaluator = GrewTSEvaluator()
    task_type = "mlm" if task_type.lower() == "masked" else "ntp"
    mp_with_eval_dataset = evaluator.evaluate_model(
        grewtse.get_minimal_pair_dataset(),
        model_repo,
        task_type,
        probability_calculation,
    )

    metrics = pd.DataFrame([evaluator.get_accuracy()], columns=["Accuracy"])
    print("===METRICS===")
    print(metrics)
    print("----")

    # save to a temporary CSV file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    mp_with_eval_dataset.to_csv(temp_file.name, index=False)
    return metrics, temp_file.name
def show_df():
    # Second click handler used to reveal an initially hidden Dataframe
    # component once results are ready.
    return gr.update(visible=True)
# UI layout and event wiring for the dashboard.
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    with gr.Row():
        gr.Markdown(
            """# Grew-TSE Dashboard

            Grew-TSE is a tool for the query-based generation of custom minimal-pair syntactic tests from treebanks for Targeted Syntactic Evaluation of LLMs. The query language of choice is [GREW (Graph Rewriting for NLP)](https://grew.fr/).
            This dashboard allows you to use it to generate such tests through a User Interface.
            If you feel like it, you can pronounce it a bit like the german word Grütze, meaning grits or groats.

            The general research question that Grew-TSE aims to help answer is:

            _Can language models distinguish grammatical from ungrammatical sentences across syntactic phenomena and languages?_

            This means that if you speak a language, especially one that is low-resource but has a UD treebank, then you likely have something novel you could test in this area.

            The pipeline generally looks something like the following:

            1. Parse a Universal Dependencies treebank in CoNLL-U format
            2. Isolate a specific syntactic phenomenon (e.g. verbal agreement) using a [GREW query](http://grew.fr/).
            3. Convert these isolated sentences into masked- or prompt-based datasets.
            3. Search the original treebank for words that differ by one syntactic feature to form a minimal pair.
            4. Evaluate a model available on the Hugging Face platform and view metrics such as accuracy, precision, recall, and the F1 score.

            See the full documentation [here](https://grew-tse.readthedocs.io/en/latest/).
            It is available on the Python Package Index [here](https://pypi.org/project/grew-tse/).
            """
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                ## Load a Treebank
                You can begin by loading up a particular treebank that you'd like to work with.
                You can either select a treebank from the pre-loaded options below, or upload your own.
                """
            )
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("Choose Treebank"):
                    treebank_selection = gr.Dropdown(
                        choices=[
                            # "None",
                            # "de/German-UD-HDT-SM.conllu",
                            # "de/German-UD-HDT-MED.conllu",
                            "de/German-UD-HDT-25K.conllu",
                            "de/German-UD-HDT-100K.conllu",
                            "en/English-UD-EWT-25K.conllu",
                            "en/English-UD-EWT-100K.conllu",
                            "ukr/Ukrainian-UD-IU-25K.conllu",
                            "ukr/Ukrainian-UD-IU-100K.conllu",
                            "fr/French-UD-FTB.conllu",
                            "es/Spanish-UD-AnCora.conllu",
                        ],
                        label="Select a treebank",
                        value="de/German-UD-HDT-25K.conllu",
                    )
                with gr.TabItem("Upload Your Own"):
                    gr.Markdown("## Upload a .conllu File")
                    gr.Markdown("If you want to combine multiple `.conllu` files into one, use the command `cat train.conllu dev.conllu test.conllu >> combined.conllu`.")
                    file_input = gr.File(
                        label="Upload .conllu file",
                        file_types=[".conllu"],
                        type="filepath",
                    )

            parse_file_button = gr.Button("Parse Treebank", size="sm", scale=0)
            morph_table = gr.Dataframe(interactive=False, visible=False)

            # First click parses and fills the table; second reveals it.
            parse_file_button.click(
                fn=parse_treebank,
                inputs=[file_input, treebank_selection],
                outputs=[morph_table],
            )
            parse_file_button.click(fn=show_df, outputs=morph_table)

    gr.Markdown("## Isolate A Syntactic Phenomenon")
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                **Grew** is a query and transformation language used to search within and manipulate dependency treebanks.
                A GREW query allows linguists and NLP researchers to find specific syntactic patterns in parsed linguistic data (such as Universal Dependencies treebanks).
                Queries are expressed as graph constraints using a concise pattern-matching syntax.

                The following short GREW query will find target any verbs. Try it with one of the sample treebanks above.
                Make sure to include the variable V as the target that we're trying to isolate.

                #### Searching for Types of Words
                ```js
                pattern { V [upos="VERB"]; }
                ```

                In the above example, V constitutes the "nodes" in our graph, for which we can define properties using the square-bracket notation.
                These often correspond to individual words, but depending on the treebank's design choices these nodes may be divided up differently e.g. as morphemes.
                Here we will stick to referring to them as words for simplicity.

                #### Searching for Relations between Words
                We can search for specific types of relations between word types using the `X-[relation]->Y` notation.
                A few example relations are `aux` for auxiliaries (e.g. "would"), `conj` for conjunctions e.g. "and", as well as `nsubj` and `obj` for subject-object relations.
                ```js
                pattern { X -[advcl]-> Y; }
                ```

                #### Filtering Out Types of Words or Relations
                You can search for anything that is -not- of a specific type using the `<>` operator.
                The pipe `|` can be used as the OR operator. For instance:
                Below we search for adverbial clauses such that the governor's tag is not noun or verb.
                ```js
                pattern { X [upos <> "NOUN" | "VERB"]; X -[advcl]-> Y; }
                ```

                You can choose not to include entire patterns using the `without` clause.
                ```js
                pattern { V [upos=VERB]; }
                without { V -[obj]-> O }
                ```

                Try out these examples on many UD treebanks at [Grew-Match](https://universal.grew.fr).

                #### Feeling Brave?
                Try out the below query to find minimal-pair tests for German.
                You should be able to identify transitive (i.e. subject-object) constructions where the object has a determiner (e.g. "the","a",etc).
                Using the below feature change, you can test a model's knowledge of German case.
                ```js
                pattern {
                    V [upos="VERB"];
                    O [upos="NOUN"];
                    D [upos="DET", Case="Acc"];
                    V -[nsubj]-> S;
                    V -[obj]-> O;
                    O -[det]-> D;
                }
                ```

                Use the following dictionary to find minimal pairs with case (Accusative, Dative).
                In this case _(no pun intended)_, the accusative is grammatical and the dative ungrammatical.
                ```
                { "case": "Dat" }
                ```
                """
            )
        with gr.Column():
            query_input = gr.Code(
                label="GREW Query",
                lines=5,
                value="""
pattern {
    V [upos="VERB"];
    O [upos="NOUN"];
    D [upos="DET", Case="Acc"];
    V -[nsubj]-> S;
    V -[obj]-> O;
    O -[det]-> D;
}
""",
                language="javascript",  # no option for GREW, but this is close enough
            )
            node_input = gr.Textbox(
                label="Target",
                placeholder="The variable in your GREW query to isolate, e.g., D for the above determiner.",
                value="D",
            )
            feature_input = gr.Code(
                label="Enter Alternative Feature Values for Minimal Pair as a Dictionary",
                value="""{ "case": "Dat" }""",
                lines=3,
                language="python",  # again, no GREW option; the value is a Python dict literal
            )
            task_type = gr.Dropdown(
                choices=[
                    "Masked",
                    "Prompt",
                ],
                label="Select whether you want masked- or prompt-based tests.",
                value="Masked",
            )
            run_button = gr.Button("Run Query", size="sm", scale=0)
            output_table = gr.Dataframe(label="Output Table", visible=False)
            download_file = gr.File(label="Download CSV")

            run_button.click(
                fn=generate_minimal_pairs,
                inputs=[query_input, node_input, feature_input, task_type],
                outputs=[output_table, download_file],
            )
            run_button.click(fn=show_df, outputs=output_table)

    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                ## Evaluate A Model
                You can evaluate models trained either for MLM or NTP tasks that are available on the Hugging Face platform.

                The primary means of evaluating models is accuracy i.e. the proportion of tests where the model is “correct”. In Targeted Syntactic Evaluation, a model is typically deemed correct when P(Grammatical Item) > P(Ungrammatical Item). How these probabilities are calculated however can lead to differing results. This package allows you to choose between token- or sentence-level; the former takes the joint probability of just the tokens in the target word, while the latter takes the joint probability of all tokens in the sentence.
                """
            )
        with gr.Column():
            repository_input = gr.Textbox(
                label="Model Repository",
                lines=1,
                placeholder="Enter the model repository here...",
                value="distilbert/distilbert-base-multilingual-cased",
            )
            probability_calculation = gr.Dropdown(
                choices=[
                    "token-level",
                    # NOTE(review): inconsistent casing vs "token-level" —
                    # confirm what casing the evaluator expects before changing.
                    "sentence-Level",
                ],
                # BUG FIX: label was a broken multi-line string literal.
                label=(
                    "Select if the probabilities assigned to words are calculated "
                    "at the word- or sentence-level. For decoder models, "
                    "sentence-level is recommended where the grammaticality is "
                    "not determined linearly earlier than the target word."
                ),
                value="token-level",
            )
        with gr.Column():
            evaluate_button = gr.Button("Evaluate Model", size="sm", scale=0)
            mp_with_eval_output_dataset = gr.Dataframe(
                label="Output Table", visible=False
            )
            mp_with_eval_output_download = gr.File(label="Download CSV")

            # BUG FIX: metrics previously went to a fresh anonymous
            # gr.DataFrame() that was never rendered; wire them to the
            # component defined above and reveal it on click, mirroring
            # the parse/run buttons.
            evaluate_button.click(
                fn=evaluate_model,
                inputs=[repository_input, task_type, probability_calculation],
                outputs=[
                    mp_with_eval_output_dataset,
                    mp_with_eval_output_download,
                ],
            )
            evaluate_button.click(fn=show_df, outputs=mp_with_eval_output_dataset)
if __name__ == "__main__":
    # share=True asks Gradio for a temporary public URL in addition to
    # the local server.
    demo.launch(share=True)