import base64 import re import json import pandas as pd import gradio as gr import pyterrier as pt pt.init() import pyt_splade from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_Q, EX_D, df2list from transformers import AutoTokenizer, AutoModel import os os.environ["HF_ENDPOINT"] = "https://huggingface.co" _ = AutoModel.from_pretrained( "naver/splade-cocondenser-ensembledistil", trust_remote_code=True ) _ = AutoTokenizer.from_pretrained( "naver/splade-cocondenser-ensembledistil", trust_remote_code=True ) factory_max = pyt_splade.Splade(agg='max') factory_sum = pyt_splade.Splade(agg='sum') COLAB_NAME = 'pyterrier_splade.ipynb' COLAB_INSTALL = ''' !pip install -q git+https://github.com/naver/splade !pip install -q git+https://github.com/cmacdonald/pyt_splade '''.strip() def generate_vis(df, mode='Document'): if len(df) == 0: return '' result = [] if mode == 'Document': max_score = max(max(t.values()) for t in df['toks']) for row in df.itertuples(index=False): if mode == 'Query': tok_scores = row.query_toks orig_tokens = factory_max.tokenizer.tokenize(row.query) max_score = max(tok_scores.values()) id = row.qid else: tok_scores = row.toks orig_tokens = factory_max.tokenizer.tokenize(row.text) id = row.docno def toks2span(toks): return ' '.join(f'{t}' for t in toks) orig_tokens_set = set(orig_tokens) exp_tokens = [t for t, v in sorted(tok_scores.items(), key=lambda x: (-x[1], x[0])) if t not in orig_tokens_set] result.append(f'''