Spaces:

dngan0365
/

informationretrieval

Paused

App Files Files Community

dngan0365 committed on Dec 30, 2025

Commit

d9255cf

1 Parent(s): 1f5eadf

Initial deploy FastAPI backend to Hugging Face Space

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env +0 -0
.gitattributes +2 -0
.gitignore +4 -0
Dockerfile +10 -0
main.py +502 -0
requirements.txt +123 -0
retrieval/data/0.html +0 -0
retrieval/data/1.html +0 -0
retrieval/data/10.html +0 -0
retrieval/data/100.html +0 -0
retrieval/data/101.html +0 -0
retrieval/data/102.html +0 -0
retrieval/data/103.html +0 -0
retrieval/data/104.html +0 -0
retrieval/data/105.html +0 -0
retrieval/data/106.html +0 -0
retrieval/data/107.html +0 -0
retrieval/data/108.html +0 -0
retrieval/data/109.html +0 -0
retrieval/data/11.html +0 -0
retrieval/data/110.html +0 -0
retrieval/data/111.html +0 -0
retrieval/data/112.html +0 -0
retrieval/data/113.html +0 -0
retrieval/data/114.html +0 -0
retrieval/data/115.html +0 -0
retrieval/data/116.html +0 -0
retrieval/data/117.html +0 -0
retrieval/data/118.html +0 -0
retrieval/data/119.html +0 -0
retrieval/data/12.html +0 -0
retrieval/data/120.html +0 -0
retrieval/data/121.html +0 -0
retrieval/data/122.html +0 -0
retrieval/data/123.html +0 -0
retrieval/data/124.html +0 -0
retrieval/data/125.html +0 -0
retrieval/data/126.html +0 -0
retrieval/data/127.html +0 -0
retrieval/data/128.html +0 -0
retrieval/data/129.html +0 -0
retrieval/data/13.html +0 -0
retrieval/data/130.html +2 -0
retrieval/data/131.html +0 -0
retrieval/data/132.html +0 -0
retrieval/data/133.html +0 -0
retrieval/data/134.html +0 -0
retrieval/data/135.html +0 -0
retrieval/data/136.html +0 -0
retrieval/data/137.html +0 -0

.env ADDED Viewed

File without changes

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+retrieval/docs_no_stop.json filter=lfs diff=lfs merge=lfs -text
+retrieval/ind/*.seg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+__pycache__/
+.venv/
+venv/
+env/

Dockerfile ADDED Viewed

	@@ -0,0 +1,10 @@

+# Base image: official Python 3.11 image.
+FROM python:3.11
+WORKDIR /app
+# Install dependencies before copying the rest of the sources so this layer
+# can be reused when only application files change.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Serve the FastAPI instance `app` from main.py on all interfaces, port 7860.
+# NOTE(review): 7860 is presumably the port the hosting Space expects - confirm.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

main.py ADDED Viewed

	@@ -0,0 +1,502 @@

+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import List, Dict, Optional
+import pandas as pd
+from whoosh import index, qparser
+from whoosh.qparser import MultifieldParser
+from whoosh.scoring import BM25F
+from whoosh.index import open_dir
+import os
+import nltk
+from nltk import sent_tokenize
+import re
+import unicodedata
+from pyvi import ViTokenizer
+# Fetch NLTK resources at import time, before any request is served.
+# NOTE(review): 'punkt_tab' is presumably what sent_tokenize needs; the
+# 'stopwords' corpus does not appear to be read anywhere in this file
+# (stopwords come from STOPWORDS_PATH) - confirm before removing.
+nltk.download('punkt_tab')
+nltk.download('stopwords')
+app = FastAPI(title="Document Search API")
+# CORS configuration: only the origins listed below are allowed
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        "http://localhost:3000",  # Next.js development
+        "http://localhost:3001",  # Alternative port
+        "https://blue-information-retrieval.vercel.app", # Production frontend URL
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Configuration: relative paths to the index, metadata and text resources
+INDEX_DIR = "./retrieval/ind"
+META_CSV = "./retrieval/final_document_tfidf_pagerank.csv"
+DATA_CLEAN_DIR = "./retrieval/data_clean"
+IMAGE_CSV = "./retrieval/docs_with_images.csv"  # CSV file containing the image URLs
+STOPWORDS_PATH = "./retrieval/vietnamese-stopwords-dash.txt"
+# Global variables: ix, meta_df, image_df, pagerank_dict and vi_stopwords are
+# assigned in startup_event; docs_cache is filled by load_document_content
+ix = None
+meta_df = None
+image_df = None  # DataFrame holding the doc_id -> image_url mapping
+docs_cache = {}
+pagerank_dict = {}
+vi_stopwords = None
+class SearchRequest(BaseModel):
+    # Request body accepted by POST /search.
+    query: str  # raw user query; preprocessed inside bm25_search
+    model: str = "bm25"  # NOTE(review): not read by the /search handler in this file
+    limit: int = 100  # maximum number of hits, forwarded to bm25_search as top_k
+    B: float = 0.75  # forwarded to BM25F(B=...)
+    K1: float = 1.2  # forwarded to BM25F(K1=...)
+    title_boost: float = 1.5  # field boost for "title" in the query parser
+    content_boost: float = 1.0  # field boost for "content" in the query parser
+class SearchResult(BaseModel):
+    # One hit returned by POST /search.
+    doc_id: str
+    title: str
+    url: str
+    snippet: str  # excerpt produced by get_snippet
+    score: float  # raw search score, rounded by the handler
+    relevance_percentage: float  # score relative to the best hit, 0-100 (see normalize_scores)
+    image_url: Optional[str] = None  # looked up via get_image_url
+    pagerank_score: Optional[float] = None  # taken from pagerank_dict
+    # The remaining optional fields are copied from the metadata CSV row of the
+    # document and are None when the value is missing.
+    total_words: Optional[int] = None
+    unique_words: Optional[int] = None
+    top_words: Optional[str] = None
+    top_tfidf: Optional[str] = None
+    avg_tfidf: Optional[float] = None
+    final_score: Optional[float] = None
+def split_sentences(text):
+    """Split *text* into sentences by delegating to NLTK's ``sent_tokenize``."""
+    return sent_tokenize(text)
+def tokenize_vi_sentence_level(text: str) -> list[str]:
+    sentences = sent_tokenize(text)
+    tokens = []
+    for sent in sentences:
+        sent = sent.strip()
+        if not sent:
+            continue
+        sent_tokens = ViTokenizer.tokenize(sent)
+        tokens.extend(sent_tokens.split())
+    return tokens
+VI_TOKEN_REGEX = re.compile(
+    r"[a-zàáạảãâầấậẩẫăằắặẳẵ"
+    r"èéẹẻẽêềếệểễ"
+    r"ìíịỉĩ"
+    r"òóọỏõôồốộổỗơờớợởỡ"
+    r"ùúụủũưừứựửữ"
+    r"ỳýỵỷỹđ0-9_]+$"
+)
+def is_valid_vi_token(token: str) -> bool:
+    return bool(VI_TOKEN_REGEX.fullmatch(token))
+def load_stopwords(path):
+    with open(path, "r", encoding="utf-8") as f:
+        stopwords = set(line.strip().lower() for line in f if line.strip())
+    return stopwords
+def clean_text(text):
+    if text is None:
+        return ""
+    text = unicodedata.normalize("NFC", text)
+    text = re.sub(r"http\S+|www\S+", "", text)
+    text = re.sub(r"[.,!?]+", " ", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+def preprocess_query(query: str, stopwords: set[str] | None = None) -> str:
+    query = clean_text(query)
+    tokens = tokenize_vi_sentence_level(query)
+    processed_tokens = []
+    for tok in tokens:
+        tok = tok.lower()
+        if not is_valid_vi_token(tok):
+            continue
+        if tok.isnumeric():
+            continue
+        if stopwords and tok in stopwords:
+            continue
+        processed_tokens.append(tok)
+    return " ".join(processed_tokens)
+def load_pagerank(meta_csv: str) -> Dict[str, float]:
+    """Load PageRank scores from CSV"""
+    try:
+        df = pd.read_csv(meta_csv)
+        if 'pagerank' in df.columns and 'id' in df.columns:
+            return dict(zip(df['id'].astype(str), df['pagerank']))
+        return {}
+    except Exception as e:
+        print(f"Warning: Could not load PageRank scores: {e}")
+        return {}
+def load_images_csv(image_csv: str) -> pd.DataFrame:
+    """Load image URLs from CSV file"""
+    try:
+        if not os.path.exists(image_csv):
+            print(f"⚠️ Image CSV not found: {image_csv}")
+            return pd.DataFrame(columns=['doc_id', 'image_url'])
+        df = pd.read_csv(image_csv)
+        # Đảm bảo có cả 2 cột cần thiết
+        if 'doc_id' not in df.columns or 'image_url' not in df.columns:
+            print("⚠️ Image CSV missing required columns: doc_id, image_url")
+            return pd.DataFrame(columns=['doc_id', 'image_url'])
+        # Convert doc_id to string để dễ mapping
+        df['doc_id'] = df['doc_id'].astype(str)
+        # Loại bỏ các dòng có image_url null/empty
+        df = df[df['image_url'].notna() & (df['image_url'] != '')]
+        print(f"✅ Loaded {len(df)} image URLs from CSV")
+        return df
+    except Exception as e:
+        print(f"❌ Error loading image CSV: {e}")
+        return pd.DataFrame(columns=['doc_id', 'image_url'])
+def get_image_url(doc_id: str) -> Optional[str]:
+    """Get image URL for a document from CSV"""
+    global image_df
+    if image_df is None or image_df.empty:
+        return None
+    try:
+        # Tìm image_url theo doc_id
+        result = image_df[image_df['doc_id'] == str(doc_id)]
+        if not result.empty:
+            image_url = result.iloc[0]['image_url']
+            # Kiểm tra URL hợp lệ
+            if pd.notna(image_url) and str(image_url).strip() != '':
+                return str(image_url)
+        return None
+    except Exception as e:
+        print(f"Error getting image URL for doc {doc_id}: {e}")
+        return None
+def load_document_content(doc_id: str) -> str:
+    """Load document content from data_clean directory"""
+    global docs_cache
+    if doc_id in docs_cache:
+        return docs_cache[doc_id]
+    try:
+        file_path = os.path.join(DATA_CLEAN_DIR, f"{doc_id}.txt")
+        if not os.path.exists(file_path):
+            print(f"Warning: File not found: {file_path}")
+            return ""
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        docs_cache[doc_id] = content
+        return content
+    except Exception as e:
+        print(f"Error loading document {doc_id}: {e}")
+        return ""
+def get_snippet(doc_id: str, query_terms: List[str], max_length: int = 200) -> str:
+    """Extract relevant snippet from document based on query terms"""
+    content = load_document_content(doc_id)
+    if not content or content.strip() == "":
+        return "Không có nội dung xem trước."
+    try:
+        content_lower = content.lower()
+        query_lower = [term.lower() for term in query_terms if term.strip()]
+        if not query_lower:
+            words = content.split()
+            snippet_words = words[:30]
+            snippet = ' '.join(snippet_words)
+            if len(snippet) > max_length:
+                snippet = snippet[:max_length] + "..."
+            return snippet
+        best_pos = 0
+        max_matches = 0
+        words = content.split()
+        window_size = min(30, len(words))
+        for i in range(max(1, len(words) - window_size + 1)):
+            window = ' '.join(words[i:i+window_size]).lower()
+            matches = sum(1 for term in query_lower if term in window)
+            if matches > max_matches:
+                max_matches = matches
+                best_pos = i
+        snippet_words = words[best_pos:best_pos+window_size]
+        snippet = ' '.join(snippet_words)
+        if len(snippet) > max_length:
+            snippet = snippet[:max_length] + "..."
+        if best_pos > 0:
+            snippet = "..." + snippet
+        return snippet
+    except Exception as e:
+        print(f"Error generating snippet for doc {doc_id}: {e}")
+        return "Lỗi khi tạo đoạn trích."
+def bm25_search(ix, query_str: str, vi_stopwords: set[str] | None = None, top_k: int = 100,
+                B: float = 0.75, K1: float = 1.2,
+                title_boost: float = 1.5, content_boost: float = 1.0) -> Dict[str, float]:
+    """BM25 search with title and content fields"""
+    query_str = preprocess_query(query_str, stopwords=vi_stopwords)
+    results = {}
+    weighting = BM25F(B=B, K1=K1)
+    with ix.searcher(weighting=weighting) as searcher:
+        field_boosts = {
+            "title": title_boost,
+            "content": content_boost
+        }
+        parser = MultifieldParser(
+            ["title", "content"],
+            schema=ix.schema,
+            fieldboosts=field_boosts,
+            group=qparser.OrGroup
+        )
+        q = parser.parse(query_str)
+        hits = searcher.search(q, limit=top_k)
+        for hit in hits:
+            results[str(hit["docid"])] = float(hit.score)
+    return results
+def normalize_scores(scores: Dict[str, float]) -> Dict[str, float]:
+    """Normalize scores to percentage (0-100)"""
+    if not scores:
+        return {}
+    max_score = max(scores.values())
+    if max_score == 0:
+        return {k: 0.0 for k in scores}
+    return {k: (v / max_score) * 100 for k, v in scores.items()}
+@app.on_event("startup")
+async def startup_event():
+    """Initialize index and load data on startup.
+    Assigns the module globals ix, meta_df, image_df, pagerank_dict and
+    vi_stopwords. Missing resources are reported with a printed message and
+    leave the corresponding global unset; any exception raised while loading
+    is printed and re-raised.
+    """
+    global ix, meta_df, image_df, pagerank_dict, vi_stopwords
+    try:
+        # Load Whoosh index
+        if os.path.exists(INDEX_DIR):
+            ix = open_dir(INDEX_DIR)
+            print("✅ Loaded Whoosh index")
+        else:
+            print("❌ Index directory not found:", INDEX_DIR)
+        # Load metadata
+        if os.path.exists(META_CSV):
+            meta_df = pd.read_csv(META_CSV)
+            print(f"✅ Loaded {len(meta_df)} documents metadata")
+        else:
+            print("❌ Metadata CSV not found:", META_CSV)
+        # Load image CSV (load_images_csv returns an empty frame on any problem)
+        image_df = load_images_csv(IMAGE_CSV)
+        # Check data_clean directory (informational only; files are read lazily)
+        if os.path.exists(DATA_CLEAN_DIR):
+            num_files = len([f for f in os.listdir(DATA_CLEAN_DIR) if f.endswith('.txt')])
+            print(f"✅ Found {num_files} text files in {DATA_CLEAN_DIR}")
+        else:
+            print("❌ Data clean directory not found:", DATA_CLEAN_DIR)
+        # Load PageRank scores (reads META_CSV a second time via load_pagerank)
+        pagerank_dict = load_pagerank(META_CSV)
+        print(f"✅ Loaded PageRank scores for {len(pagerank_dict)} documents")
+        # Load StopWords; fall back to an empty set when the file is absent
+        if os.path.exists(STOPWORDS_PATH):
+            vi_stopwords = load_stopwords(STOPWORDS_PATH)
+            print(f"✅ Loaded {len(vi_stopwords)} Vietnamese stopwords")
+        else:
+            print("⚠️ Stopwords file not found, continuing without stopwords")
+            vi_stopwords = set()
+    except Exception as e:
+        print(f"❌ Error during startup: {e}")
+        raise
+@app.get("/")
+async def root():
+    """Health check endpoint"""
+    return {
+        "status": "ok",
+        "message": "Document Search API is running",
+        "total_documents": len(meta_df) if meta_df is not None else 0,
+        "total_images": len(image_df) if image_df is not None else 0,
+        "data_clean_dir": DATA_CLEAN_DIR,
+        "image_csv": IMAGE_CSV,
+        "index_dir": INDEX_DIR
+    }
+@app.post("/search", response_model=List[SearchResult])
+async def search(request: SearchRequest):
+    """Search documents using BM25 algorithm"""
+    if not request.query.strip():
+        raise HTTPException(status_code=400, detail="Query cannot be empty")
+    if ix is None or meta_df is None:
+        raise HTTPException(status_code=503, detail="Search index not initialized")
+    try:
+        raw_scores = bm25_search(
+            ix,
+            request.query,
+            vi_stopwords=vi_stopwords,
+            top_k=request.limit,
+            B=request.B,
+            K1=request.K1,
+            title_boost=request.title_boost,
+            content_boost=request.content_boost
+        )
+        if not raw_scores:
+            return []
+        normalized_scores = normalize_scores(raw_scores)
+        query_terms = request.query.split()
+        results = []
+        for doc_id, score in raw_scores.items():
+            doc_row = meta_df[meta_df['id'].astype(str) == doc_id]
+            if doc_row.empty:
+                continue
+            doc_info = doc_row.iloc[0]
+            snippet = get_snippet(doc_id, query_terms, max_length=300)
+            pr_score = pagerank_dict.get(doc_id)
+            # Lấy image_url từ CSV
+            image_url = get_image_url(doc_id)
+            result = SearchResult(
+                doc_id=doc_id,
+                title=str(doc_info.get('title', 'Untitled')),
+                url=str(doc_info.get('url', '')),
+                snippet=snippet,
+                score=round(score, 4),
+                relevance_percentage=round(normalized_scores[doc_id], 2),
+                image_url=image_url,
+                pagerank_score=round(pr_score, 6) if pr_score else None,
+                total_words=int(doc_info.get('total_words', 0)) if pd.notna(doc_info.get('total_words')) else None,
+                unique_words=int(doc_info.get('unique_words', 0)) if pd.notna(doc_info.get('unique_words')) else None,
+                top_words=str(doc_info.get('top_words', '')) if pd.notna(doc_info.get('top_words')) else None,
+                top_tfidf=str(doc_info.get('top_tfidf', '')) if pd.notna(doc_info.get('top_tfidf')) else None,
+                avg_tfidf=round(float(doc_info.get('avg_tfidf', 0)), 6) if pd.notna(doc_info.get('avg_tfidf')) else None,
+                final_score=round(float(doc_info.get('final_score', 0)), 6) if pd.notna(doc_info.get('final_score')) else None
+            )
+            results.append(result)
+        results.sort(key=lambda x: x.score, reverse=True)
+        return results[:request.limit]
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
+@app.get("/stats")
+async def get_stats():
+    """Get statistics about the search index"""
+    if meta_df is None:
+        raise HTTPException(status_code=503, detail="Index not initialized")
+    num_cached_docs = len(docs_cache)
+    num_txt_files = 0
+    if os.path.exists(DATA_CLEAN_DIR):
+        num_txt_files = len([f for f in os.listdir(DATA_CLEAN_DIR) if f.endswith('.txt')])
+    return {
+        "total_documents": len(meta_df),
+        "total_images": len(image_df) if image_df is not None else 0,
+        "text_files_available": num_txt_files,
+        "cached_documents": num_cached_docs,
+        "pagerank_scores": len(pagerank_dict),
+        "index_directory": INDEX_DIR,
+        "data_clean_directory": DATA_CLEAN_DIR,
+        "image_csv": IMAGE_CSV
+    }
+@app.get("/document/{doc_id}")
+async def get_document(doc_id: str):
+    """Get full document content and metadata"""
+    if meta_df is None:
+        raise HTTPException(status_code=503, detail="Index not initialized")
+    doc_row = meta_df[meta_df['id'].astype(str) == doc_id]
+    if doc_row.empty:
+        raise HTTPException(status_code=404, detail="Document not found")
+    doc_info = doc_row.iloc[0]
+    content = load_document_content(doc_id)
+    image_url = get_image_url(doc_id)
+    return {
+        "doc_id": doc_id,
+        "title": str(doc_info.get('title', 'Untitled')),
+        "url": str(doc_info.get('url', '')),
+        "content": content,
+        "image_url": image_url,
+        "pagerank": float(doc_info.get('pagerank', 0)) if pd.notna(doc_info.get('pagerank')) else None,
+        "total_words": int(doc_info.get('total_words', 0)) if pd.notna(doc_info.get('total_words')) else None,
+        "unique_words": int(doc_info.get('unique_words', 0)) if pd.notna(doc_info.get('unique_words')) else None,
+        "top_words": str(doc_info.get('top_words', '')) if pd.notna(doc_info.get('top_words')) else None,
+        "top_tfidf": str(doc_info.get('top_tfidf', '')) if pd.notna(doc_info.get('top_tfidf')) else None,
+        "avg_tfidf": float(doc_info.get('avg_tfidf', 0)) if pd.notna(doc_info.get('avg_tfidf')) else None,
+        "final_score": float(doc_info.get('final_score', 0)) if pd.notna(doc_info.get('final_score')) else None
+    }
+# Direct-execution entry point: serves the app with uvicorn on port 8000
+# (the Dockerfile CMD in this commit starts uvicorn on port 7860 instead).
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt ADDED Viewed

	@@ -0,0 +1,123 @@

+alembic==1.17.2
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.12.0
+asttokens==3.0.0
+attrs==25.4.0
+bcrypt==5.0.0
+beautifulsoup4==4.14.2
+certifi==2025.10.5
+cffi==2.0.0
+charset-normalizer==3.4.4
+click==8.3.0
+colorama==0.4.6
+colorlog==6.10.1
+comm==0.2.3
+contourpy==1.3.3
+cryptography==46.0.3
+cycler==0.12.1
+datasketch==1.6.5
+debugpy==1.8.17
+decorator==5.2.1
+ecdsa==0.19.1
+emoji==2.15.0
+executing==2.2.1
+fastapi==0.128.0
+filelock==3.20.0
+fonttools==4.61.1
+fsspec==2025.9.0
+gensim==4.4.0
+greenlet==3.3.0
+h11==0.16.0
+huggingface-hub==0.36.0
+idna==3.11
+ipykernel==7.0.1
+ipython==9.6.0
+ipython_pygments_lexers==1.1.1
+jedi==0.19.2
+Jinja2==3.1.6
+joblib==1.5.2
+jupyter_client==8.6.3
+jupyter_core==5.9.1
+kiwisolver==1.4.9
+Mako==1.3.10
+MarkupSafe==3.0.3
+matplotlib==3.10.8
+matplotlib-inline==0.1.7
+mpmath==1.3.0
+nest-asyncio==1.6.0
+networkx==3.5
+nltk==3.9.2
+numpy==2.3.4
+optuna==4.6.0
+outcome==1.3.0.post0
+packaging==25.0
+pandas==2.3.3
+parso==0.8.5
+passlib==1.7.4
+pillow==12.0.0
+platformdirs==4.5.0
+prompt_toolkit==3.0.52
+protobuf==6.33.0
+psutil==7.1.1
+psycopg2-binary==2.9.11
+pure_eval==0.2.3
+py_vncorenlp==0.1.4
+pyasn1==0.6.1
+pycparser==2.23
+pydantic==2.12.5
+pydantic_core==2.41.5
+Pygments==2.19.2
+pyjnius==1.7.0
+pyparsing==3.2.5
+PySocks==1.7.1
+python-crfsuite==0.9.11
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.1
+python-jose==3.5.0
+pytrec_eval-terrier==0.5.10
+pytz==2025.2
+pyvi==0.1.1
+PyYAML==6.0.3
+pyzmq==27.1.0
+regex==2025.10.23
+requests==2.32.5
+rsa==4.9.1
+safetensors==0.7.0
+scikit-learn==1.7.2
+scipy==1.16.2
+selenium==4.37.0
+sentence-transformers==5.2.0
+setuptools==80.9.0
+six==1.17.0
+sklearn-crfsuite==0.5.0
+smart_open==7.5.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.8
+SQLAlchemy==2.0.45
+stack-data==0.6.3
+stanza==1.11.0
+starlette==0.50.0
+sympy==1.14.0
+tabulate==0.9.0
+threadpoolctl==3.6.0
+tokenizers==0.22.1
+torch==2.9.0
+tornado==6.5.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.57.3
+trio==0.31.0
+trio-websocket==0.12.2
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+tzdata==2025.2
+urllib3==2.5.0
+uvicorn==0.40.0
+wcwidth==0.2.14
+websocket-client==1.9.0
+Whoosh==2.7.4
+wordcloud==1.9.5
+wrapt==2.0.1
+wsproto==1.2.0

retrieval/data/0.html ADDED Viewed