Commit 049099a · Parent(s): de1718e

Initial deployment
- Dockerfile +33 -0
- app/__init__.py +0 -0
- app/data/descriptive_keywords.json +0 -0
- app/main.py +54 -0
- app/src/__init__.py +0 -0
- app/src/cross_encoder.py +47 -0
- app/src/embeddings.py +73 -0
- app/src/heuristics.py +87 -0
- app/src/linguistic.py +167 -0
- app/src/main.py +57 -0
- requirements.txt +10 -0
Dockerfile
ADDED
@@ -0,0 +1,33 @@
+# Use official Python slim image
+FROM python:3.10-slim
+
+# ---- Set cache directories FIRST ----
+# All model downloads will go to /tmp/.cache (writable runtime disk)
+ENV HF_HOME=/tmp/.cache/huggingface \
+    TRANSFORMERS_CACHE=/tmp/.cache/huggingface \
+    SENTENCE_TRANSFORMERS_HOME=/tmp/.cache/sentence-transformers \
+    SPACY_DATA=/tmp/.cache/spacy \
+    NLTK_DATA=/tmp/.cache/nltk
+
+# ---- Install system dependencies (if any) ----
+# (none required for this project, but keep if needed)
+
+# ---- Set working directory ----
+WORKDIR /app
+
+# ---- Copy requirements first (for Docker layer caching) ----
+COPY requirements.txt .
+
+# ---- Install Python dependencies ----
+# This does NOT download any models (spacy model is NOT downloaded here)
+RUN pip install --no-cache-dir -r requirements.txt
+
+# ---- Copy the rest of the application ----
+COPY . .
+
+# ---- Expose the port Hugging Face Spaces expects ----
+EXPOSE 7860
+
+# ---- Start the FastAPI server ----
+# Models will be downloaded automatically on first request to /tmp/.cache
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
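Note that the Hugging Face and sentence-transformers models do fetch themselves into the cache directories above, but spacy.load("en_core_web_sm") and nltk.sent_tokenize (used in app/src/linguistic.py and app/src/cross_encoder.py) only work if their data is already present. A minimal startup sketch that could close that gap, assuming it runs before the analyzer is imported (the helper name ensure_runtime_models is made up and not part of this commit):

# Hypothetical helper, not part of this commit: fetch the NLTK tokenizer data
# and the spaCy model at container start-up so the first /analyze call works.
import subprocess
import sys

import nltk
import spacy

def ensure_runtime_models():
    # nltk.sent_tokenize needs the "punkt" tokenizer; it lands in NLTK_DATA.
    nltk.download("punkt", quiet=True)
    # spacy.load raises OSError when en_core_web_sm is missing.
    try:
        spacy.load("en_core_web_sm")
    except OSError:
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=True,
        )

ensure_runtime_models()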
app/__init__.py
ADDED
File without changes
app/data/descriptive_keywords.json
ADDED
File without changes
app/main.py
ADDED
@@ -0,0 +1,54 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import Optional
+import os
+
+# Ensure models are cached to the runtime disk (important for Hugging Face Spaces)
+os.environ["HF_HOME"] = "/tmp/huggingface"
+
+# Import your analyzer
+from app.src.main import TrademarkAnalyzer
+
+app = FastAPI(title="Trademark Descriptiveness API")
+
+# Check that the data file exists (optional, but helpful)
+data_path = "app/data/descriptive_keywords.json"
+if not os.path.exists(data_path):
+    print(f"Warning: Data file not found at {data_path}. Keyword overlap will be disabled.")
+
+# Initialize analyzer
+analyzer = TrademarkAnalyzer(descriptive_keywords_path=data_path)
+
+class AnalyzeRequest(BaseModel):
+    mark: str
+    goods: str
+    goods_class: Optional[str] = None
+
+class AnalyzeResponse(BaseModel):
+    descriptive_score: float
+    generic_score: float
+    reasons: list[str]
+    explanation: str
+    details: dict
+
+@app.get("/")
+def read_root():
+    return {"message": "Trademark API is running"}
+
+@app.get("/health")
+def health_check():
+    return {"status": "ok"}
+
+@app.post("/analyze", response_model=AnalyzeResponse)
+def analyze(request: AnalyzeRequest):
+    try:
+        result = analyzer.analyze(
+            mark=request.mark,
+            goods=request.goods,
+            goods_class=request.goods_class
+        )
+        return AnalyzeResponse(**result)
+    except Exception as e:
+        # Log the error (optional)
+        print(f"Error during analysis: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
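A quick way to exercise the /analyze endpoint above, sketched with the requests library (not pinned in requirements.txt); the host/port assumes the uvicorn command from the Dockerfile, and the payload values are invented examples:

# Illustrative request only; mark, goods and class are made up.
import requests

resp = requests.post(
    "http://localhost:7860/analyze",
    json={
        "mark": "Creamy Delight",
        "goods": "Dairy-based desserts, namely ice cream and frozen yogurt.",
        "goods_class": "30",
    },
)
resp.raise_for_status()
body = resp.json()
print(body["descriptive_score"], body["generic_score"], body["explanation"])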
app/src/__init__.py
ADDED
File without changes
app/src/cross_encoder.py
ADDED
@@ -0,0 +1,47 @@
+from sentence_transformers import CrossEncoder
+from nltk import sent_tokenize
+import numpy as np
+
+class CrossEncoderSimilarity:
+    """
+    Uses a cross-encoder to compute deep semantic similarity between mark and goods.
+    Supports sentence-level segmentation and returns attention weights for explainability.
+    """
+
+    def __init__(self, model_name='cross-encoder/stsb-roberta-large'):
+        self.model = CrossEncoder(model_name, num_labels=1)  # regression output
+        # We'll store the last attention scores if needed (for explainability)
+        self.last_attention = None
+
+    def similarity(self, mark, goods, return_segments=False):
+        """
+        Returns a score between 0 and 1. If return_segments=True, also returns
+        the maximum segment score and the segment text.
+        """
+        if not goods:
+            return 0.0 if not return_segments else (0.0, None)
+        sentences = sent_tokenize(goods)
+        if not sentences:
+            return 0.0 if not return_segments else (0.0, None)
+
+        pairs = [(mark, sent) for sent in sentences]
+        scores = self.model.predict(pairs)
+        # Normalize: assume model output range roughly 0-5 (for stsb models)
+        # If using a different model, adjust normalization accordingly.
+        scores_norm = [min(1.0, max(0.0, s / 5.0)) for s in scores]
+        max_score = max(scores_norm)
+        max_idx = int(np.argmax(scores_norm))
+
+        if return_segments:
+            return max_score, sentences[max_idx]
+        return max_score
+
+    def similarity_with_explanation(self, mark, goods):
+        """
+        Returns score and the most relevant sentence from goods, plus optionally attention.
+        For attention, we'd need a model that returns cross-attention; not all do.
+        This method provides a simple explanation.
+        """
+        max_score, best_sentence = self.similarity(mark, goods, return_segments=True)
+        explanation = f"Highest similarity with segment: '{best_sentence}' (score: {max_score:.2f})"
+        return max_score, explanation
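A small usage sketch for the class above; the mark and goods strings are invented, and constructing the class downloads the cross-encoder/stsb-roberta-large weights on first use:

# Illustrative only; example inputs are made up.
from app.src.cross_encoder import CrossEncoderSimilarity

ce = CrossEncoderSimilarity()
score, explanation = ce.similarity_with_explanation(
    "FreshBake",
    "Bread, rolls and pastries. Bakery goods delivered fresh daily.",
)
print(score, explanation)  # higher score => the mark reads more like the goods text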
app/src/embeddings.py
ADDED
@@ -0,0 +1,73 @@
+import numpy as np
+from sentence_transformers import SentenceTransformer, util
+from nltk import sent_tokenize
+
+class EmbeddingSimilarity:
+    """
+    Uses a sentence-transformer model to compute semantic similarity between
+    the mark and a list of descriptive terms, and also between mark and goods.
+    """
+
+    def __init__(self, model_name='all-MiniLM-L6-v2'):
+        self.model = SentenceTransformer(model_name)
+        # Cache for pre-computed class centroids (optional)
+        self.class_centroids = {}
+
+    def encode(self, text):
+        """Return embedding for a single text."""
+        return self.model.encode(text, convert_to_tensor=True)
+
+    def similarity(self, emb1, emb2):
+        """Cosine similarity between two embeddings."""
+        return float(util.cos_sim(emb1, emb2)[0][0])
+
+    def max_similarity_to_terms(self, mark, descriptive_terms):
+        """
+        Compute the maximum cosine similarity between the mark embedding
+        and each individual descriptive term's embedding.
+        """
+        if not descriptive_terms:
+            return 0.0
+        mark_emb = self.encode(mark)
+        term_embs = self.encode(descriptive_terms)
+        sims = util.cos_sim(mark_emb, term_embs)[0]
+        return float(sims.max())
+
+    def similarity_to_class_centroid(self, mark, class_terms):
+        """
+        Pre-compute centroid for a class (average of all term embeddings)
+        and compare mark against it. (Useful for speed when class_terms are static.)
+        """
+        if not class_terms:
+            return 0.0
+        # Create a key for the class (e.g., tuple of terms sorted)
+        # For simplicity, we'll just compute on the fly; you can cache.
+        term_embs = self.encode(class_terms)
+        centroid = term_embs.mean(axis=0)
+        mark_emb = self.encode(mark)
+        return self.similarity(mark_emb, centroid)
+
+    def similarity_to_goods(self, mark, goods):
+        """
+        Compute similarity between mark and goods using the bi-encoder.
+        This is a fast alternative to the cross-encoder.
+        """
+        if not goods:
+            return 0.0
+        mark_emb = self.encode(mark)
+        goods_emb = self.encode(goods)
+        return self.similarity(mark_emb, goods_emb)
+
+    def similarity_to_goods_segments(self, mark, goods):
+        """
+        Split goods into sentences and take the maximum similarity.
+        """
+        if not goods:
+            return 0.0
+        sentences = sent_tokenize(goods)
+        if not sentences:
+            return 0.0
+        mark_emb = self.encode(mark)
+        sent_embs = self.encode(sentences)
+        sims = util.cos_sim(mark_emb, sent_embs)[0]
+        return float(sims.max())
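A short usage sketch of the bi-encoder path; the example strings are invented, and the first call downloads all-MiniLM-L6-v2 into the cache directories set in the Dockerfile:

# Illustrative only; example inputs are made up.
from app.src.embeddings import EmbeddingSimilarity

emb = EmbeddingSimilarity()
print(emb.max_similarity_to_terms("Creamy", ["creamy", "fresh", "organic"]))
print(emb.similarity_to_goods_segments(
    "Creamy", "Dairy-based spreads. Ice cream and frozen yogurt."
))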
app/src/heuristics.py
ADDED
@@ -0,0 +1,87 @@
+import numpy as np
+
+class DescriptivenessHeuristic:
+    """
+    Combines outputs from linguistic, embedding, and cross-encoder modules
+    to produce final descriptiveness and genericness scores.
+    """
+
+    def __init__(self, ling_analyzer, emb_similarity, cross_encoder, weights=None):
+        self.ling = ling_analyzer
+        self.emb = emb_similarity
+        self.cross = cross_encoder
+        # Default weights - can be tuned via validation
+        self.weights = weights or {
+            'linguistic': 0.25,
+            'embedding_max_term': 0.25,
+            'embedding_goods': 0.20,
+            'cross_encoder': 0.30
+        }
+
+    def assess(self, mark, goods, goods_class=None, descriptive_terms=None):
+        """
+        Returns a dict with scores and reasons.
+        """
+        # 1. Linguistic features
+        ling_feat = self.ling.analyze(mark, goods, goods_class)
+
+        # Construct a linguistic score (example: weighted combination)
+        ling_score = (
+            (0.2 if ling_feat['pos']['adjective_count'] > 0 else 0) +
+            0.3 * ling_feat['dictionary_word_ratio'] +
+            0.2 * ling_feat['descriptive_keyword_overlap'] +
+            0.2 * ling_feat['ngram_overlap_with_goods'] +
+            (0.1 if ling_feat['has_descriptive_suffix'] else 0)
+        )
+        ling_score = min(1.0, ling_score)
+
+        # 2. Embedding similarity to descriptive terms (if provided)
+        emb_term_score = 0.0
+        if descriptive_terms:
+            emb_term_score = self.emb.max_similarity_to_terms(mark, descriptive_terms)
+
+        # 3. Embedding similarity to goods (bi-encoder)
+        emb_goods_score = self.emb.similarity_to_goods_segments(mark, goods)
+
+        # 4. Cross-encoder score
+        cross_score = self.cross.similarity(mark, goods)
+
+        # Weighted combination
+        descriptive_score = (
+            self.weights['linguistic'] * ling_score +
+            self.weights['embedding_max_term'] * emb_term_score +
+            self.weights['embedding_goods'] * emb_goods_score +
+            self.weights['cross_encoder'] * cross_score
+        )
+
+        # Genericness detection (simplified)
+        generic_score = 0.0
+        reasons = []
+
+        # If the mark is a dictionary word and highly similar to goods, could be generic
+        if ling_feat['dictionary_word_ratio'] > 0.8 and cross_score > 0.7:
+            generic_score = 0.8
+            reasons.append("High similarity to goods and common word - potential genericness")
+        elif ling_feat['dictionary_word_ratio'] > 0.9:
+            generic_score = 0.4
+            reasons.append("All words are common dictionary terms")
+
+        # If mark is a hyponym of a goods category? (could be added with WordNet)
+
+        # Build explanation
+        explanation = f"Descriptiveness score: {descriptive_score:.2f}. "
+        if reasons:
+            explanation += "Reasons: " + "; ".join(reasons)
+
+        return {
+            'descriptive_score': round(descriptive_score, 2),
+            'generic_score': round(generic_score, 2),
+            'reasons': reasons,
+            'explanation': explanation,
+            'details': {
+                'linguistic': ling_feat,
+                'embedding_term': emb_term_score,
+                'embedding_goods': emb_goods_score,
+                'cross_encoder': cross_score
+            }
+        }
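To make the default weighting concrete, here is a tiny worked example using invented component scores (none of these numbers come from the commit):

# Invented component scores, shown only to illustrate the default weighting.
weights = {'linguistic': 0.25, 'embedding_max_term': 0.25,
           'embedding_goods': 0.20, 'cross_encoder': 0.30}
scores = {'linguistic': 0.6, 'embedding_max_term': 0.7,
          'embedding_goods': 0.5, 'cross_encoder': 0.8}
descriptive = sum(weights[k] * scores[k] for k in weights)
print(f"{descriptive:.3f}")  # 0.15 + 0.175 + 0.10 + 0.24 = 0.665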
app/src/linguistic.py
ADDED
@@ -0,0 +1,167 @@
+import spacy
+import json
+import os
+import math
+from collections import Counter
+from nltk import word_tokenize
+from nltk.corpus import wordnet
+from nltk.corpus.reader.wordnet import NOUN, ADJ, ADV, VERB
+
+# Load spaCy model (download if not present: python -m spacy download en_core_web_sm)
+nlp = spacy.load("en_core_web_sm")
+
+# Optional: load word frequency data (e.g., SUBTLEX frequency file)
+# If not available, we use a simple fallback (all words equally frequent).
+FREQ_DICT = {}
+FREQ_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'word_freq.json')
+if os.path.exists(FREQ_PATH):
+    with open(FREQ_PATH, 'r') as f:
+        FREQ_DICT = json.load(f)
+
+class LinguisticAnalyzer:
+    """
+    Extracts rich linguistic features from a trademark string.
+    Features include POS tags, dependency relations, dictionary membership,
+    word frequency, n-gram overlap with goods description, and named entities.
+    """
+
+    def __init__(self, descriptive_keywords_path=None):
+        self.descriptive_keywords = {}
+        if descriptive_keywords_path and os.path.exists(descriptive_keywords_path):
+            with open(descriptive_keywords_path, 'r', encoding='utf-8') as f:
+                self.descriptive_keywords = json.load(f)  # e.g., {"class_030": ["fresh", "creamy"]}
+
+        # List of common descriptive suffixes (e.g., -y, -er, -ing)
+        self.descriptive_suffixes = ('y', 'er', 'ing', 'ive', 'ous', 'al', 'ic')
+
+    def pos_tags(self, text):
+        """Return list of (token, POS, detailed tag) using spaCy."""
+        doc = nlp(text)
+        return [(token.text, token.pos_, token.tag_) for token in doc]
+
+    def dependency_relations(self, text):
+        """Extract adjective-noun and other modifier relations."""
+        doc = nlp(text)
+        modifiers = []
+        for token in doc:
+            # amod: adjectival modifier, nmod: nominal modifier
+            if token.dep_ in ('amod', 'nmod') and token.head.pos_ in ('NOUN', 'PROPN'):
+                modifiers.append((token.text, token.head.text, token.dep_))
+        return modifiers
+
+    def is_dictionary_word(self, word):
+        """Check if word exists in WordNet."""
+        return bool(wordnet.synsets(word))
+
+    def word_frequency(self, word):
+        """
+        Return log frequency of word (if available). Higher = more common.
+        Defaults to 0 if not in frequency dictionary.
+        """
+        return FREQ_DICT.get(word.lower(), 0)
+
+    def extract_ngrams(self, text, n=2, use_words=True):
+        """Generate word n-grams or character n-grams."""
+        if use_words:
+            words = word_tokenize(text.lower())
+            ngrams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]
+        else:
+            # character n-grams
+            text_clean = text.lower().replace(' ', '')
+            ngrams = [text_clean[i:i+n] for i in range(len(text_clean)-n+1)]
+        return ngrams
+
+    def ngram_overlap_with_goods(self, mark, goods, n=2):
+        """
+        Compute the fraction of mark word n-grams that appear verbatim in the goods description.
+        """
+        if not goods:
+            return 0.0
+        mark_ngrams = set(self.extract_ngrams(mark, n=n, use_words=True))
+        goods_ngrams = set(self.extract_ngrams(goods, n=n, use_words=True))
+        if not mark_ngrams:
+            return 0.0
+        overlap = mark_ngrams.intersection(goods_ngrams)
+        return len(overlap) / len(mark_ngrams)
+
+    def descriptive_keyword_overlap(self, mark, goods_class=None):
+        """
+        Return fraction of mark words that appear (as lemmas) in the descriptive list for the given class.
+        Uses lemmatization to catch inflected forms.
+        """
+        if not self.descriptive_keywords or not goods_class:
+            return 0.0
+        # Lemmatize mark words
+        doc = nlp(mark)
+        mark_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}
+        desc_words = set(self.descriptive_keywords.get(goods_class, []))
+        if not mark_lemmas or not desc_words:
+            return 0.0
+        overlap = mark_lemmas.intersection(desc_words)
+        return len(overlap) / len(mark_lemmas)
+
+    def has_descriptive_suffix(self, word):
+        """Check if word ends with a common descriptive suffix."""
+        return any(word.lower().endswith(suf) for suf in self.descriptive_suffixes)
+
+    def extract_entities(self, text):
+        """Return list of named entities (PERSON, ORG, GPE, etc.)."""
+        doc = nlp(text)
+        return [(ent.text, ent.label_) for ent in doc.ents]
+
+    def analyze(self, mark, goods=None, goods_class=None):
+        """
+        Main method: returns a dictionary of linguistic features.
+        """
+        doc = nlp(mark)
+        tokens = [token.text.lower() for token in doc if token.is_alpha]
+        if not tokens:
+            return {'pos': {}, 'dictionary_word_ratio': 0, 'avg_word_freq': 0,
+                    'descriptive_keyword_overlap': 0, 'ngram_overlap_with_goods': 0,
+                    'has_descriptive_suffix': False, 'has_entity': False, 'ngrams': []}
+
+        # POS summary
+        pos_tags = [(token.text, token.pos_, token.tag_) for token in doc]
+        pos_summary = {
+            'adjective_count': sum(1 for _, pos, _ in pos_tags if pos == 'ADJ'),
+            'comparative_count': sum(1 for _, _, tag in pos_tags if tag in ('JJR', 'JJS')),
+            'noun_count': sum(1 for _, pos, _ in pos_tags if pos == 'NOUN'),
+            'verb_count': sum(1 for _, pos, _ in pos_tags if pos == 'VERB')
+        }
+
+        # Dependency modifiers
+        modifiers = self.dependency_relations(mark)
+
+        # Dictionary word ratio
+        dict_word_ratio = sum(1 for w in tokens if self.is_dictionary_word(w)) / len(tokens) if tokens else 0
+
+        # Average word frequency (log)
+        avg_freq = sum(self.word_frequency(w) for w in tokens) / len(tokens) if tokens else 0
+
+        # Overlap with goods n-grams
+        ngram_overlap = self.ngram_overlap_with_goods(mark, goods, n=2) if goods else 0.0
+
+        # Descriptive keyword overlap (lemma-based)
+        desc_overlap = self.descriptive_keyword_overlap(mark, goods_class)
+
+        # Suffix check on the longest word (or any)
+        has_desc_suffix = any(self.has_descriptive_suffix(w) for w in tokens)
+
+        # Named entities
+        entities = self.extract_entities(mark)
+        has_entity = len(entities) > 0
+
+        # Word n-grams for later use
+        ngrams = self.extract_ngrams(mark, n=2, use_words=True)
+
+        return {
+            'pos': pos_summary,
+            'modifiers': modifiers,
+            'dictionary_word_ratio': dict_word_ratio,
+            'avg_word_freq': avg_freq,
+            'descriptive_keyword_overlap': desc_overlap,
+            'ngram_overlap_with_goods': ngram_overlap,
+            'has_descriptive_suffix': has_desc_suffix,
+            'has_entity': has_entity,
+            'ngrams': ngrams
+        }
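A usage sketch with invented inputs. Note that descriptive_keyword_overlap looks the class up by the raw goods_class string, so when calling this class directly the key has to match the JSON (the code comments use keys like "class_030"):

# Illustrative only; the mark, goods and class key are made up.
from app.src.linguistic import LinguisticAnalyzer

la = LinguisticAnalyzer("app/data/descriptive_keywords.json")
feats = la.analyze(
    "Creamy Fresh",
    goods="Dairy-based spreads, creams and yogurts",
    goods_class="class_030",  # key must match the JSON used by descriptive_keyword_overlap
)
print(feats["pos"], feats["dictionary_word_ratio"], feats["ngram_overlap_with_goods"])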
app/src/main.py
ADDED
@@ -0,0 +1,57 @@
+import os
+from .linguistic import LinguisticAnalyzer
+from .embeddings import EmbeddingSimilarity
+from .cross_encoder import CrossEncoderSimilarity
+from .heuristics import DescriptivenessHeuristic
+
+class TrademarkAnalyzer:
+    """
+    High-level API for trademark descriptiveness analysis.
+    Initializes all sub-modules and provides a unified analyze() method.
+    """
+
+    def __init__(self, descriptive_keywords_path=None):
+        """
+        Args:
+            descriptive_keywords_path: Path to JSON file with class-specific descriptive terms.
+        """
+        # Ensure models are cached in the runtime disk (if not already set)
+        if "HF_HOME" not in os.environ:
+            os.environ["HF_HOME"] = "/tmp/huggingface"
+
+        # Initialize sub-modules
+        self.linguistic = LinguisticAnalyzer(descriptive_keywords_path)
+        self.embedding = EmbeddingSimilarity()  # uses sentence-transformers
+        self.cross_encoder = CrossEncoderSimilarity()
+        self.heuristic = DescriptivenessHeuristic(
+            self.linguistic,
+            self.embedding,
+            self.cross_encoder
+        )
+
+    def analyze(self, mark, goods, goods_class=None):
+        """
+        Perform full descriptiveness analysis.
+
+        Args:
+            mark (str): The trademark text.
+            goods (str): Description of goods/services.
+            goods_class (str, optional): USPTO class (e.g., "30").
+
+        Returns:
+            dict: Contains descriptive_score, generic_score, reasons, explanation, details.
+        """
+        # Load descriptive terms for the class (if any)
+        descriptive_terms = None
+        if goods_class and self.linguistic.descriptive_keywords:
+            class_key = f"class_{goods_class.zfill(3)}"  # e.g., class_030
+            descriptive_terms = self.linguistic.descriptive_keywords.get(class_key, [])
+
+        # Run the heuristic assessment
+        result = self.heuristic.assess(
+            mark=mark,
+            goods=goods,
+            goods_class=goods_class,
+            descriptive_terms=descriptive_terms
+        )
+        return result
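And the end-to-end entry point, sketched with invented inputs; the three model-backed sub-modules are loaded eagerly in __init__, so the first construction is slow:

# Illustrative only; mark, goods and class are made up.
from app.src.main import TrademarkAnalyzer

analyzer = TrademarkAnalyzer(descriptive_keywords_path="app/data/descriptive_keywords.json")
result = analyzer.analyze(
    mark="Creamy Delight",
    goods="Dairy-based desserts, namely ice cream and frozen yogurt",
    goods_class="30",
)
print(result["descriptive_score"], result["generic_score"], result["reasons"])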
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+fastapi==0.115.0
+uvicorn==0.30.0
+spacy==3.7.2
+nltk==3.8.1
+sentence-transformers==3.0.1
+transformers==4.41.0
+torch==2.3.0
+numpy==1.24.3
+huggingface-hub==0.23.0
+pydantic==2.7.0