Commit 049099a · Parent(s): de1718e

Initial deployment
- Dockerfile +33 -0
- app/__init__.py +0 -0
- app/data/descriptive_keywords.json +0 -0
- app/main.py +54 -0
- app/src/__init__.py +0 -0
- app/src/cross_encoder.py +47 -0
- app/src/embeddings.py +73 -0
- app/src/heuristics.py +87 -0
- app/src/linguistic.py +167 -0
- app/src/main.py +57 -0
- requirements.txt +10 -0
Dockerfile
ADDED
@@ -0,0 +1,33 @@
+# Use official Python slim image
+FROM python:3.10-slim
+
+# ---- Set cache directories FIRST ----
+# All model downloads will go to /tmp/.cache (writable runtime disk)
+ENV HF_HOME=/tmp/.cache/huggingface \
+    TRANSFORMERS_CACHE=/tmp/.cache/huggingface \
+    SENTENCE_TRANSFORMERS_HOME=/tmp/.cache/sentence-transformers \
+    SPACY_DATA=/tmp/.cache/spacy \
+    NLTK_DATA=/tmp/.cache/nltk
+
+# ---- Install system dependencies (if any) ----
+# (none required for this project, but keep if needed)
+
+# ---- Set working directory ----
+WORKDIR /app
+
+# ---- Copy requirements first (for Docker layer caching) ----
+COPY requirements.txt .
+
+# ---- Install Python dependencies ----
+# This does NOT download any models (spacy model is NOT downloaded here)
+RUN pip install --no-cache-dir -r requirements.txt
+
+# ---- Copy the rest of the application ----
+COPY . .
+
+# ---- Expose the port Hugging Face Spaces expects ----
+EXPOSE 7860
+
+# ---- Start the FastAPI server ----
+# Models will be downloaded automatically on first request to /tmp/.cache
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
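Note that the Hugging Face and sentence-transformers models do fetch themselves into the cache directories above, but spacy.load("en_core_web_sm") and nltk.sent_tokenize (used in app/src/linguistic.py and app/src/cross_encoder.py) only work if their data is already present. A minimal startup sketch that could close that gap, assuming it runs before the analyzer is imported (the helper name ensure_runtime_models is made up and not part of this commit):

# Hypothetical helper, not part of this commit: fetch the NLTK tokenizer data
# and the spaCy model at container start-up so the first /analyze call works.
import subprocess
import sys

import nltk
import spacy

def ensure_runtime_models():
    # nltk.sent_tokenize needs the "punkt" tokenizer; it lands in NLTK_DATA.
    nltk.download("punkt", quiet=True)
    # spacy.load raises OSError when en_core_web_sm is missing.
    try:
        spacy.load("en_core_web_sm")
    except OSError:
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=True,
        )

ensure_runtime_models()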
app/__init__.py
ADDED
File without changes
app/data/descriptive_keywords.json
ADDED
File without changes
app/main.py
ADDED
@@ -0,0 +1,54 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import Optional
+import os
+
+# Ensure models are cached to the runtime disk (important for Hugging Face Spaces)
+os.environ["HF_HOME"] = "/tmp/huggingface"
+
+# Import your analyzer
+from app.src.main import TrademarkAnalyzer
+
+app = FastAPI(title="Trademark Descriptiveness API")
+
+# Check that the data file exists (optional, but helpful)
+data_path = "app/data/descriptive_keywords.json"
+if not os.path.exists(data_path):
+    print(f"Warning: Data file not found at {data_path}. Keyword overlap will be disabled.")
+
+# Initialize analyzer
+analyzer = TrademarkAnalyzer(descriptive_keywords_path=data_path)
+
+class AnalyzeRequest(BaseModel):
+    mark: str
+    goods: str
+    goods_class: Optional[str] = None
+
+class AnalyzeResponse(BaseModel):
+    descriptive_score: float
+    generic_score: float
+    reasons: list[str]
+    explanation: str
+    details: dict
+
+@app.get("/")
+def read_root():
+    return {"message": "Trademark API is running"}
+
+@app.get("/health")
+def health_check():
+    return {"status": "ok"}
+
+@app.post("/analyze", response_model=AnalyzeResponse)
+def analyze(request: AnalyzeRequest):
+    try:
+        result = analyzer.analyze(
+            mark=request.mark,
+            goods=request.goods,
+            goods_class=request.goods_class
+        )
+        return AnalyzeResponse(**result)
+    except Exception as e:
+        # Log the error (optional)
+        print(f"Error during analysis: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
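A quick way to exercise the /analyze endpoint above, sketched with the requests library (not pinned in requirements.txt); the host/port assumes the uvicorn command from the Dockerfile, and the payload values are invented examples:

# Illustrative request only; mark, goods and class are made up.
import requests

resp = requests.post(
    "http://localhost:7860/analyze",
    json={
        "mark": "Creamy Delight",
        "goods": "Dairy-based desserts, namely ice cream and frozen yogurt.",
        "goods_class": "30",
    },
)
resp.raise_for_status()
body = resp.json()
print(body["descriptive_score"], body["generic_score"], body["explanation"])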
app/src/__init__.py
ADDED
File without changes
app/src/cross_encoder.py
ADDED
@@ -0,0 +1,47 @@
+from sentence_transformers import CrossEncoder
+from nltk import sent_tokenize
+import numpy as np
+
+class CrossEncoderSimilarity:
+    """
+    Uses a cross-encoder to compute deep semantic similarity between mark and goods.
+    Supports sentence-level segmentation and returns attention weights for explainability.
+    """
+
+    def __init__(self, model_name='cross-encoder/stsb-roberta-large'):
+        self.model = CrossEncoder(model_name, num_labels=1)  # regression output
+        # We'll store the last attention scores if needed (for explainability)
+        self.last_attention = None
+
+    def similarity(self, mark, goods, return_segments=False):
+        """
+        Returns a score between 0 and 1. If return_segments=True, also returns
+        the maximum segment score and the segment text.
+        """
+        if not goods:
+            return 0.0 if not return_segments else (0.0, None)
+        sentences = sent_tokenize(goods)
+        if not sentences:
+            return 0.0 if not return_segments else (0.0, None)
+
+        pairs = [(mark, sent) for sent in sentences]
+        scores = self.model.predict(pairs)
+        # Normalize: assume model output range roughly 0-5 (for stsb models)
+        # If using a different model, adjust normalization accordingly.
+        scores_norm = [min(1.0, max(0.0, s / 5.0)) for s in scores]
+        max_score = max(scores_norm)
+        max_idx = int(np.argmax(scores_norm))
+
+        if return_segments:
+            return max_score, sentences[max_idx]
+        return max_score
+
+    def similarity_with_explanation(self, mark, goods):
+        """
+        Returns score and the most relevant sentence from goods, plus optionally attention.
+        For attention, we'd need a model that returns cross-attention; not all do.
+        This method provides a simple explanation.
+        """
+        max_score, best_sentence = self.similarity(mark, goods, return_segments=True)
+        explanation = f"Highest similarity with segment: '{best_sentence}' (score: {max_score:.2f})"
+        return max_score, explanation
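A small usage sketch for the class above; the mark and goods strings are invented, and constructing the class downloads the cross-encoder/stsb-roberta-large weights on first use:

# Illustrative only; example inputs are made up.
from app.src.cross_encoder import CrossEncoderSimilarity

ce = CrossEncoderSimilarity()
score, explanation = ce.similarity_with_explanation(
    "FreshBake",
    "Bread, rolls and pastries. Bakery goods delivered fresh daily.",
)
print(score, explanation)  # higher score => the mark reads more like the goods text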
app/src/embeddings.py
ADDED
@@ -0,0 +1,73 @@
+import numpy as np
+from sentence_transformers import SentenceTransformer, util
+from nltk import sent_tokenize
+
+class EmbeddingSimilarity:
+    """
+    Uses a sentence-transformer model to compute semantic similarity between
+    the mark and a list of descriptive terms, and also between mark and goods.
+    """
+
+    def __init__(self, model_name='all-MiniLM-L6-v2'):
+        self.model = SentenceTransformer(model_name)
+        # Cache for pre-computed class centroids (optional)
+        self.class_centroids = {}
+
+    def encode(self, text):
+        """Return embedding for a single text."""
+        return self.model.encode(text, convert_to_tensor=True)
+
+    def similarity(self, emb1, emb2):
+        """Cosine similarity between two embeddings."""
+        return float(util.cos_sim(emb1, emb2)[0][0])
+
+    def max_similarity_to_terms(self, mark, descriptive_terms):
+        """
+        Compute the maximum cosine similarity between the mark embedding
+        and each individual descriptive term's embedding.
+        """
+        if not descriptive_terms:
+            return 0.0
+        mark_emb = self.encode(mark)
+        term_embs = self.encode(descriptive_terms)
+        sims = util.cos_sim(mark_emb, term_embs)[0]
+        return float(sims.max())
+
+    def similarity_to_class_centroid(self, mark, class_terms):
+        """
+        Pre-compute centroid for a class (average of all term embeddings)
+        and compare mark against it. (Useful for speed when class_terms are static.)
+        """
+        if not class_terms:
+            return 0.0
+        # Create a key for the class (e.g., tuple of terms sorted)
+        # For simplicity, we'll just compute on the fly; you can cache.
+        term_embs = self.encode(class_terms)
+        centroid = term_embs.mean(axis=0)
+        mark_emb = self.encode(mark)
+        return self.similarity(mark_emb, centroid)
+
+    def similarity_to_goods(self, mark, goods):
+        """
+        Compute similarity between mark and goods using the bi-encoder.
+        This is a fast alternative to the cross-encoder.
+        """
+        if not goods:
+            return 0.0
+        mark_emb = self.encode(mark)
+        goods_emb = self.encode(goods)
+        return self.similarity(mark_emb, goods_emb)
+
+    def similarity_to_goods_segments(self, mark, goods):
+        """
+        Split goods into sentences and take the maximum similarity.
+        """
+        if not goods:
+            return 0.0
+        sentences = sent_tokenize(goods)
+        if not sentences:
+            return 0.0
+        mark_emb = self.encode(mark)
+        sent_embs = self.encode(sentences)
+        sims = util.cos_sim(mark_emb, sent_embs)[0]
+        return float(sims.max())
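A short usage sketch of the bi-encoder path; the example strings are invented, and the first call downloads all-MiniLM-L6-v2 into the cache directories set in the Dockerfile:

# Illustrative only; example inputs are made up.
from app.src.embeddings import EmbeddingSimilarity

emb = EmbeddingSimilarity()
print(emb.max_similarity_to_terms("Creamy", ["creamy", "fresh", "organic"]))
print(emb.similarity_to_goods_segments(
    "Creamy", "Dairy-based spreads. Ice cream and frozen yogurt."
))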
app/src/heuristics.py
ADDED
@@ -0,0 +1,87 @@
+import numpy as np
+
+class DescriptivenessHeuristic:
+    """
+    Combines outputs from linguistic, embedding, and cross-encoder modules
+    to produce final descriptiveness and genericness scores.
+    """
+
+    def __init__(self, ling_analyzer, emb_similarity, cross_encoder, weights=None):
+        self.ling = ling_analyzer
+        self.emb = emb_similarity
+        self.cross = cross_encoder
+        # Default weights - can be tuned via validation
+        self.weights = weights or {
+            'linguistic': 0.25,
+            'embedding_max_term': 0.25,
+            'embedding_goods': 0.20,
+            'cross_encoder': 0.30
+        }
+
+    def assess(self, mark, goods, goods_class=None, descriptive_terms=None):
+        """
+        Returns a dict with scores and reasons.
+        """
+        # 1. Linguistic features
+        ling_feat = self.ling.analyze(mark, goods, goods_class)
+
+        # Construct a linguistic score (example: weighted combination)
+        ling_score = (
+            (0.2 if ling_feat['pos']['adjective_count'] > 0 else 0) +
+            0.3 * ling_feat['dictionary_word_ratio'] +
+            0.2 * ling_feat['descriptive_keyword_overlap'] +
+            0.2 * ling_feat['ngram_overlap_with_goods'] +
+            (0.1 if ling_feat['has_descriptive_suffix'] else 0)
+        )
+        ling_score = min(1.0, ling_score)
+
+        # 2. Embedding similarity to descriptive terms (if provided)
+        emb_term_score = 0.0
+        if descriptive_terms:
+            emb_term_score = self.emb.max_similarity_to_terms(mark, descriptive_terms)
+
+        # 3. Embedding similarity to goods (bi-encoder)
+        emb_goods_score = self.emb.similarity_to_goods_segments(mark, goods)
+
+        # 4. Cross-encoder score
+        cross_score = self.cross.similarity(mark, goods)
+
+        # Weighted combination
+        descriptive_score = (
+            self.weights['linguistic'] * ling_score +
+            self.weights['embedding_max_term'] * emb_term_score +
+            self.weights['embedding_goods'] * emb_goods_score +
+            self.weights['cross_encoder'] * cross_score
+        )
+
+        # Genericness detection (simplified)
+        generic_score = 0.0
+        reasons = []
+
+        # If the mark is a dictionary word and highly similar to goods, could be generic
+        if ling_feat['dictionary_word_ratio'] > 0.8 and cross_score > 0.7:
+            generic_score = 0.8
+            reasons.append("High similarity to goods and common word - potential genericness")
+        elif ling_feat['dictionary_word_ratio'] > 0.9:
+            generic_score = 0.4
+            reasons.append("All words are common dictionary terms")
+
+        # If mark is a hyponym of a goods category? (could be added with WordNet)
+
+        # Build explanation
+        explanation = f"Descriptiveness score: {descriptive_score:.2f}. "
+        if reasons:
+            explanation += "Reasons: " + "; ".join(reasons)
+
+        return {
+            'descriptive_score': round(descriptive_score, 2),
+            'generic_score': round(generic_score, 2),
+            'reasons': reasons,
+            'explanation': explanation,
+            'details': {
+                'linguistic': ling_feat,
+                'embedding_term': emb_term_score,
+                'embedding_goods': emb_goods_score,
+                'cross_encoder': cross_score
+            }
+        }
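To make the default weighting concrete, here is a tiny worked example using invented component scores (none of these numbers come from the commit):

# Invented component scores, shown only to illustrate the default weighting.
weights = {'linguistic': 0.25, 'embedding_max_term': 0.25,
           'embedding_goods': 0.20, 'cross_encoder': 0.30}
scores = {'linguistic': 0.6, 'embedding_max_term': 0.7,
          'embedding_goods': 0.5, 'cross_encoder': 0.8}
descriptive = sum(weights[k] * scores[k] for k in weights)
print(f"{descriptive:.3f}")  # 0.15 + 0.175 + 0.10 + 0.24 = 0.665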
app/src/linguistic.py
ADDED
@@ -0,0 +1,167 @@
+import spacy
+import json
+import os
+import math
+from collections import Counter
+from nltk import word_tokenize
+from nltk.corpus import wordnet
+from nltk.corpus.reader.wordnet import NOUN, ADJ, ADV, VERB
+
+# Load spaCy model (download if not present: python -m spacy download en_core_web_sm)
+nlp = spacy.load("en_core_web_sm")
+
+# Optional: load word frequency data (e.g., SUBTLEX frequency file)
+# If not available, we use a simple fallback (all words equally frequent).
+FREQ_DICT = {}
+FREQ_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'word_freq.json')
+if os.path.exists(FREQ_PATH):
+    with open(FREQ_PATH, 'r') as f:
+        FREQ_DICT = json.load(f)
+
+class LinguisticAnalyzer:
+    """
+    Extracts rich linguistic features from a trademark string.
+    Features include POS tags, dependency relations, dictionary membership,
+    word frequency, n-gram overlap with goods description, and named entities.
+    """
+
+    def __init__(self, descriptive_keywords_path=None):
+        self.descriptive_keywords = {}
+        if descriptive_keywords_path and os.path.exists(descriptive_keywords_path):
+            with open(descriptive_keywords_path, 'r', encoding='utf-8') as f:
+                self.descriptive_keywords = json.load(f)  # e.g., {"class_030": ["fresh", "creamy"]}
+
+        # List of common descriptive suffixes (e.g., -y, -er, -ing)
+        self.descriptive_suffixes = ('y', 'er', 'ing', 'ive', 'ous', 'al', 'ic')
+
+    def pos_tags(self, text):
+        """Return list of (token, POS, detailed tag) using spaCy."""
+        doc = nlp(text)
+        return [(token.text, token.pos_, token.tag_) for token in doc]
+
+    def dependency_relations(self, text):
+        """Extract adjective-noun and other modifier relations."""
+        doc = nlp(text)
+        modifiers = []
+        for token in doc:
+            # amod: adjectival modifier, nmod: nominal modifier
+            if token.dep_ in ('amod', 'nmod') and token.head.pos_ in ('NOUN', 'PROPN'):
+                modifiers.append((token.text, token.head.text, token.dep_))
+        return modifiers
+
+    def is_dictionary_word(self, word):
+        """Check if word exists in WordNet."""
+        return bool(wordnet.synsets(word))
+
+    def word_frequency(self, word):
+        """
+        Return log frequency of word (if available). Higher = more common.
+        Defaults to 0 if not in frequency dictionary.
+        """
+        return FREQ_DICT.get(word.lower(), 0)
+
+    def extract_ngrams(self, text, n=2, use_words=True):
+        """Generate word n-grams or character n-grams."""
+        if use_words:
+            words = word_tokenize(text.lower())
+            ngrams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]
+        else:
+            # character n-grams
+            text_clean = text.lower().replace(' ', '')
+            ngrams = [text_clean[i:i+n] for i in range(len(text_clean)-n+1)]
+        return ngrams
+
+    def ngram_overlap_with_goods(self, mark, goods, n=2):
+        """
+        Compute the fraction of mark word n-grams that appear verbatim in the goods description.
+        """
+        if not goods:
+            return 0.0
+        mark_ngrams = set(self.extract_ngrams(mark, n=n, use_words=True))
+        goods_ngrams = set(self.extract_ngrams(goods, n=n, use_words=True))
+        if not mark_ngrams:
+            return 0.0
+        overlap = mark_ngrams.intersection(goods_ngrams)
+        return len(overlap) / len(mark_ngrams)
+
+    def descriptive_keyword_overlap(self, mark, goods_class=None):
+        """
+        Return fraction of mark words that appear (as lemmas) in the descriptive list for the given class.
+        Uses lemmatization to catch inflected forms.
+        """
+        if not self.descriptive_keywords or not goods_class:
+            return 0.0
+        # Lemmatize mark words
+        doc = nlp(mark)
+        mark_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}
+        desc_words = set(self.descriptive_keywords.get(goods_class, []))
+        if not mark_lemmas or not desc_words:
+            return 0.0
+        overlap = mark_lemmas.intersection(desc_words)
+        return len(overlap) / len(mark_lemmas)
+
+    def has_descriptive_suffix(self, word):
+        """Check if word ends with a common descriptive suffix."""
+        return any(word.lower().endswith(suf) for suf in self.descriptive_suffixes)
+
+    def extract_entities(self, text):
+        """Return list of named entities (PERSON, ORG, GPE, etc.)."""
+        doc = nlp(text)
+        return [(ent.text, ent.label_) for ent in doc.ents]
+
+    def analyze(self, mark, goods=None, goods_class=None):
+        """
+        Main method: returns a dictionary of linguistic features.
+        """
+        doc = nlp(mark)
+        tokens = [token.text.lower() for token in doc if token.is_alpha]
+        if not tokens:
+            return {'pos': {}, 'dictionary_word_ratio': 0, 'avg_word_freq': 0,
+                    'descriptive_keyword_overlap': 0, 'ngram_overlap_with_goods': 0,
+                    'has_descriptive_suffix': False, 'has_entity': False, 'ngrams': []}
+
+        # POS summary
+        pos_tags = [(token.text, token.pos_, token.tag_) for token in doc]
+        pos_summary = {
+            'adjective_count': sum(1 for _, pos, _ in pos_tags if pos == 'ADJ'),
+            'comparative_count': sum(1 for _, _, tag in pos_tags if tag in ('JJR', 'JJS')),
+            'noun_count': sum(1 for _, pos, _ in pos_tags if pos == 'NOUN'),
+            'verb_count': sum(1 for _, pos, _ in pos_tags if pos == 'VERB')
+        }
+
+        # Dependency modifiers
+        modifiers = self.dependency_relations(mark)
+
+        # Dictionary word ratio
+        dict_word_ratio = sum(1 for w in tokens if self.is_dictionary_word(w)) / len(tokens) if tokens else 0
+
+        # Average word frequency (log)
+        avg_freq = sum(self.word_frequency(w) for w in tokens) / len(tokens) if tokens else 0
+
+        # Overlap with goods n-grams
+        ngram_overlap = self.ngram_overlap_with_goods(mark, goods, n=2) if goods else 0.0
+
+        # Descriptive keyword overlap (lemma-based)
+        desc_overlap = self.descriptive_keyword_overlap(mark, goods_class)
+
+        # Suffix check on the longest word (or any)
+        has_desc_suffix = any(self.has_descriptive_suffix(w) for w in tokens)
+
+        # Named entities
+        entities = self.extract_entities(mark)
+        has_entity = len(entities) > 0
+
+        # Word n-grams for later use
+        ngrams = self.extract_ngrams(mark, n=2, use_words=True)
+
+        return {
+            'pos': pos_summary,
+            'modifiers': modifiers,
+            'dictionary_word_ratio': dict_word_ratio,
+            'avg_word_freq': avg_freq,
+            'descriptive_keyword_overlap': desc_overlap,
+            'ngram_overlap_with_goods': ngram_overlap,
+            'has_descriptive_suffix': has_desc_suffix,
+            'has_entity': has_entity,
+            'ngrams': ngrams
+        }
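A usage sketch with invented inputs. Note that descriptive_keyword_overlap looks the class up by the raw goods_class string, so when calling this class directly the key has to match the JSON (the code comments use keys like "class_030"):

# Illustrative only; the mark, goods and class key are made up.
from app.src.linguistic import LinguisticAnalyzer

la = LinguisticAnalyzer("app/data/descriptive_keywords.json")
feats = la.analyze(
    "Creamy Fresh",
    goods="Dairy-based spreads, creams and yogurts",
    goods_class="class_030",  # key must match the JSON used by descriptive_keyword_overlap
)
print(feats["pos"], feats["dictionary_word_ratio"], feats["ngram_overlap_with_goods"])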
app/src/main.py
ADDED
@@ -0,0 +1,57 @@
+import os
+from .linguistic import LinguisticAnalyzer
+from .embeddings import EmbeddingSimilarity
+from .cross_encoder import CrossEncoderSimilarity
+from .heuristics import DescriptivenessHeuristic
+
+class TrademarkAnalyzer:
+    """
+    High-level API for trademark descriptiveness analysis.
+    Initializes all sub-modules and provides a unified analyze() method.
+    """
+
+    def __init__(self, descriptive_keywords_path=None):
+        """
+        Args:
+            descriptive_keywords_path: Path to JSON file with class-specific descriptive terms.
+        """
+        # Ensure models are cached in the runtime disk (if not already set)
+        if "HF_HOME" not in os.environ:
+            os.environ["HF_HOME"] = "/tmp/huggingface"
+
+        # Initialize sub-modules
+        self.linguistic = LinguisticAnalyzer(descriptive_keywords_path)
+        self.embedding = EmbeddingSimilarity()  # uses sentence-transformers
+        self.cross_encoder = CrossEncoderSimilarity()
+        self.heuristic = DescriptivenessHeuristic(
+            self.linguistic,
+            self.embedding,
+            self.cross_encoder
+        )
+
+    def analyze(self, mark, goods, goods_class=None):
+        """
+        Perform full descriptiveness analysis.
+
+        Args:
+            mark (str): The trademark text.
+            goods (str): Description of goods/services.
+            goods_class (str, optional): USPTO class (e.g., "30").
+
+        Returns:
+            dict: Contains descriptive_score, generic_score, reasons, explanation, details.
+        """
+        # Load descriptive terms for the class (if any)
+        descriptive_terms = None
+        if goods_class and self.linguistic.descriptive_keywords:
+            class_key = f"class_{goods_class.zfill(3)}"  # e.g., class_030
+            descriptive_terms = self.linguistic.descriptive_keywords.get(class_key, [])
+
+        # Run the heuristic assessment
+        result = self.heuristic.assess(
+            mark=mark,
+            goods=goods,
+            goods_class=goods_class,
+            descriptive_terms=descriptive_terms
+        )
+        return result
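And the end-to-end entry point, sketched with invented inputs; the three model-backed sub-modules are loaded eagerly in __init__, so the first construction is slow:

# Illustrative only; mark, goods and class are made up.
from app.src.main import TrademarkAnalyzer

analyzer = TrademarkAnalyzer(descriptive_keywords_path="app/data/descriptive_keywords.json")
result = analyzer.analyze(
    mark="Creamy Delight",
    goods="Dairy-based desserts, namely ice cream and frozen yogurt",
    goods_class="30",
)
print(result["descriptive_score"], result["generic_score"], result["reasons"])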
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+fastapi==0.115.0
+uvicorn==0.30.0
+spacy==3.7.2
+nltk==3.8.1
+sentence-transformers==3.0.1
+transformers==4.41.0
+torch==2.3.0
+numpy==1.24.3
+huggingface-hub==0.23.0
+pydantic==2.7.0