| """ |
| FastAPI server providing OpenAI-compatible endpoints for code generation. |
| Designed to work with MCP servers and provide unlimited tokens with minimal rate limiting. |
| """ |
| import os |
| import time |
| import uuid |
| from typing import Optional, List, Dict, Any |
| from fastapi import FastAPI, HTTPException |
| from fastapi.middleware.cors import CORSMiddleware |
| from pydantic import BaseModel, Field |
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
| import uvicorn |
|
|
| |
| |
| |
| MODEL_REPO = "TheBloke/deepseek-coder-1.3b-instruct-GGUF" |
| MODEL_FILE = "deepseek-coder-1.3b-instruct.Q4_K_M.gguf" |
| MODEL_NAME = "deepseek-coder-1.3b-instruct" |
|
|
| |
| MAX_CONTEXT = 4096 |
| MAX_TOKENS = 4096 |
| DEFAULT_TEMP = 0.7 |
| DEFAULT_TOP_P = 0.95 |
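
# Note: the prompt and the completion share the same context window, so a long
# prompt leaves fewer than MAX_TOKENS tokens available for generation.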
|
|
| |
| |
| |
| class Message(BaseModel): |
| role: str |
| content: str |
|
|
| class ChatCompletionRequest(BaseModel): |
| model: str = MODEL_NAME |
| messages: List[Message] |
| temperature: Optional[float] = DEFAULT_TEMP |
| top_p: Optional[float] = DEFAULT_TOP_P |
| max_tokens: Optional[int] = MAX_TOKENS |
| stream: Optional[bool] = False |
| stop: Optional[List[str]] = None |
|
|
| class CompletionRequest(BaseModel): |
| model: str = MODEL_NAME |
| prompt: str |
| temperature: Optional[float] = DEFAULT_TEMP |
| top_p: Optional[float] = DEFAULT_TOP_P |
| max_tokens: Optional[int] = MAX_TOKENS |
| stop: Optional[List[str]] = None |
|
|
| class Usage(BaseModel): |
| prompt_tokens: int |
| completion_tokens: int |
| total_tokens: int |
|
|
| class ChatCompletionChoice(BaseModel): |
| index: int |
| message: Message |
| finish_reason: str |
|
|
| class ChatCompletionResponse(BaseModel): |
| id: str |
| object: str = "chat.completion" |
| created: int |
| model: str |
| choices: List[ChatCompletionChoice] |
| usage: Usage |
|
|
| class CompletionChoice(BaseModel): |
| index: int |
| text: str |
| finish_reason: str |
|
|
| class CompletionResponse(BaseModel): |
| id: str |
| object: str = "text_completion" |
| created: int |
| model: str |
| choices: List[CompletionChoice] |
| usage: Usage |
|
|
| |
| |
| |
| app = FastAPI( |
| title="Code LLM API", |
| description="OpenAI-compatible API for code generation with minimal rate limiting", |
| version="1.0.0" |
| ) |
|
|
| |
| app.add_middleware( |
| CORSMiddleware, |
| allow_origins=["*"], |
| allow_credentials=True, |
| allow_methods=["*"], |
| allow_headers=["*"], |
| ) |
|
|
| |
llm: Optional[Llama] = None  # loaded once at startup by load_model(); None while the model is still loading
|
|
| |
| |
| |
| @app.on_event("startup") |
| async def load_model(): |
| """Load the LLM model on startup.""" |
| global llm |
| print(f"Downloading model {MODEL_REPO}/{MODEL_FILE}...") |
| model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE) |
| print(f"Model downloaded to: {model_path}") |
|
|
| print("Loading model into memory...") |
    llm = Llama(
        model_path=model_path,
        n_ctx=MAX_CONTEXT,   # context window size (tokens)
        n_threads=8,         # CPU threads used for inference
        n_batch=1024,        # prompt-processing batch size
        verbose=False,
        n_gpu_layers=0       # 0 = CPU-only inference
    )
| print("Model loaded successfully!") |
|
|
| |
| |
| |
| def messages_to_prompt(messages: List[Message]) -> str: |
| """Convert OpenAI-style messages to a prompt for CodeLlama.""" |
| prompt_parts = [] |
|
|
| for msg in messages: |
| if msg.role == "system": |
| prompt_parts.append(f"### System: {msg.content}") |
| elif msg.role == "user": |
| prompt_parts.append(f"### Instruction: {msg.content}") |
| elif msg.role == "assistant": |
| prompt_parts.append(f"### Response: {msg.content}") |
|
|
| prompt_parts.append("### Response:") |
| return "\n".join(prompt_parts) |
|
|
| def estimate_tokens(text: str) -> int: |
| """Rough token estimation (1 token ≈ 4 chars).""" |
| return len(text) // 4 |
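
# e.g. estimate_tokens("def add(a, b):") == 3  (14 characters // 4)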
|
|
| |
| |
| |
| @app.get("/") |
| async def root(): |
| """Health check endpoint.""" |
| return { |
| "status": "online", |
| "model": MODEL_NAME, |
| "max_context": MAX_CONTEXT, |
| "max_tokens": MAX_TOKENS, |
| "endpoints": { |
| "chat": "/v1/chat/completions", |
| "completion": "/v1/completions", |
| "models": "/v1/models" |
| } |
| } |
|
|
| @app.get("/health") |
| async def health(): |
| """Health check for monitoring.""" |
| return { |
| "status": "healthy" if llm is not None else "loading", |
| "model_loaded": llm is not None |
| } |
|
|
| @app.get("/v1/models") |
| async def list_models(): |
| """List available models (OpenAI-compatible).""" |
| return { |
| "object": "list", |
| "data": [ |
| { |
| "id": MODEL_NAME, |
| "object": "model", |
| "created": int(time.time()), |
| "owned_by": "huggingface", |
| "permission": [], |
| "root": MODEL_NAME, |
| "parent": None |
| } |
| ] |
| } |
|
|
| @app.post("/v1/chat/completions", response_model=ChatCompletionResponse) |
| async def chat_completions(request: ChatCompletionRequest): |
| """ |
| OpenAI-compatible chat completions endpoint. |
| No rate limiting - designed for unlimited use. |
| """ |
| if llm is None: |
| raise HTTPException(status_code=503, detail="Model still loading") |
|
|
| if request.stream: |
| raise HTTPException(status_code=501, detail="Streaming not yet implemented") |
|
|
| |
| prompt = messages_to_prompt(request.messages) |
|
|
| |
| try: |
        output = llm(
            prompt,
            # use `is not None` so explicit zero values from the client are honored
            max_tokens=request.max_tokens if request.max_tokens is not None else MAX_TOKENS,
            temperature=request.temperature if request.temperature is not None else DEFAULT_TEMP,
            top_p=request.top_p if request.top_p is not None else DEFAULT_TOP_P,
            stop=request.stop or ["###", "\n\n\n"],
            echo=False
        )
|
|
| generated_text = output['choices'][0]['text'].strip() |
|
|
| |
| prompt_tokens = estimate_tokens(prompt) |
| completion_tokens = estimate_tokens(generated_text) |
|
|
| return ChatCompletionResponse( |
| id=f"chatcmpl-{uuid.uuid4().hex[:8]}", |
| created=int(time.time()), |
| model=request.model, |
| choices=[ |
| ChatCompletionChoice( |
| index=0, |
| message=Message(role="assistant", content=generated_text), |
| finish_reason="stop" |
| ) |
| ], |
| usage=Usage( |
| prompt_tokens=prompt_tokens, |
| completion_tokens=completion_tokens, |
| total_tokens=prompt_tokens + completion_tokens |
| ) |
| ) |
| except Exception as e: |
| raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}") |
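
# Example request (a sketch, assuming the server is listening on localhost:7860
# and the `requests` package is installed):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={"messages": [{"role": "user", "content": "Write FizzBuzz in Python"}]},
#   )
#   print(resp.json()["choices"][0]["message"]["content"])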
|
|
| @app.post("/v1/completions", response_model=CompletionResponse) |
| async def completions(request: CompletionRequest): |
| """ |
| OpenAI-compatible completions endpoint. |
| No rate limiting - designed for unlimited use. |
| """ |
| if llm is None: |
| raise HTTPException(status_code=503, detail="Model still loading") |
|
|
| try: |
        output = llm(
            request.prompt,
            # use `is not None` so explicit zero values from the client are honored
            max_tokens=request.max_tokens if request.max_tokens is not None else MAX_TOKENS,
            temperature=request.temperature if request.temperature is not None else DEFAULT_TEMP,
            top_p=request.top_p if request.top_p is not None else DEFAULT_TOP_P,
            stop=request.stop or [],
            echo=False
        )
|
|
| generated_text = output['choices'][0]['text'].strip() |
|
|
| |
| prompt_tokens = estimate_tokens(request.prompt) |
| completion_tokens = estimate_tokens(generated_text) |
|
|
| return CompletionResponse( |
| id=f"cmpl-{uuid.uuid4().hex[:8]}", |
| created=int(time.time()), |
| model=request.model, |
| choices=[ |
| CompletionChoice( |
| index=0, |
| text=generated_text, |
| finish_reason="stop" |
| ) |
| ], |
| usage=Usage( |
| prompt_tokens=prompt_tokens, |
| completion_tokens=completion_tokens, |
| total_tokens=prompt_tokens + completion_tokens |
| ) |
| ) |
| except Exception as e: |
| raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}") |
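
# Example request (a sketch, assuming localhost:7860): unlike the chat endpoint,
# this one takes a raw prompt to continue rather than a message list:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/v1/completions",
#       json={"prompt": "def quicksort(arr):", "max_tokens": 128},
#   )
#   print(resp.json()["choices"][0]["text"])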
|
|
| |
| |
| |
| @app.post("/generate") |
| async def generate(prompt: str, max_tokens: int = 512): |
| """Simple generation endpoint for quick testing.""" |
| if llm is None: |
| raise HTTPException(status_code=503, detail="Model still loading") |
|
|
| try: |
| output = llm(prompt, max_tokens=max_tokens, temperature=0.7) |
| return { |
| "prompt": prompt, |
| "response": output['choices'][0]['text'].strip(), |
| "model": MODEL_NAME |
| } |
| except Exception as e: |
| raise HTTPException(status_code=500, detail=str(e)) |
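
# Example (a sketch, assuming localhost:7860): `prompt` and `max_tokens` are
# plain parameters here, so FastAPI reads them from the query string:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/generate",
#       params={"prompt": "def fib(n):", "max_tokens": 64},
#   )
#   print(resp.json()["response"])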
|
|
| |
| |
| |
| if __name__ == "__main__": |
| uvicorn.run( |
| app, |
| host="0.0.0.0", |
| port=int(os.getenv("PORT", "7860")), |
| log_level="info" |
| ) |
|
|