LLM router - proxies to provider skills (claude, openai, ollama)

2026-02-03 00:09:06 +01:00
commit 1d9de9d770
4 changed files with 551 additions and 0 deletions
--- a/SKILL.md
+++ b/SKILL.md
@@ -0,0 +1,141 @@
 ---
 name: llm
 description: LLM router that proxies to provider skills (claude, openai, ollama)
 metadata:
  version: "1.0.0"
  vibestack:
    main: false
 ---
 # LLM Skill
 Unified LLM router that proxies requests to provider-specific skills. Abstracts away which LLM backend is being used.
 ## Architecture
 ```
 ┌─────────────┐     ┌─────────────┐
 │   client    │────▶│     llm     │ (router)
 └─────────────┘     └──────┬──────┘
                           │
        ┌──────────────────┼──────────────────┐
        ▼                  ▼                  ▼
 ┌───────────────┐  ┌───────────────┐  ┌───────────────┐
 │ claude skill  │  │ openai skill  │  │ ollama skill  │
 │ localhost:8888│  │ localhost:8889│  │localhost:11434│
 └───────────────┘  └───────────────┘  └───────────────┘
 ```
 ## Configuration
 ### Environment Variables
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `LLM_PORT` | `8082` | Router port |
 | `LLM_PROVIDER` | `claude` | Active provider: `claude`, `openai`, `ollama` |
 | `CLAUDE_URL` | `http://localhost:8888` | Claude skill URL |
 | `OPENAI_URL` | `http://localhost:8889` | OpenAI skill URL |
 | `OLLAMA_URL` | `http://localhost:11434` | Ollama URL |
 | `MEMORY_URL` | (none) | Memory skill URL for conversation persistence |
 ## API
 ### WebSocket Chat
 Connect to `ws://localhost:8082/chat` for a unified chat interface.
 **Send message:**
 ```json
 {
  "type": "message",
  "content": "Hello!",
  "session_id": "optional-session-id"
 }
 ```
 **Receive:**
 ```json
 {"type": "start", "session_id": "abc123", "provider": "claude"}
 {"type": "token", "content": "Hello"}
 {"type": "token", "content": "!"}
 {"type": "end"}
 ```
 ### REST API
 ```bash
 # Chat (proxied to provider)
 curl http://localhost:8082/chat \
  -H "Content-Type: application/json" \
  -d '{"message": "Hello!"}'
 # Execute (one-shot, proxied to provider; supported for claude and ollama only)
 curl http://localhost:8082/execute \
  -H "Content-Type: application/json" \
  -d '{"prompt": "List all files"}'
 # Health check
 curl http://localhost:8082/health
 # Get current provider
 curl http://localhost:8082/provider
 ```
 ## Provider Skills
 Each provider skill implements its own API. The LLM router translates each client request into the active provider's native format:
 ### Claude Skill (port 8888)
 - `POST /chat` - `{"message": "...", "session_id": "..."}`
 - `POST /execute` - `{"prompt": "..."}`
 ### OpenAI Skill (port 8889)
 - `POST /v1/chat/completions` - OpenAI format
 ### Ollama (port 11434)
 - `POST /api/chat` - Ollama format
 ## Switching Providers
 ```bash
 # Use Claude (default)
 LLM_PROVIDER=claude
 # Use OpenAI
 LLM_PROVIDER=openai
 # Use Ollama
 LLM_PROVIDER=ollama
 ```
 Clients connect to `localhost:8082` - they don't need to know which provider is active.
 ## Tool Calling (Pass-through)
 Tools are passed to the provider skill. When the LLM wants to call a tool:
 1. LLM router sends tool definitions to provider
 2. Provider returns tool call request
 3. Router passes tool call to client via WebSocket
 4. Client executes tool, sends result back
 5. Router forwards result to provider
 6. Provider continues conversation
 ```json
 // Client receives
 {"type": "tool_call", "name": "read_file", "arguments": {"path": "/etc/hosts"}}
 // Client sends back
 {"type": "tool_result", "name": "read_file", "result": "127.0.0.1 localhost..."}
 ```
 ## Conversation Memory
 If `MEMORY_URL` is set, conversations are stored:
 ```bash
 MEMORY_URL=http://localhost:8081
 ```
 Each completed exchange (the user message plus the assistant response) is POSTed to `$MEMORY_URL/memory` as a `conversation` entry for later retrieval.
--- a/scripts/autorun.sh
+++ b/scripts/autorun.sh
@@ -0,0 +1,48 @@
 #!/bin/bash
 # One-time setup for the LLM router skill. Abort on the first failing command.
 set -o errexit
 # Skill root is the parent of the directory holding this script.
 scripts_dir="$(dirname "$0")"
 SKILL_DIR="$(dirname "$scripts_dir")"
 # Install Python (including venv/pip support) if not present.
 # Outputs: progress messages on stdout.
 # Returns: 0 on success; non-zero if an apt-get step fails.
 install_python() {
    # A python3 binary alone is not enough: Debian/Ubuntu package the venv and
    # ensurepip modules separately (python3-venv), and creating the skill's
    # virtualenv needs both. Only skip installation when they import.
    if command -v python3 &>/dev/null \
        && python3 -c 'import venv, ensurepip' &>/dev/null; then
        echo "Python already installed: $(python3 --version)"
        return 0
    fi
    echo "Installing Python..."
    apt-get update
    apt-get install -y python3 python3-pip python3-venv
    echo "Python installed: $(python3 --version)"
 }
 # Setup Python virtual environment and dependencies.
 # Globals:  SKILL_DIR (read) - skill root; the venv lives in $SKILL_DIR/.venv
 # Outputs:  progress messages on stdout.
 # Returns:  0 on success; non-zero if venv creation or pip fails.
 setup_python_env() {
    local venv_dir="$SKILL_DIR/.venv"
    # The directory existing does not prove the environment is usable: an
    # earlier run may have aborted between creating the venv and finishing
    # the pip install. Skip the work only when the interpreter is present and
    # every dependency actually imports.
    if [ -x "$venv_dir/bin/python" ] \
        && "$venv_dir/bin/python" -c \
            'import fastapi, uvicorn, websockets, httpx, pydantic, ulid' \
            &>/dev/null; then
        echo "Python venv already exists"
        return 0
    fi
    if [ -d "$venv_dir" ]; then
        echo "Python venv is incomplete, repairing..."
    fi
    echo "Creating Python virtual environment..."
    python3 -m venv "$venv_dir"
    echo "Installing Python dependencies..."
    "$venv_dir/bin/pip" install --upgrade pip
    "$venv_dir/bin/pip" install \
        fastapi==0.109.0 \
        uvicorn==0.27.0 \
        websockets==12.0 \
        httpx==0.26.0 \
        pydantic==2.5.0 \
        python-ulid==2.2.0
    echo "Python environment ready"
 }
 # Order matters: setup_python_env runs `python3 -m venv`, so python3 must be
 # available first. Under `set -e` a failure in either step stops the script
 # before the completion message is printed.
 install_python
 setup_python_env
 echo "LLM router setup complete"
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -0,0 +1,25 @@
 #!/bin/bash
 # Launch the LLM router (src/api.py) using the skill's virtualenv interpreter.
 # All configuration is taken from the environment; unset or empty values fall
 # back to the defaults assigned below.
 set -e
 LLM_PORT="${LLM_PORT:-8082}"
 SKILL_DIR="$(dirname "$(dirname "$0")")"
 VENV_DIR="$SKILL_DIR/.venv"
 # Export config for Python
 export LLM_PORT
 export LLM_PROVIDER="${LLM_PROVIDER:-claude}"
 export CLAUDE_URL="${CLAUDE_URL:-http://localhost:8888}"
 export OPENAI_URL="${OPENAI_URL:-http://localhost:8889}"
 export OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
 export MEMORY_URL="${MEMORY_URL:-}"
 echo "Starting LLM Router on port $LLM_PORT..."
 echo "Provider: $LLM_PROVIDER"
 case "$LLM_PROVIDER" in
    claude)  echo "Backend: $CLAUDE_URL" ;;
    openai)  echo "Backend: $OPENAI_URL" ;;
    ollama)  echo "Backend: $OLLAMA_URL" ;;
    *)
        # Surface a mistyped provider at startup instead of staying silent;
        # the router is still started so its status endpoints remain reachable.
        echo "Warning: unknown LLM_PROVIDER '$LLM_PROVIDER' (expected: claude, openai, ollama)" >&2
        ;;
 esac
 # Give an actionable message rather than a bare exec failure when the
 # virtualenv has not been created yet.
 if [ ! -x "$VENV_DIR/bin/python" ]; then
    echo "Error: $VENV_DIR/bin/python not found; run scripts/autorun.sh first" >&2
    exit 1
 fi
 exec "$VENV_DIR/bin/python" "$SKILL_DIR/src/api.py"
--- a/src/api.py
+++ b/src/api.py
@@ -0,0 +1,337 @@
 #!/usr/bin/env python3
 """
 LLM Router - Proxies requests to provider skills (claude, openai, ollama)
 """
 import os
 import json
 import asyncio
 from typing import Optional
 from contextlib import asynccontextmanager
 import httpx
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from ulid import ULID
 # Configuration - read once from the environment at import time
 LLM_PORT = int(os.getenv("LLM_PORT", "8082"))
 LLM_PROVIDER = os.getenv("LLM_PROVIDER", "claude")
 # Base URLs of the provider skills
 CLAUDE_URL = os.getenv("CLAUDE_URL", "http://localhost:8888")
 OPENAI_URL = os.getenv("OPENAI_URL", "http://localhost:8889")
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
 # Memory skill base URL; an empty string disables conversation storage
 MEMORY_URL = os.getenv("MEMORY_URL", "")
 def get_provider_url() -> str:
    """Return the base URL of the provider selected by LLM_PROVIDER.
    Any value other than "openai" or "ollama" (including "claude" and
    unrecognized names) resolves to the Claude skill URL.
    """
    if LLM_PROVIDER == "openai":
        return OPENAI_URL
    if LLM_PROVIDER == "ollama":
        return OLLAMA_URL
    return CLAUDE_URL
 class ChatRequest(BaseModel):
    """Request body accepted by POST /chat."""
    # Text of the user's message, forwarded to the active provider
    message: str
    # Conversation identifier; the /chat handler generates a ULID when omitted
    session_id: Optional[str] = None
 class ExecuteRequest(BaseModel):
    """Request body accepted by POST /execute."""
    # One-shot prompt forwarded to the active provider
    prompt: str
 # Memory integration
 async def store_conversation(session_id: str, message: str, response: str):
    """Store one user/assistant exchange in the memory skill (best effort).
    Does nothing when MEMORY_URL is empty. Otherwise POSTs the exchange to
    {MEMORY_URL}/memory as a "conversation" entry tagged with the session id
    and the active provider. Failures are logged and never raised, so a
    memory outage cannot break the chat request that triggered the call.
    Args:
        session_id: Conversation identifier recorded in the entry metadata.
        message: The user's message.
        response: The assistant's reply.
    """
    if not MEMORY_URL:
        return
    content = f"User: {message}\nAssistant: {response}"
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.post(
                f"{MEMORY_URL}/memory",
                json={
                    "type": "conversation",
                    "content": content,
                    "metadata": {"session_id": session_id, "provider": LLM_PROVIDER},
                },
                timeout=5,
            )
            # A 4xx/5xx reply means nothing was stored; turn it into an
            # exception so it is logged below instead of passing silently.
            resp.raise_for_status()
    except Exception as e:
        print(f"Failed to store conversation: {e}")
@asynccontextmanager
 async def lifespan(app: FastAPI):
    print(f"LLM Router starting on port {LLM_PORT}")
    print(f"Provider: {LLM_PROVIDER} -> {get_provider_url()}")
    yield
    print("Shutting down...")
 # ASGI application object; the route decorators below register handlers on it
 # and the lifespan hook logs startup/shutdown.
 app = FastAPI(
    title="LLM Router",
    description="Unified LLM interface routing to provider skills",
    version="1.0.0",
    lifespan=lifespan,
 )
@app.get("/health")
 async def health():
    """Health check - also checks provider health."""
    provider_url = get_provider_url()
    provider_healthy = False
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(f"{provider_url}/health", timeout=5)
            provider_healthy = resp.status_code == 200
    except:
        pass
    return {
        "status": "healthy" if provider_healthy else "degraded",
        "provider": LLM_PROVIDER,
        "provider_url": provider_url,
        "provider_healthy": provider_healthy,
    }
@app.get("/provider")
 async def get_provider():
    """Get current provider info."""
    return {
        "provider": LLM_PROVIDER,
        "url": get_provider_url(),
    }
@app.post("/chat")
 async def chat(request: ChatRequest):
    """Chat endpoint - proxies to provider skill."""
    provider_url = get_provider_url()
    session_id = request.session_id or str(ULID())
    try:
        async with httpx.AsyncClient() as client:
            if LLM_PROVIDER == "claude":
                # Claude skill format
                resp = await client.post(
                    f"{provider_url}/chat",
                    json={"message": request.message, "session_id": session_id},
                    timeout=120,
                )
                data = resp.json()
                if data.get("success"):
                    response_text = data.get("response", "")
                    await store_conversation(session_id, request.message, response_text)
                    return {
                        "success": True,
                        "response": response_text,
                        "session_id": session_id,
                        "provider": LLM_PROVIDER,
                    }
                else:
                    return JSONResponse(
                        status_code=500,
                        content={"success": False, "error": data.get("error", "Unknown error")},
                    )
            elif LLM_PROVIDER == "ollama":
                # Ollama format
                resp = await client.post(
                    f"{provider_url}/api/chat",
                    json={
                        "model": os.environ.get("OLLAMA_MODEL", "llama3.2"),
                        "messages": [{"role": "user", "content": request.message}],
                        "stream": False,
                    },
                    timeout=120,
                )
                data = resp.json()
                response_text = data.get("message", {}).get("content", "")
                await store_conversation(session_id, request.message, response_text)
                return {
                    "success": True,
                    "response": response_text,
                    "session_id": session_id,
                    "provider": LLM_PROVIDER,
                }
            elif LLM_PROVIDER == "openai":
                # OpenAI skill format
                resp = await client.post(
                    f"{provider_url}/v1/chat/completions",
                    json={
                        "model": os.environ.get("OPENAI_MODEL", "gpt-4o"),
                        "messages": [{"role": "user", "content": request.message}],
                    },
                    timeout=120,
                )
                data = resp.json()
                response_text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
                await store_conversation(session_id, request.message, response_text)
                return {
                    "success": True,
                    "response": response_text,
                    "session_id": session_id,
                    "provider": LLM_PROVIDER,
                }
            else:
                raise HTTPException(status_code=400, detail=f"Unknown provider: {LLM_PROVIDER}")
    except httpx.RequestError as e:
        return JSONResponse(
            status_code=503,
            content={"success": False, "error": f"Provider unavailable: {e}"},
        )
@app.post("/execute")
 async def execute(request: ExecuteRequest):
    """Execute endpoint - proxies to provider skill."""
    provider_url = get_provider_url()
    try:
        async with httpx.AsyncClient() as client:
            if LLM_PROVIDER == "claude":
                # Claude skill execute endpoint
                resp = await client.post(
                    f"{provider_url}/execute",
                    json={"prompt": request.prompt},
                    timeout=300,  # Longer timeout for execution
                )
                return resp.json()
            elif LLM_PROVIDER == "ollama":
                # Use chat for ollama
                resp = await client.post(
                    f"{provider_url}/api/chat",
                    json={
                        "model": os.environ.get("OLLAMA_MODEL", "llama3.2"),
                        "messages": [{"role": "user", "content": request.prompt}],
                        "stream": False,
                    },
                    timeout=300,
                )
                data = resp.json()
                return {
                    "success": True,
                    "result": data.get("message", {}).get("content", ""),
                }
            else:
                raise HTTPException(status_code=400, detail=f"Execute not supported for: {LLM_PROVIDER}")
    except httpx.RequestError as e:
        return JSONResponse(
            status_code=503,
            content={"success": False, "error": f"Provider unavailable: {e}"},
        )
@app.websocket("/chat")
 async def websocket_chat(websocket: WebSocket):
    """WebSocket chat endpoint with streaming proxy."""
    await websocket.accept()
    provider_url = get_provider_url()
    session_id = str(ULID())
    try:
        while True:
            data = await websocket.receive_json()
            if data.get("type") == "ping":
                await websocket.send_json({"type": "pong"})
                continue
            if data.get("type") != "message":
                continue
            content = data.get("content", "")
            session_id = data.get("session_id") or session_id
            # Send start
            await websocket.send_json({
                "type": "start",
                "session_id": session_id,
                "provider": LLM_PROVIDER,
            })
            try:
                async with httpx.AsyncClient() as client:
                    if LLM_PROVIDER == "claude":
                        # Claude skill (non-streaming for now)
                        resp = await client.post(
                            f"{provider_url}/chat",
                            json={"message": content, "session_id": session_id},
                            timeout=120,
                        )
                        result = resp.json()
                        if result.get("success"):
                            response_text = result.get("response", "")
                            # Send as single token (claude skill doesn't stream yet)
                            await websocket.send_json({"type": "token", "content": response_text})
                            await store_conversation(session_id, content, response_text)
                        else:
                            await websocket.send_json({"type": "error", "message": result.get("error", "Unknown error")})
                    elif LLM_PROVIDER == "ollama":
                        # Ollama streaming
                        async with client.stream(
                            "POST",
                            f"{provider_url}/api/chat",
                            json={
                                "model": os.environ.get("OLLAMA_MODEL", "llama3.2"),
                                "messages": [{"role": "user", "content": content}],
                                "stream": True,
                            },
                            timeout=300,
                        ) as resp:
                            full_response = ""
                            async for line in resp.aiter_lines():
                                if line:
                                    chunk = json.loads(line)
                                    if "message" in chunk and chunk["message"].get("content"):
                                        token = chunk["message"]["content"]
                                        full_response += token
                                        await websocket.send_json({"type": "token", "content": token})
                            await store_conversation(session_id, content, full_response)
                    else:
                        await websocket.send_json({"type": "error", "message": f"Unknown provider: {LLM_PROVIDER}"})
            except httpx.RequestError as e:
                await websocket.send_json({"type": "error", "message": f"Provider unavailable: {e}"})
            # Send end
            await websocket.send_json({"type": "end"})
    except WebSocketDisconnect:
        print("WebSocket disconnected")
    except Exception as e:
        print(f"WebSocket error: {e}")
        try:
            await websocket.send_json({"type": "error", "message": str(e)})
        except:
            pass
 # Script entry point (scripts/run.sh executes this file directly).
 if __name__ == "__main__":
    # Imported here so merely importing this module does not require uvicorn
    import uvicorn
    # 0.0.0.0 listens on every network interface.
    # NOTE(review): no authentication is visible in this file - confirm the
    # port is not reachable from untrusted networks.
    uvicorn.run(app, host="0.0.0.0", port=LLM_PORT)