# Local LLM Router for Air-Gapped Networks

Intelligent routing of AI coding queries to local LLMs, with Serena LSP integration, for secure, offline-capable development environments.

## Prerequisites (CRITICAL)

Before using this skill, ensure that:

1. The Serena MCP Server is installed and running (PRIMARY TOOL).
2. At least one local LLM service is running (Ollama, LM Studio, Jan, etc.).
```bash
# Install Serena (required)
pip install serena

# Or via uvx
uvx --from git+https://github.com/oraios/serena serena start-mcp-server

# Verify local LLM service
curl http://localhost:11434/api/version  # Ollama
curl http://localhost:1234/v1/models     # LM Studio
curl http://localhost:1337/v1/models     # Jan
```
# Quick Start
import httpx
import asyncio
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class TaskCategory(Enum):
    """Categories a query can be classified into."""

    CODING = "coding"
    REASONING = "reasoning"
    ANALYSIS = "analysis"
    DOCUMENTATION = "documentation"


@dataclass
class RouterConfig:
    """Local LLM Router configuration."""

    ollama_url: str = "http://localhost:11434"
    lmstudio_url: str = "http://localhost:1234"
    jan_url: str = "http://localhost:1337"
    serena_enabled: bool = True
    timeout: int = 30


async def quick_route(query: str, config: Optional[RouterConfig] = None):
    """Quick routing example - detects services and routes query.

    Args:
        query: The user query to route.
        config: Router configuration. When omitted, a fresh ``RouterConfig``
            is created for this call, so no mutable instance is shared
            between calls.

    Returns:
        Whatever ``execute_query`` returns for the selected model.

    Raises:
        RuntimeError: If service discovery finds no services.
    """
    # NOTE(review): discover_services, classify_task, select_model and
    # execute_query are not defined in this snippet; they are assumed to be
    # provided elsewhere - confirm their signatures.
    if config is None:
        config = RouterConfig()

    # 1. Detect available services
    services = await discover_services(config)
    if not services:
        raise RuntimeError("No local LLM services available")

    # 2. Classify task
    category = classify_task(query)

    # 3. Select best model for task
    model = select_model(category, services)

    # 4. Execute query
    return await execute_query(query, model, services[0])
# Example usage
async def main():
    """Route one sample query through ``quick_route`` and print the response."""
    response = await quick_route("Write a function to parse JSON safely")
    print(response)


# Guarded so that importing this snippet does not start an event loop.
if __name__ == "__main__":
    asyncio.run(main())
## Serena Integration (PRIMARY TOOL)

**CRITICAL:** Serena MCP MUST be invoked FIRST for all code-related tasks. This provides semantic understanding of the codebase before routing to an LLM.

### Why Serena First?

- **Token Efficiency:** Serena extracts only the relevant code context.
- **Accuracy:** Symbol-level operations instead of grep-style searches.
- **Codebase Awareness:** Understands types, references, and call hierarchies.
- **Edit Precision:** Applies changes at symbol level, not by string matching.
# Serena MCP Setup
import subprocess
import json
from typing import Any


class SerenaMCP:
    """Serena MCP client for code intelligence.

    Sends one JSON-RPC 2.0 request per line to the stdin of a
    ``serena start-mcp-server`` child process and reads one JSON line back
    from its stdout.
    """

    def __init__(self, workspace_root: str):
        """Remember the workspace root; the server is not started yet."""
        self.workspace = workspace_root
        self.process = None
        self._next_id = 0  # last JSON-RPC request id handed out

    async def start(self):
        """Start Serena MCP server."""
        # NOTE(review): stderr is piped but never read by this class; a very
        # chatty server could fill the pipe - consider draining or redirecting.
        self.process = subprocess.Popen(
            ["serena", "start-mcp-server", "--workspace", self.workspace],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

    async def call(self, method: str, params: dict) -> Any:
        """Call Serena MCP method.

        Args:
            method: JSON-RPC method name.
            params: JSON-serialisable parameters for the method.

        Returns:
            The ``result`` member of the reply when present, otherwise the
            decoded reply itself.

        Raises:
            RuntimeError: If the server was never started, closed its output
                stream, or answered with a JSON-RPC ``error`` member.
        """
        if self.process is None:
            raise RuntimeError(
                "Serena MCP server is not running; call start() first"
            )

        # A fresh id per request instead of a constant 1.
        self._next_id += 1
        request = {
            "jsonrpc": "2.0",
            "id": self._next_id,
            "method": method,
            "params": params,
        }

        # NOTE: the pipe I/O below is synchronous and blocks the event loop
        # until the server has replied.
        self.process.stdin.write(json.dumps(request).encode() + b"\n")
        self.process.stdin.flush()
        raw = self.process.stdout.readline()
        if not raw:
            # readline() returns b"" on EOF, i.e. the server went away.
            raise RuntimeError("Serena MCP server closed its output stream")

        reply = json.loads(raw)
        if isinstance(reply, dict):
            if reply.get("error") is not None:
                raise RuntimeError(
                    f"Serena MCP call {method!r} failed: {reply['error']}"
                )
            if "result" in reply:
                # Callers index the payload directly, so unwrap the envelope.
                return reply["result"]
        return reply

    async def find_symbol(self, name: str) -> dict:
        """Find symbol definition by name."""
        return await self.call("find_symbol", {"name": name})

    async def get_references(self, file: str, line: int, char: int) -> list:
        """Get all references to symbol at position."""
        return await self.call(
            "get_references",
            {"file": file, "line": line, "character": char},
        )

    async def get_hover_info(self, file: str, line: int, char: int) -> dict:
        """Get type/documentation info at position."""
        return await self.call(
            "get_hover_info",
            {"file": file, "line": line, "character": char},
        )

    async def get_diagnostics(self, file: str) -> list:
        """Get errors/warnings for file."""
        return await self.call("get_diagnostics", {"file": file})

    async def apply_edit(self, file: str, edits: list) -> bool:
        """Apply code edits to file."""
        return await self.call("apply_edit", {"file": file, "edits": edits})
# Serena tools by priority (always use higher priority first)
SERENA_TOOLS = {
    # Priority 1: Symbol-level operations (highest)
    "find_symbol": {"priority": 1, "use_for": ["navigation", "definition"]},
    "get_references": {"priority": 1, "use_for": ["refactoring", "impact analysis"]},
    "get_hover_info": {"priority": 1, "use_for": ["type info", "documentation"]},
    # Priority 2: Code navigation
    "go_to_definition": {"priority": 2, "use_for": ["navigation"]},
    "go_to_type_definition": {"priority": 2, "use_for": ["type navigation"]},
    "go_to_implementation": {"priority": 2, "use_for": ["interface impl"]},
    # Priority 3: Code understanding
    "get_document_symbols": {"priority": 3, "use_for": ["file structure"]},
    "get_workspace_symbols": {"priority": 3, "use_for": ["codebase search"]},
    "get_call_hierarchy": {"priority": 3, "use_for": ["call analysis"]},
    # Priority 4: Code modification
    "apply_edit": {"priority": 4, "use_for": ["editing"]},
    "rename_symbol": {"priority": 4, "use_for": ["refactoring"]},
    # Priority 5: Diagnostics
    "get_diagnostics": {"priority": 5, "use_for": ["errors", "warnings"]},
    "get_code_actions": {"priority": 5, "use_for": ["quick fixes"]},
}


# Serena-First Request Handler
async def handle_code_request(
    query: str,
    file_context: Optional[dict] = None,
    serena: SerenaMCP = None,
    router: "LLMRouter" = None,
):
    """
    Handle code request with Serena-first pattern.

    CRITICAL: Serena is ALWAYS invoked first for code tasks.

    Args:
        query: The user request.
        file_context: Optional dict; the keys read here are ``"file"`` and
            ``"position"`` (itself a dict with ``"line"`` and ``"character"``).
        serena: Serena client used to gather context and apply edits.
        router: Object providing ``select_model(category)`` and the awaitable
            ``execute(query, model)``.

    Returns:
        The response produced by ``router.execute``.

    Raises:
        ValueError: If ``router`` is not supplied.
    """
    if router is None:
        raise ValueError("router is required to select and run a model")

    # Step 1: Classify the task
    # classify_task may return a bare category or a result object exposing
    # ``.category``; normalise so the enum comparison below is meaningful.
    classification = classify_task(query)
    category = getattr(classification, "category", classification)

    # Step 2: ALWAYS use Serena for code context (if available)
    serena_context = {}
    file = file_context.get("file") if file_context else None
    if serena and file:
        # Gather semantic context from Serena
        position = file_context.get("position")
        if position:
            line = position["line"]
            char = position["character"]

            # Get hover info (type, docs)
            serena_context["hover"] = await serena.get_hover_info(file, line, char)

            # For refactoring/analysis, get references
            if category in (TaskCategory.ANALYSIS, TaskCategory.CODING):
                lowered = query.lower()
                if "refactor" in lowered or "rename" in lowered:
                    serena_context["references"] = await serena.get_references(
                        file, line, char
                    )

        # Always get diagnostics for the file
        serena_context["diagnostics"] = await serena.get_diagnostics(file)

    # Step 3: Build enriched prompt with Serena context
    enriched_query = build_enriched_query(query, serena_context)

    # Step 4: Select and route to appropriate LLM
    model = router.select_model(category)
    response = await router.execute(enriched_query, model)

    # Step 5: If response contains edits, apply via Serena
    # Requires a target file; without one there is nothing to edit.
    if serena and file and contains_code_edit(response):
        edits = parse_code_edits(response)
        await serena.apply_edit(file, edits)

    return response


def build_enriched_query(query: str, serena_context: dict) -> str:
    """Build query enriched with Serena context.

    Appends a section for each of the ``"hover"``, ``"references"`` and
    ``"diagnostics"`` entries that is present and non-empty, then joins all
    parts with newlines.
    """
    parts = [query]

    hover = serena_context.get("hover")
    if hover:
        # NOTE(review): the code fence around the hover text was lost in
        # extraction and is reconstructed here - confirm against the original.
        parts.append(f"\n## Type Information\n```\n{hover}\n```\n")

    refs = serena_context.get("references")
    if refs:
        parts.append(f"\n## References ({len(refs)} found)\n")
        for ref in refs[:10]:  # Limit to first 10
            parts.append(f"- {ref['file']}:{ref['line']}")

    diags = serena_context.get("diagnostics")
    if diags:
        parts.append(f"\n## Current Issues ({len(diags)})\n")
        for diag in diags[:5]:
            parts.append(f"- Line {diag['line']}: {diag['message']}")

    return "\n".join(parts)


# Service Discovery
#
# Supported Services
#
# | Service               | Default Endpoint | Health Check  | Models Endpoint | Chat Endpoint        | API Style |
# |-----------------------|------------------|---------------|-----------------|----------------------|-----------|
# | Ollama                | localhost:11434  | /api/version  | /api/tags       | /api/chat            | Native    |
# | LM Studio             | localhost:1234   | /v1/models    | /v1/models      | /v1/chat/completions | OpenAI    |
# | Jan                   | localhost:1337   | /v1/models    | /v1/models      | /v1/chat/completions | OpenAI    |
# | OpenWebUI             | localhost:3000   | /api/health   | /api/models     | /api/chat            | Custom    |
# | LocalAI               | localhost:8080   | /readyz       | /v1/models      | /v1/chat/completions | OpenAI    |
# | vLLM                  | localhost:8000   | /health       | /v1/models      | /v1/chat/completions | OpenAI    |
# | llama.cpp             | localhost:8080   | /health       | /v1/models      | /v1/chat/completions | OpenAI    |
# | Kobold.cpp            | localhost:5001   | /api/v1/info  | /api/v1/models  | /api/v1/generate     | Custom    |
# | GPT4All               | localhost:4891   | /v1/models    | /v1/models      | /v1/chat/completions | OpenAI    |
# | text-generation-webui | localhost:5000   | /api/v1/model | /api/v1/models  | /api/v1/chat         | Custom    |

# OS Detection
import sys
import os
import platform
from dataclasses import dataclass


@dataclass
class OSInfo:
    """Snapshot of the host platform as seen by ``detect_os``."""

    platform: str  # 'windows', 'linux', 'darwin'
    release: str
    arch: str  # raw platform.machine() value, e.g. 'x86_64', 'AMD64', 'arm64'
    is_wsl: bool
    is_container: bool


def detect_os() -> OSInfo:
    """Detect operating system and environment.

    Returns:
        An ``OSInfo`` with the normalised platform name, the kernel release
        and machine string from ``platform``, and WSL / container flags.
    """
    # Normalize platform name: anything that is not Windows or macOS is
    # treated as Linux.
    plat = sys.platform
    if plat == 'win32':
        plat = 'windows'
    elif plat != 'darwin':
        plat = 'linux'

    # WSL detection
    is_wsl = False
    if plat == 'linux':
        try:
            with open('/proc/version', 'r') as f:
                is_wsl = 'microsoft' in f.read().lower()
        except OSError:
            # Missing or unreadable /proc entry: fall back to the env check.
            pass
        is_wsl = is_wsl or os.environ.get('WSL_DISTRO_NAME') is not None

    # Container detection
    is_container = (
        os.path.exists('/.dockerenv')
        or os.environ.get('KUBERNETES_SERVICE_HOST') is not None
    )
    if not is_container and plat == 'linux':
        try:
            with open('/proc/1/cgroup', 'r') as f:
                # Read once: a second read() on the same handle returns ''
                # and would make the 'kubepods' check always fail.
                cgroup = f.read()
            is_container = 'docker' in cgroup or 'kubepods' in cgroup
        except OSError:
            pass

    return OSInfo(
        platform=plat,
        release=platform.release(),
        arch=platform.machine(),
        is_wsl=is_wsl,
        is_container=is_container,
    )


def adjust_endpoint_for_os(endpoint: str, os_info: OSInfo) -> str:
    """Adjust endpoint based on OS environment.

    Returns ``endpoint`` with ``localhost`` replaced by
    ``host.docker.internal`` when running under WSL or in a container, and
    unchanged otherwise.
    """
    if os_info.is_wsl or os_info.is_container:
        # In WSL/containers, localhost services are on the host
        # NOTE(review): assumes 'host.docker.internal' resolves in this
        # environment - confirm for plain WSL and Linux Docker setups.
        return endpoint.replace('localhost', 'host.docker.internal')
    return endpoint


# Service Discovery Implementation
import httpx
import asyncio
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional


@dataclass
class DiscoveredModel:
    """A model reported by a local LLM service."""

    id: str
    name: str
    size: int = 0
    family: Optional[str] = None
    context_length: int = 4096
    quantization: Optional[str] = None


@dataclass
class LLMService:
    """Connection details and last known state of one LLM service."""

    name: str
    type: str  # 'ollama', 'lmstudio', 'jan', 'openwebui', 'custom'
    endpoint: str
    status: str = 'unknown'  # 'online', 'offline', 'unknown'
    models: list = field(default_factory=list)
    last_checked: Optional[datetime] = None
    api_style: str = 'openai'  # 'openai', 'native'

    # Endpoint paths
    health_path: str = '/v1/models'
    models_path: str = '/v1/models'
    chat_path: str = '/v1/chat/completions'
# Default service configurations
SERVICE_DEFAULTS = {
    'ollama': LLMService(
        name='Ollama', type='ollama', endpoint='http://localhost:11434',
        health_path='/api/version', models_path='/api/tags',
        chat_path='/api/chat', api_style='native',
    ),
    'lmstudio': LLMService(
        name='LM Studio', type='lmstudio', endpoint='http://localhost:1234',
        health_path='/v1/models', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai',
    ),
    'jan': LLMService(
        name='Jan', type='jan', endpoint='http://localhost:1337',
        health_path='/v1/models', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai',
    ),
    'openwebui': LLMService(
        name='Open WebUI', type='openwebui', endpoint='http://localhost:3000',
        health_path='/api/health', models_path='/api/models',
        chat_path='/api/chat', api_style='custom',
    ),
    'localai': LLMService(
        name='LocalAI', type='localai', endpoint='http://localhost:8080',
        health_path='/readyz', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai',
    ),
    'vllm': LLMService(
        name='vLLM', type='vllm', endpoint='http://localhost:8000',
        health_path='/health', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai',
    ),
    'llamacpp': LLMService(
        name='llama.cpp', type='llamacpp', endpoint='http://localhost:8080',
        health_path='/health', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai',
    ),
    'koboldcpp': LLMService(
        name='Kobold.cpp', type='koboldcpp', endpoint='http://localhost:5001',
        health_path='/api/v1/info', models_path='/api/v1/model',
        chat_path='/api/v1/generate', api_style='custom',
    ),
    'gpt4all': LLMService(
        name='GPT4All', type='gpt4all', endpoint='http://localhost:4891',
        health_path='/v1/models', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai',
    ),
}


class ServiceDiscovery:
    """Discover and monitor local LLM services.

    Owns an ``httpx.AsyncClient``; call ``aclose()`` (or use the instance as
    an ``async with`` context manager) to release it when done.
    """

    def __init__(self, custom_endpoints: list = None):
        """Set up discovery state.

        Args:
            custom_endpoints: Optional list of dicts. ``'endpoint'`` is
                required; ``'name'``, ``'health_path'``, ``'models_path'``,
                ``'chat_path'`` and ``'api_style'`` are optional.
        """
        self.services: dict[str, LLMService] = {}
        self.os_info = detect_os()
        self.custom_endpoints = custom_endpoints or []
        self._client = httpx.AsyncClient(timeout=5.0)

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc_info):
        await self.aclose()

    async def discover_all(self) -> list[LLMService]:
        """Discover all available LLM services.

        Probes every default and custom service concurrently.

        Returns:
            The services whose health check answered with HTTP 200.
        """
        discovered = []
        tasks = []

        # Check default services
        for default in SERVICE_DEFAULTS.values():
            service = LLMService(
                name=default.name,
                type=default.type,
                endpoint=adjust_endpoint_for_os(default.endpoint, self.os_info),
                health_path=default.health_path,
                models_path=default.models_path,
                chat_path=default.chat_path,
                api_style=default.api_style,
            )
            tasks.append(self._check_service(service))

        # Check custom endpoints
        for custom in self.custom_endpoints:
            service = LLMService(
                name=custom.get('name', 'Custom'),
                type='custom',
                endpoint=custom['endpoint'],
                health_path=custom.get('health_path', '/v1/models'),
                models_path=custom.get('models_path', '/v1/models'),
                chat_path=custom.get('chat_path', '/v1/chat/completions'),
                api_style=custom.get('api_style', 'openai'),
            )
            tasks.append(self._check_service(service))

        results = await asyncio.gather(*tasks, return_exceptions=True)

        for result in results:
            if isinstance(result, LLMService) and result.status == 'online':
                discovered.append(result)
                # NOTE(review): keyed by type, so several online custom
                # endpoints (all type 'custom') overwrite one another here;
                # the returned list still contains every one of them.
                self.services[result.type] = result

        return discovered

    async def _check_service(self, service: LLMService) -> LLMService:
        """Check if service is online and discover models.

        Mutates and returns ``service``: status becomes ``'online'`` on an
        HTTP 200 health check (models are then fetched), else ``'offline'``.
        """
        try:
            # Health check
            response = await self._client.get(
                f"{service.endpoint}{service.health_path}"
            )
        except httpx.HTTPError:
            # Any transport-level failure (refused, timeout, reset, ...)
            # means the service is unreachable, not that discovery broke.
            service.status = 'offline'
            return service

        if response.status_code == 200:
            service.status = 'online'
            service.last_checked = datetime.now()
            # Discover models
            service.models = await self._discover_models(service)
        else:
            service.status = 'offline'
        return service

    async def _discover_models(self, service: LLMService) -> list[DiscoveredModel]:
        """Discover available models on service.

        Returns an empty list when the request fails or the payload does not
        have the expected shape.
        """
        try:
            response = await self._client.get(
                f"{service.endpoint}{service.models_path}"
            )
            data = response.json()

            # Parse based on service type
            if service.type == 'ollama':
                return [
                    DiscoveredModel(
                        id=m['name'],
                        name=m['name'],
                        size=m.get('size', 0),
                        family=m.get('details', {}).get('family'),
                        context_length=self._infer_context_length(m['name']),
                    )
                    for m in data.get('models', [])
                ]

            # OpenAI-style
            return [
                DiscoveredModel(
                    id=m['id'],
                    name=m['id'],
                    context_length=m.get('context_length', 4096),
                )
                for m in data.get('data', [])
            ]
        except (httpx.HTTPError, ValueError, KeyError, TypeError, AttributeError):
            # Best effort: network failure, non-JSON body or unexpected schema.
            return []

    def _infer_context_length(self, model_name: str) -> int:
        """Infer context length from model name."""
        name_lower = model_name.lower()

        # Check for explicit context markers
        if '128k' in name_lower or '131k' in name_lower:
            return 131072
        if '64k' in name_lower:
            return 65536
        if '32k' in name_lower:
            return 32768
        if '16k' in name_lower:
            return 16384

        # Model family defaults
        if 'qwen' in name_lower:
            return 131072  # Qwen models typically have 128K+
        if 'deepseek' in name_lower:
            return 128000
        # Must precede the 'llama-3' test: "codellama-34b" contains "llama-3".
        if 'codellama' in name_lower:
            return 100000
        if 'llama-3' in name_lower or 'llama3' in name_lower:
            return 128000
        if 'mixtral' in name_lower:
            return 65536

        return 8192  # Safe default
# Task Classification
#
# Classification System
import re
from enum import Enum
from dataclasses import dataclass


class TaskCategory(Enum):
    """Categories a query can be classified into."""

    CODING = "coding"
    REASONING = "reasoning"
    ANALYSIS = "analysis"
    DOCUMENTATION = "documentation"


@dataclass
class ClassificationResult:
    """Outcome of ``classify_task`` for one query."""

    category: TaskCategory
    confidence: float  # 0.0 - 1.0
    requires_serena: bool
    keywords_matched: list[str]


# Task patterns (regex)
TASK_PATTERNS = {
    TaskCategory.CODING: [
        r"(?:write|create|implement|code|generate)\s+(?:a\s+)?(?:function|class|method|component)",
        r"(?:fix|debug|solve)\s+(?:this|the)\s+(?:bug|error|issue)",
        r"refactor\s+(?:this|the)",
        r"add\s+(?:error\s+handling|validation|logging|tests?)",
        r"complete\s+(?:this|the)\s+code",
        r"(?:convert|translate)\s+(?:this|the)\s+code",
        r"(?:optimize|improve)\s+(?:this|the)\s+(?:function|code|performance)",
    ],
    TaskCategory.REASONING: [
        r"(?:design|architect|plan)\s+(?:a|the)\s+(?:system|architecture|solution)",
        r"how\s+should\s+(?:I|we)\s+(?:approach|structure|implement)",
        r"what\s+(?:is|would\s+be)\s+the\s+best\s+(?:way|approach|pattern)",
        r"explain\s+the\s+(?:logic|reasoning|algorithm)",
        r"compare\s+(?:and\s+contrast|between)",
        r"(?:recommend|suggest)\s+(?:an?\s+)?(?:approach|solution|pattern)",
        r"trade-?offs?\s+(?:between|of)",
    ],
    TaskCategory.ANALYSIS: [
        r"(?:review|analyze|audit)\s+(?:this|the)\s+code",
        r"find\s+(?:potential\s+)?(?:issues|vulnerabilities|bugs|problems)",
        r"(?:security|performance)\s+(?:review|analysis|audit)",
        r"what\s+(?:could|might)\s+go\s+wrong",
        r"identify\s+(?:problems|improvements|issues)",
        r"(?:check|scan)\s+for\s+(?:vulnerabilities|issues)",
    ],
    TaskCategory.DOCUMENTATION: [
        r"(?:write|create|generate)\s+(?:documentation|docs|docstring)",
        r"(?:add|write)\s+(?:comments|jsdoc|docstring|type\s+hints)",
        r"(?:document|explain)\s+(?:this|the)\s+(?:code|function|api)",
        r"(?:create|write)\s+(?:a\s+)?readme",
        r"(?:generate|write)\s+(?:api\s+)?documentation",
        r"describe\s+(?:what|how)\s+(?:this|the)",
    ],
}

# Keyword weights for scoring
KEYWORD_WEIGHTS = {
    # Coding
    "function": (TaskCategory.CODING, 0.3),
    "implement": (TaskCategory.CODING, 0.4),
    "code": (TaskCategory.CODING, 0.2),
    "debug": (TaskCategory.CODING, 0.5),
    "refactor": (TaskCategory.CODING, 0.6),
    "fix": (TaskCategory.CODING, 0.4),
    "test": (TaskCategory.CODING, 0.3),
    "bug": (TaskCategory.CODING, 0.5),
    # Reasoning
    "architecture": (TaskCategory.REASONING, 0.6),
    "design": (TaskCategory.REASONING, 0.4),
    "approach": (TaskCategory.REASONING, 0.3),
    "strategy": (TaskCategory.REASONING, 0.5),
    "tradeoff": (TaskCategory.REASONING, 0.5),
    "compare": (TaskCategory.REASONING, 0.4),
    "recommend": (TaskCategory.REASONING, 0.4),
    # Analysis
    "review": (TaskCategory.ANALYSIS, 0.5),
    "analyze": (TaskCategory.ANALYSIS, 0.6),
    "security": (TaskCategory.ANALYSIS, 0.4),
    "vulnerability": (TaskCategory.ANALYSIS, 0.7),
    "performance": (TaskCategory.ANALYSIS, 0.3),
    "audit": (TaskCategory.ANALYSIS, 0.6),
    # Documentation
    "document": (TaskCategory.DOCUMENTATION, 0.6),
    "readme": (TaskCategory.DOCUMENTATION, 0.8),
    "docstring": (TaskCategory.DOCUMENTATION, 0.8),
    "comment": (TaskCategory.DOCUMENTATION, 0.4),
    "explain": (TaskCategory.DOCUMENTATION, 0.3),
}

# Substrings that mark a query as needing Serena's symbol navigation.
_SERENA_KEYWORDS = (
    'definition', 'reference', 'symbol', 'rename',
    'where is', 'find all', 'go to', 'jump to',
)


def classify_task(query: str) -> ClassificationResult:
    """Classify a query into a task category.

    Each regex in ``TASK_PATTERNS`` that matches adds 0.5 to its category;
    each word found in ``KEYWORD_WEIGHTS`` adds half its weight. The highest
    scoring category wins, with its score capped at 1.0 as the confidence.
    Below a confidence of 0.2 the result falls back to CODING with
    confidence 0.5.

    Args:
        query: Free-text user query; matching is case-insensitive.

    Returns:
        A ``ClassificationResult``. ``requires_serena`` is True for ANALYSIS
        or when the query contains one of the navigation keywords.
    """
    query_lower = query.lower()
    scores = {cat: 0.0 for cat in TaskCategory}
    matched_keywords = []

    # Pattern matching (weight: 0.5)
    for category, patterns in TASK_PATTERNS.items():
        for pattern in patterns:
            if re.search(pattern, query_lower):
                scores[category] += 0.5

    # Keyword scoring (weight: 0.5)
    for word in re.findall(r'\w+', query_lower):
        if word in KEYWORD_WEIGHTS:
            category, weight = KEYWORD_WEIGHTS[word]
            scores[category] += weight * 0.5
            matched_keywords.append(word)

    # Find highest scoring category (ties go to the earliest enum member)
    best_category = max(scores, key=scores.get)
    confidence = min(scores[best_category], 1.0)

    # Default to CODING if no clear match
    if confidence < 0.2:
        best_category = TaskCategory.CODING
        confidence = 0.5

    # Determine if Serena is required
    requires_serena = (
        best_category == TaskCategory.ANALYSIS
        or any(kw in query_lower for kw in _SERENA_KEYWORDS)
    )

    return ClassificationResult(
        category=best_category,
        confidence=confidence,
        requires_serena=requires_serena,
        keywords_matched=matched_keywords,
    )


# Model Selection
#
# Model Capability Matrix
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelCapability:
    """Static description of one model's capabilities and footprint."""

    id: str
    family: str
    context_window: int
    vram_gb: float
    categories: list[TaskCategory]
    performance_scores: dict[TaskCategory, int]  # 0-100
    tier: int  # 1=best, 2=good, 3=basic
    quantization: Optional[str] = None
def _scores(coding: int, reasoning: int, analysis: int, documentation: int) -> dict:
    """Build a per-category performance score mapping (0-100 each)."""
    return {
        TaskCategory.CODING: coding,
        TaskCategory.REASONING: reasoning,
        TaskCategory.ANALYSIS: analysis,
        TaskCategory.DOCUMENTATION: documentation,
    }


# Model database (34 models) - Updated January 2025
MODEL_DATABASE: dict[str, ModelCapability] = {
    # === CODING SPECIALISTS (Tier 1) ===
    "deepseek-v3": ModelCapability(
        id="deepseek-v3", family="deepseek", context_window=128000,
        vram_gb=48,  # MoE: 685B total, 37B active
        categories=[TaskCategory.CODING, TaskCategory.REASONING, TaskCategory.ANALYSIS],
        performance_scores=_scores(99, 97, 96, 92), tier=1,
    ),
    "qwen2.5-coder-32b": ModelCapability(
        id="qwen2.5-coder-32b", family="qwen", context_window=131072, vram_gb=22,
        categories=[TaskCategory.CODING, TaskCategory.ANALYSIS],
        performance_scores=_scores(96, 82, 92, 88), tier=1,
    ),
    "deepseek-coder-v2": ModelCapability(
        id="deepseek-coder-v2", family="deepseek", context_window=128000,
        vram_gb=48,  # MoE: 236B total, 21B active
        categories=[TaskCategory.CODING, TaskCategory.ANALYSIS, TaskCategory.REASONING],
        performance_scores=_scores(95, 88, 92, 80), tier=1,
    ),
    "codellama-70b": ModelCapability(
        id="codellama-70b", family="llama", context_window=100000, vram_gb=40,
        categories=[TaskCategory.CODING],
        performance_scores=_scores(90, 70, 85, 75), tier=1,
    ),
    "codellama-34b": ModelCapability(
        id="codellama-34b", family="llama", context_window=100000, vram_gb=20,
        categories=[TaskCategory.CODING],
        performance_scores=_scores(85, 65, 80, 70), tier=2,
    ),
    "qwen2.5-coder-14b": ModelCapability(
        id="qwen2.5-coder-14b", family="qwen", context_window=131072, vram_gb=10,
        categories=[TaskCategory.CODING],
        performance_scores=_scores(82, 60, 75, 70), tier=2,
    ),
    "starcoder2-15b": ModelCapability(
        id="starcoder2-15b", family="starcoder", context_window=16384, vram_gb=10,
        categories=[TaskCategory.CODING],
        performance_scores=_scores(80, 50, 70, 60), tier=2,
    ),
    "deepseek-coder-6.7b": ModelCapability(
        id="deepseek-coder-6.7b", family="deepseek", context_window=16384, vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores=_scores(75, 50, 65, 55), tier=3,
    ),
    "codellama-7b": ModelCapability(
        id="codellama-7b", family="llama", context_window=16384, vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores=_scores(70, 45, 60, 50), tier=3,
    ),
    # === REASONING SPECIALISTS ===
    "deepseek-r1": ModelCapability(
        id="deepseek-r1", family="deepseek", context_window=128000,
        vram_gb=160,  # 671B total
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores=_scores(92, 99, 95, 90), tier=1,
    ),
    "deepseek-r1-distill-70b": ModelCapability(
        id="deepseek-r1-distill-70b", family="deepseek", context_window=128000, vram_gb=42,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores=_scores(88, 94, 90, 86), tier=1,
    ),
    "qwen2.5-72b-instruct": ModelCapability(
        id="qwen2.5-72b-instruct", family="qwen", context_window=131072, vram_gb=48,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores=_scores(88, 95, 92, 94), tier=1,
    ),
    "llama-3.3-70b-instruct": ModelCapability(
        id="llama-3.3-70b-instruct", family="llama", context_window=128000, vram_gb=42,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores=_scores(85, 92, 88, 90), tier=1,
    ),
    "deepseek-r1-distill-32b": ModelCapability(
        id="deepseek-r1-distill-32b", family="deepseek", context_window=128000, vram_gb=22,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores=_scores(82, 90, 85, 82), tier=2,
    ),
    "mistral-small-24b": ModelCapability(
        id="mistral-small-24b", family="mistral", context_window=32768, vram_gb=16,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores=_scores(80, 85, 82, 84), tier=2,
    ),
    "qwen2.5-32b-instruct": ModelCapability(
        id="qwen2.5-32b-instruct", family="qwen", context_window=131072, vram_gb=22,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores=_scores(78, 86, 82, 88), tier=2,
    ),
    "phi-4": ModelCapability(
        id="phi-4", family="phi", context_window=16384, vram_gb=10,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores=_scores(82, 88, 80, 78), tier=2,
    ),
    "deepseek-r1-distill-14b": ModelCapability(
        id="deepseek-r1-distill-14b", family="deepseek", context_window=128000, vram_gb=10,
        categories=[TaskCategory.REASONING],
        performance_scores=_scores(75, 85, 78, 76), tier=2,
    ),
    "llama-3.2-11b-vision": ModelCapability(
        id="llama-3.2-11b-vision", family="llama", context_window=128000, vram_gb=8,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores=_scores(68, 78, 75, 80), tier=2,
    ),
    # NOTE(review): "gemma-2-27b" used to be defined twice (here and under
    # ADDITIONAL MODELS). In a dict literal the later value silently wins
    # while the key keeps this position, so the effective (later) values are
    # kept here. The shadowed entry had categories [REASONING, CODING] and
    # scores 78/82/78/80 - confirm which set is intended.
    "gemma-2-27b": ModelCapability(
        id="gemma-2-27b", family="gemma", context_window=8192, vram_gb=18,
        categories=[TaskCategory.CODING, TaskCategory.REASONING],
        performance_scores=_scores(78, 80, 75, 78), tier=2,
    ),
    "deepseek-r1-distill-8b": ModelCapability(
        id="deepseek-r1-distill-8b", family="deepseek", context_window=128000, vram_gb=6,
        categories=[TaskCategory.REASONING],
        performance_scores=_scores(68, 78, 70, 68), tier=3,
    ),
    "gemma-2-9b": ModelCapability(
        id="gemma-2-9b", family="gemma", context_window=8192, vram_gb=7,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores=_scores(72, 75, 70, 74), tier=3,
    ),
    "llama-3.2-3b": ModelCapability(
        id="llama-3.2-3b", family="llama", context_window=128000, vram_gb=3,
        categories=[TaskCategory.REASONING],
        performance_scores=_scores(55, 65, 58, 65), tier=3,
    ),
    # === ANALYSIS SPECIALISTS (Serena Required) ===
    "codellama-34b-instruct": ModelCapability(
        id="codellama-34b-instruct", family="llama", context_window=100000, vram_gb=20,
        categories=[TaskCategory.ANALYSIS],
        performance_scores=_scores(80, 70, 88, 75), tier=2,
    ),
    # === DOCUMENTATION SPECIALISTS ===
    "mistral-nemo-12b": ModelCapability(
        id="mistral-nemo-12b", family="mistral", context_window=128000, vram_gb=8,
        categories=[TaskCategory.DOCUMENTATION],
        performance_scores=_scores(65, 70, 65, 82), tier=2,
    ),
    "mistral-7b": ModelCapability(
        id="mistral-7b", family="mistral", context_window=32768, vram_gb=5,
        categories=[TaskCategory.DOCUMENTATION],
        performance_scores=_scores(55, 60, 55, 72), tier=3,
    ),
    # === ADDITIONAL MODELS ===
    "phi-3-medium": ModelCapability(
        id="phi-3-medium", family="phi", context_window=128000, vram_gb=8,
        categories=[TaskCategory.CODING, TaskCategory.REASONING],
        performance_scores=_scores(72, 75, 68, 70), tier=2,
    ),
    "yi-34b": ModelCapability(
        id="yi-34b", family="yi", context_window=200000, vram_gb=20,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores=_scores(72, 82, 75, 80), tier=2,
    ),
    "command-r-plus": ModelCapability(
        id="command-r-plus", family="cohere", context_window=128000, vram_gb=48,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores=_scores(70, 85, 78, 88), tier=1,
    ),
    "wizardcoder-33b": ModelCapability(
        id="wizardcoder-33b", family="wizard", context_window=16384, vram_gb=20,
        categories=[TaskCategory.CODING],
        performance_scores=_scores(85, 60, 75, 65), tier=2,
    ),
    "magicoder-7b": ModelCapability(
        id="magicoder-7b", family="magicoder", context_window=16384, vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores=_scores(78, 50, 65, 55), tier=3,
    ),
    "dolphin-mixtral-8x7b": ModelCapability(
        id="dolphin-mixtral-8x7b", family="dolphin", context_window=32768, vram_gb=28,
        categories=[TaskCategory.CODING, TaskCategory.REASONING],
        performance_scores=_scores(75, 78, 72, 75), tier=2,
    ),
    "nous-hermes-2-mixtral": ModelCapability(
        id="nous-hermes-2-mixtral", family="nous", context_window=32768, vram_gb=28,
        categories=[TaskCategory.REASONING],
        performance_scores=_scores(72, 82, 75, 78), tier=2,
    ),
    "solar-10.7b": ModelCapability(
        id="solar-10.7b", family="solar", context_window=4096, vram_gb=7,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores=_scores(60, 72, 65, 75), tier=3,
    ),
}
# Task-to-model priority mapping (Updated January 2025).
# Each list is ordered best-first; ModelSelector.select() walks it in order and
# returns the first entry that is available and satisfies the constraints.
TASK_MODEL_PRIORITY = {
    TaskCategory.CODING: [
        # Tier 1 - Best
        "deepseek-v3",
        "qwen2.5-coder-32b",
        "deepseek-coder-v2",
        # Tier 2 - Good
        "codellama-70b",
        "qwen2.5-coder-14b",
        "codellama-34b",
        "starcoder2-15b",
        "phi-4",
        # Tier 3 - Basic
        "qwen2.5-coder-7b",
        "codellama-7b",
        "deepseek-coder-6.7b",
    ],
    TaskCategory.REASONING: [
        # Tier 1 - Best
        "deepseek-r1",
        "deepseek-v3",
        "deepseek-r1-distill-70b",
        "qwen2.5-72b-instruct",
        "llama-3.3-70b-instruct",
        # Tier 2 - Good
        "deepseek-r1-distill-32b",
        "mistral-small-24b",
        "qwen2.5-32b-instruct",
        "phi-4",
        "gemma-2-27b",
        # Tier 3 - Basic
        "deepseek-r1-distill-14b",
        "deepseek-r1-distill-8b",
        "gemma-2-9b",
    ],
    TaskCategory.ANALYSIS: [
        # Requires Serena LSP
        "deepseek-v3",
        "qwen2.5-coder-32b",
        "deepseek-coder-v2",
        "codellama-34b-instruct",
        "qwen2.5-72b-instruct",
    ],
    TaskCategory.DOCUMENTATION: [
        "qwen2.5-72b-instruct",
        "llama-3.3-70b-instruct",
        "qwen2.5-32b-instruct",
        "mistral-small-24b",
        "mistral-nemo-12b",
        "gemma-2-27b",
    ],
}


# --- Model Selection Logic ---
from typing import Optional


class ModelSelector:
    """Select optimal model for task based on availability and requirements."""

    def __init__(self, available_models: list[str]):
        """Remember the model names reported by the running services.

        Names are lower-cased for case-insensitive matching. Blank names are
        dropped: an empty string is a substring of every model id and would
        make the fuzzy match in ``_is_available`` accept everything.
        """
        self.available = {
            name.strip().lower()
            for name in available_models
            if name and name.strip()
        }

    def select(
        self,
        category: TaskCategory,
        required_context: int = 0,
        max_vram_gb: Optional[float] = None,
    ) -> Optional[str]:
        """Select best available model for task category.

        Args:
            category: Task category used to look up the priority list.
            required_context: Minimum context window in tokens; 0 disables
                the check.
            max_vram_gb: VRAM budget in GB; ``None`` disables the check.

        Returns:
            The first model in the category's priority list that is
            available, present in MODEL_DATABASE and within the limits.
            If none qualifies, any available model from MODEL_DATABASE
            (constraints are not re-checked), or ``None``.
        """
        for model_id in TASK_MODEL_PRIORITY.get(category, []):
            if not self._is_available(model_id):
                continue

            capability = MODEL_DATABASE.get(model_id)
            if not capability:
                continue

            # Context window requirement
            if required_context > 0 and capability.context_window < required_context:
                continue

            # VRAM constraint. Compare against None so an explicit budget of
            # 0 is honoured instead of being read as "unlimited".
            if max_vram_gb is not None and capability.vram_gb > max_vram_gb:
                continue

            return model_id

        # Fallback: return any available model
        for model_id in MODEL_DATABASE:
            if self._is_available(model_id):
                return model_id
        return None

    def _is_available(self, model_id: str) -> bool:
        """Check if model is available (fuzzy matching).

        A model counts as available on an exact name match, or when either
        name is a substring of the other.
        """
        model_lower = model_id.lower()
        if model_lower in self.available:
            return True
        return any(
            model_lower in avail or avail in model_lower
            for avail in self.available
        )

    def get_fallback_models(self, category: TaskCategory) -> list[str]:
        """Get list of fallback models for category.

        Returns the available models from the category's priority list whose
        MODEL_DATABASE entry has ``tier >= 2``, in priority order.
        """
        fallbacks = []
        for model_id in TASK_MODEL_PRIORITY.get(category, []):
            if not self._is_available(model_id):
                continue
            capability = MODEL_DATABASE.get(model_id)
            if capability and capability.tier >= 2:
                fallbacks.append(model_id)
        return fallbacks


# --- Context Management: Token Counting ---
from abc import ABC, abstractmethod
import re


class TokenCounter(ABC):
    """Base class for token counting."""

    @abstractmethod
    def count(self, text: str) -> int:
        """Return the (estimated) number of tokens in *text*."""


class EstimationCounter(TokenCounter):
    """Estimation-based token counter (no external dependencies)."""

    def __init__(self, chars_per_token: float = 4.0):
        """Store the characters-per-token ratio used by ``count``.

        Raises:
            ValueError: If *chars_per_token* is not positive; failing here
                avoids a ZeroDivisionError on the first ``count`` call.
        """
        if chars_per_token <= 0:
            raise ValueError("chars_per_token must be positive")
        self.chars_per_token = chars_per_token

    def count(self, text: str) -> int:
        """Return ``len(text) / chars_per_token`` truncated to an int."""
        return int(len(text) / self.chars_per_token)


class QwenCounter(TokenCounter):
    """Token counter for Qwen models."""

    def count(self, text: str) -> int:
        """Estimate tokens at 3.5 characters per token."""
        # Qwen uses slightly different tokenization
        return int(len(text) / 3.5)


class LlamaCounter(TokenCounter):
    """Token counter for Llama models."""

    def count(self, text: str) -> int:
        """Estimate tokens at 3.8 characters per token."""
        # Llama uses SentencePiece
        return int(len(text) / 3.8)
# Model family to counter mapping
TOKEN_COUNTERS = {
    "qwen": QwenCounter(),
    "deepseek": EstimationCounter(4.0),
    "llama": LlamaCounter(),
    "mistral": EstimationCounter(4.0),
    "mixtral": EstimationCounter(4.0),
    "default": EstimationCounter(4.0),
}


def get_token_counter(model_id: str) -> TokenCounter:
    """Get appropriate token counter for model.

    Looks the model up in MODEL_DATABASE and picks the counter registered
    for its family; unknown models and families use the "default" counter.
    """
    capability = MODEL_DATABASE.get(model_id)
    if capability:
        return TOKEN_COUNTERS.get(capability.family, TOKEN_COUNTERS["default"])
    return TOKEN_COUNTERS["default"]


# --- Context Manager ---
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional


@dataclass
class Message:
    """One conversation message plus its bookkeeping."""

    role: str  # 'system', 'user', 'assistant', 'tool'
    content: str
    timestamp: datetime = field(default_factory=datetime.now)
    token_count: int = 0
    metadata: dict = field(default_factory=dict)


@dataclass
class ConversationContext:
    """Mutable state of one conversation session."""

    session_id: str
    messages: list[Message] = field(default_factory=list)
    total_tokens: int = 0
    system_prompt: str = ""
    system_prompt_tokens: int = 0
    active_model: str = ""
    model_history: list[str] = field(default_factory=list)
    compaction_count: int = 0


class ContextManager:
    """Manage conversation context with compaction support."""

    def __init__(
        self,
        session_id: str,
        system_prompt: str = "",
        compaction_threshold: float = 0.8,  # 80% of context window
        compaction_target: float = 0.5,  # Compact to 50%
        preserve_recent: int = 10,  # Keep last N messages
        max_tool_output_tokens: int = 500,  # Larger tool outputs get truncated
    ):
        """Create an empty context for *session_id*.

        No token counter is active until ``set_model`` is called; messages
        added before that are stored with ``token_count == 0`` and recounted
        once a model is set.
        """
        self.context = ConversationContext(
            session_id=session_id,
            system_prompt=system_prompt,
        )
        self.compaction_threshold = compaction_threshold
        self.compaction_target = compaction_target
        self.preserve_recent = preserve_recent
        self.max_tool_output_tokens = max_tool_output_tokens
        self._counter: Optional[TokenCounter] = None

    def set_model(self, model_id: str):
        """Set active model and update token counter."""
        if self.context.active_model:
            self.context.model_history.append(self.context.active_model)
        self.context.active_model = model_id
        self._counter = get_token_counter(model_id)
        # Recount all tokens with new counter
        self._recount_tokens()

    def add_message(self, role: str, content: str, metadata: Optional[dict] = None):
        """Add message to context and add its tokens to the running total."""
        token_count = self._counter.count(content) if self._counter else 0
        self.context.messages.append(
            Message(
                role=role,
                content=content,
                token_count=token_count,
                metadata=metadata or {},
            )
        )
        self.context.total_tokens += token_count

    def check_and_compact(self, max_tokens: int) -> bool:
        """Check if compaction needed and perform if so.

        Returns:
            True if the total exceeded ``max_tokens * compaction_threshold``
            and ``_compact`` was run, False otherwise.
        """
        threshold = int(max_tokens * self.compaction_threshold)
        if self.context.total_tokens > threshold:
            self._compact(max_tokens)
            return True
        return False

    def _compact(self, max_tokens: int):
        """Compact context to target size.

        Step 1 replaces oversized tool outputs with a short placeholder.
        If the total is still above ``max_tokens * compaction_target``,
        step 2 replaces every message except the last ``preserve_recent``
        with a single summary message.
        """
        # _compact may be called before set_model(); fall back to the
        # default estimator rather than failing on a missing counter.
        counter = self._counter or TOKEN_COUNTERS["default"]
        target = int(max_tokens * self.compaction_target)

        # Step 1: Truncate large tool outputs
        for msg in self.context.messages:
            if msg.role == 'tool' and msg.token_count > self.max_tool_output_tokens:
                original = msg.token_count
                msg.content = (
                    f"[Tool output truncated - {msg.metadata.get('tool_name', 'unknown')}]"
                )
                msg.token_count = counter.count(msg.content)
                msg.metadata['truncated'] = True
                msg.metadata['original_tokens'] = original

        self._recalculate_total()
        if self.context.total_tokens <= target:
            return

        # Step 2: Summarize older messages.
        # Split by an explicit index: with preserve_recent == 0 the slices
        # [:-0] / [-0:] would keep every message and only prepend a summary.
        messages = self.context.messages
        split = len(messages) - max(self.preserve_recent, 0)
        if split > 0:
            older, recent = messages[:split], messages[split:]
            summary = self._create_summary(older)
            content = f"[Previous conversation summary]\n{summary}"
            summary_msg = Message(
                role='system',
                content=content,
                # Count the stored content (prefix included) so the figure
                # matches what _recount_tokens() would compute.
                token_count=counter.count(content),
                metadata={'compacted': True},
            )
            self.context.messages = [summary_msg] + recent
            self.context.compaction_count += 1
            self._recalculate_total()

    def _create_summary(self, messages: list[Message]) -> str:
        """Create summary of messages (simple implementation).

        Collects at most 10 bullet points: the first sentence (up to 100
        characters) of each user message, and of assistant messages that
        mention 'created' or 'implemented'.
        """
        # In production, this would use a lightweight LLM
        key_points = []
        for msg in messages:
            if len(key_points) >= 10:
                break  # only the first 10 points are ever returned
            if msg.role == 'user':
                # Extract first sentence of user queries
                first_sentence = msg.content.split('.')[0][:100]
                key_points.append(f"- User asked: {first_sentence}")
            elif msg.role == 'assistant':
                # Extract key decisions/results
                lowered = msg.content.lower()
                if 'created' in lowered or 'implemented' in lowered:
                    first_sentence = msg.content.split('.')[0][:100]
                    key_points.append(f"- Assistant: {first_sentence}")
        return "\n".join(key_points[:10])

    def _recount_tokens(self):
        """Recount all tokens with current counter (no-op without one)."""
        if not self._counter:
            return
        self.context.system_prompt_tokens = self._counter.count(
            self.context.system_prompt
        )
        for msg in self.context.messages:
            msg.token_count = self._counter.count(msg.content)
        self._recalculate_total()

    def _recalculate_total(self):
        """Recalculate total token count (system prompt + all messages)."""
        self.context.total_tokens = self.context.system_prompt_tokens + sum(
            m.token_count for m in self.context.messages
        )

    def export_for_api(self) -> list[dict]:
        """Export messages in API format.

        Returns a list of ``{"role", "content"}`` dicts, starting with the
        system prompt when one is set.
        """
        messages = []
        if self.context.system_prompt:
            messages.append(
                {"role": "system", "content": self.context.system_prompt}
            )
        messages.extend(
            {"role": msg.role, "content": msg.content}
            for msg in self.context.messages
        )
        return messages

    def prepare_handoff(self, new_model: str) -> "ContextManager":
        """Prepare context for model switch; returns self for chaining."""
        self.set_model(new_model)
        return self


# --- Configuration: Inline Configuration Schema ---
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ServiceConfig:
    """Configuration for a single LLM service."""

    enabled: bool = True
    endpoint: str = ""
    priority: int = 1
    timeout: int = 30000
    max_retries: int = 3
    api_style: str = "openai"


@dataclass
class TaskRoutingConfig:
    """Configuration for task routing."""

    primary_models: list[str] = field(default_factory=list)
    fallback_models: list[str] = field(default_factory=list)
    min_context: int = 8192
    require_serena: bool = False


@dataclass
class SecurityConfig:
    """Security configuration for air-gapped networks."""

    allow_external: bool = False
    allowed_hosts: list[str] = field(
        default_factory=lambda: ["localhost", "127.0.0.1", "host.docker.internal"]
    )
    allowed_cidrs: list[str] = field(
        default_factory=lambda: ["192.168.0.0/16", "10.0.0.0/8", "172.16.0.0/12"]
    )
    audit_enabled: bool = True
    audit_log_path: str = "./audit.log"
    log_queries: bool = True
    log_responses: bool = False  # Don't log sensitive responses
    verify_checksums: bool = True


@dataclass
class ContextConfig:
    """Context management configuration."""

    compaction_threshold: float = 0.8
    compaction_target: float = 0.5
    preserve_recent_messages: int = 10
    preserve_recent_tool_calls: int = 5
    max_tool_output_tokens: int = 500


@dataclass
class RouterConfig:
    """Complete router configuration."""

    # Services
    ollama: ServiceConfig = field(
        default_factory=lambda: ServiceConfig(
            endpoint="http://localhost:11434", priority=1
        )
    )
    lmstudio: ServiceConfig = field(
        default_factory=lambda: ServiceConfig(
            endpoint="http://localhost:1234", priority=2
        )
    )
    jan: ServiceConfig = field(
        default_factory=lambda: ServiceConfig(
            endpoint="http://localhost:1337", priority=3
        )
    )
    custom_endpoints: list[dict] = field(default_factory=list)

    # Task routing (Updated January 2025)
    coding: TaskRoutingConfig = field(
        default_factory=lambda: TaskRoutingConfig(
            primary_models=["deepseek-v3", "qwen2.5-coder-32b", "deepseek-coder-v2"],
            fallback_models=["codellama-34b", "qwen2.5-coder-14b", "phi-4"],
            min_context=8192,
        )
    )
    reasoning: TaskRoutingConfig = field(
        default_factory=lambda: TaskRoutingConfig(
            primary_models=["deepseek-r1", "deepseek-v3", "qwen2.5-72b-instruct"],
            fallback_models=["deepseek-r1-distill-32b", "mistral-small-24b"],
            min_context=16384,
        )
    )
    analysis: TaskRoutingConfig = field(
        default_factory=lambda: TaskRoutingConfig(
            primary_models=["deepseek-v3", "qwen2.5-coder-32b"],
            fallback_models=["codellama-34b-instruct", "qwen2.5-72b-instruct"],
            min_context=16384,
            require_serena=True,
        )
    )
    documentation: TaskRoutingConfig = field(
        default_factory=lambda: TaskRoutingConfig(
            primary_models=["qwen2.5-72b-instruct", "llama-3.3-70b-instruct"],
            fallback_models=["qwen2.5-32b-instruct", "mistral-nemo-12b"],
            min_context=8192,
        )
    )

    # Serena
    serena_enabled: bool = True
    serena_priority: str = "always_first"

    # Context
    context: ContextConfig = field(default_factory=ContextConfig)

    # Security
    security: SecurityConfig = field(default_factory=SecurityConfig)
# Default configuration instance
DEFAULT_CONFIG = RouterConfig()


def load_config_from_dict(data: dict) -> RouterConfig:
    """Load configuration from dictionary (e.g., parsed YAML).

    Starts from a default RouterConfig and overrides the sections present in
    *data*: ``services``, ``task_routing``, ``serena``, ``context`` and
    ``security``. Sections that are missing or null keep their defaults.

    Args:
        data: Parsed configuration mapping.

    Returns:
        A new RouterConfig; ``DEFAULT_CONFIG`` is not modified.

    Raises:
        TypeError: If a section contains a key the matching config
            dataclass does not define.
    """
    config = RouterConfig()

    # Update services. 'custom_endpoints' is a list of endpoint dicts, not a
    # ServiceConfig, so it is handled separately (accepted both nested under
    # 'services' and at the top level).
    services = dict(data.get('services') or {})
    custom_endpoints = services.pop('custom_endpoints', None) or data.get('custom_endpoints')
    if custom_endpoints:
        config.custom_endpoints = list(custom_endpoints)
    for service_name, service_data in services.items():
        # Only replace attributes that really are service configs, so a stray
        # key such as 'coding' cannot clobber an unrelated section. Unknown
        # service names are ignored.
        if isinstance(getattr(config, service_name, None), ServiceConfig):
            setattr(config, service_name, ServiceConfig(**service_data))

    # Update task routing
    task_routing = data.get('task_routing') or {}
    for category in ('coding', 'reasoning', 'analysis', 'documentation'):
        if category in task_routing:
            setattr(config, category, TaskRoutingConfig(**task_routing[category]))

    # Update Serena settings. Other keys in this section (e.g. 'workspace')
    # have no RouterConfig field and are ignored.
    serena = data.get('serena') or {}
    if 'enabled' in serena:
        config.serena_enabled = bool(serena['enabled'])
    if 'priority' in serena:
        config.serena_priority = serena['priority']

    # Update context management
    if data.get('context'):
        config.context = ContextConfig(**data['context'])

    # Update security
    if data.get('security'):
        config.security = SecurityConfig(**data['security'])

    return config


# --- Example YAML Configuration (for reference) ---
# local-llm-router.yaml
# Copy this to your project and customize
#
# version: "1.0"
# environment: "air-gapped"
#
# services:
#   ollama:
#     enabled: true
#     endpoint: "http://localhost:11434"
#     priority: 1
#     timeout: 30000
#   lmstudio:
#     enabled: true
#     endpoint: "http://localhost:1234"
#     priority: 2
#   jan:
#     enabled: false
#     endpoint: "http://localhost:1337"
#     priority: 3
#   custom_endpoints:
#     - name: "internal-gpu-server"
#       endpoint: "http://192.168.1.100:8000"
#       priority: 0
#       api_style: "openai"
#
# task_routing:
#   coding:
#     primary_models:
#       - "deepseek-v3"
#       - "qwen2.5-coder-32b"
#       - "deepseek-coder-v2"
#     fallback_models:
#       - "codellama-34b"
#       - "qwen2.5-coder-14b"
#       - "phi-4"
#     min_context: 8192
#   reasoning:
#     primary_models:
#       - "deepseek-r1"
#       - "deepseek-v3"
#       - "qwen2.5-72b-instruct"
#     fallback_models:
#       - "deepseek-r1-distill-32b"
#       - "mistral-small-24b"
#     min_context: 16384
#   analysis:
#     primary_models:
#       - "deepseek-v3"
#       - "qwen2.5-coder-32b"
#     require_serena: true
#   documentation:
#     primary_models:
#       - "qwen2.5-72b-instruct"
#       - "llama-3.3-70b-instruct"
#     fallback_models:
#       - "mistral-nemo-12b"
#
# serena:
#   enabled: true
#   priority: "always_first"
#   workspace: "${WORKSPACE_ROOT}"
#
# context:
#   compaction_threshold: 0.8
#   preserve_recent_messages: 10
#
# security:
#   allow_external: false
#   allowed_hosts:
#     - "localhost"
#     - "127.0.0.1"
#     - "192.168.0.0/16"
#   audit_enabled: true
#   audit_log_path: "./llm-router-audit.log"


# --- Fallback Strategy: Graceful Degradation ---
from enum import IntEnum
from dataclasses import dataclass
from typing import Optional, Any


class FallbackLevel(IntEnum):
    """How far down the fallback chain a query had to go."""

    PRIMARY = 0
    FALLBACK_MODELS = 1
    REDUCED_CONTEXT = 2
    SMALLEST_MODEL = 3
    FAILED = 4


@dataclass
class ExecutionResult:
    """Outcome of one attempt to run a query on a model."""

    success: bool
    model: Optional[str] = None
    service: Optional[str] = None
    response: Any = None
    fallback_level: FallbackLevel = FallbackLevel.PRIMARY
    error: Optional[str] = None


class FallbackExecutor:
    """Execute queries with multi-level fallback."""

    def __init__(
        self,
        discovery: "ServiceDiscovery",
        context_manager: "ContextManager",
        config: "RouterConfig",
    ):
        """Store the collaborators used to locate services and build requests."""
        self.discovery = discovery
        self.context = context_manager
        self.config = config

    async def execute_with_fallback(
        self, query: str, category: "TaskCategory"
    ) -> ExecutionResult:
        """Execute query with fallback strategy.

        Tries, in order: the category's primary models, its fallback models,
        all of them again after compacting the context, and finally the
        smallest model any service offers. The returned result records the
        level that succeeded, or FAILED when every level failed.
        """
        # Get model lists
        task_config = getattr(self.config, category.value)
        primary_models = task_config.primary_models
        fallback_models = task_config.fallback_models

        # Level 0: Try primary models
        result = await self._try_models(primary_models, query, FallbackLevel.PRIMARY)
        if result:
            return result

        # Level 1: Try fallback models
        result = await self._try_models(
            fallback_models, query, FallbackLevel.FALLBACK_MODELS
        )
        if result:
            return result

        # Level 2: Reduce context and retry
        self.context._compact(task_config.min_context)
        result = await self._try_models(
            primary_models + fallback_models, query, FallbackLevel.REDUCED_CONTEXT
        )
        if result:
            return result

        # Level 3: Use smallest available model
        smallest = await self._find_smallest_model()
        if smallest:
            result = await self._try_models(
                [smallest], query, FallbackLevel.SMALLEST_MODEL
            )
            if result:
                return result

        # Level 4: All failed
        return ExecutionResult(
            success=False,
            fallback_level=FallbackLevel.FAILED,
            error="All fallback strategies exhausted",
        )

    async def _try_models(
        self, models: list, query: str, level: FallbackLevel
    ) -> Optional[ExecutionResult]:
        """Return the first successful result among *models*, tagged with *level*."""
        for model in models:
            result = await self._try_model(model, query)
            if result.success:
                result.fallback_level = level
                return result
        return None

    async def _try_model(self, model_id: str, query: str) -> ExecutionResult:
        """Try executing query on specific model.

        Never raises: an unavailable model or any execution error is
        reported as an unsuccessful ExecutionResult so the caller can move
        on to the next candidate.
        """
        # Find service with this model
        service = await self._find_service_with_model(model_id)
        if not service:
            return ExecutionResult(
                success=False, error=f"Model {model_id} not available"
            )

        try:
            response = await self._execute_on_service(service, model_id, query)
        except Exception as e:  # deliberate best-effort boundary
            return ExecutionResult(success=False, error=str(e))
        return ExecutionResult(
            success=True, model=model_id, service=service.name, response=response
        )

    async def _find_service_with_model(self, model_id: str) -> Optional["LLMService"]:
        """Find service that has the specified model.

        Services are checked in ascending configured priority; model names
        match when either id is a case-insensitive substring of the other.
        """
        wanted = model_id.lower()
        # Sort by priority
        services = sorted(
            self.discovery.services.values(),
            key=lambda s: getattr(self.config, s.type, ServiceConfig()).priority,
        )
        for service in services:
            for model in service.models:
                candidate = model.id.lower()
                if wanted in candidate or candidate in wanted:
                    return service
        return None

    async def _find_smallest_model(self) -> Optional[str]:
        """Find smallest available model by VRAM requirement.

        Only models with an exact-id entry in MODEL_DATABASE are considered.
        """
        smallest = None
        smallest_vram = float('inf')
        for service in self.discovery.services.values():
            for model in service.models:
                capability = MODEL_DATABASE.get(model.id)
                if capability and capability.vram_gb < smallest_vram:
                    smallest = model.id
                    smallest_vram = capability.vram_gb
        return smallest

    def _request_timeout(self, service: "LLMService") -> float:
        """Return the request timeout in seconds for *service*.

        Uses the ``timeout`` (milliseconds) of the config attribute named
        after the service type when there is one, otherwise 30 seconds.
        """
        service_config = getattr(self.config, service.type, None)
        timeout_ms = getattr(service_config, 'timeout', None)
        if isinstance(timeout_ms, (int, float)) and timeout_ms > 0:
            return timeout_ms / 1000
        return 30.0

    async def _execute_on_service(
        self, service: "LLMService", model_id: str, query: str
    ) -> str:
        """Execute query on specific service.

        Sends the exported conversation plus *query* to the service's chat
        endpoint and returns the assistant text ('' if the reply has none).

        Raises:
            httpx.HTTPStatusError: On a 4xx/5xx reply. Without this check an
                error body would be parsed as a successful empty answer.
        """
        import httpx

        messages = self.context.export_for_api()
        messages.append({"role": "user", "content": query})
        payload = {"model": model_id, "messages": messages, "stream": False}
        is_ollama_native = service.api_style == 'native' and service.type == 'ollama'

        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{service.endpoint}{service.chat_path}",
                json=payload,
                timeout=self._request_timeout(service),
            )
            response.raise_for_status()
            data = response.json()

        if is_ollama_native:
            # Ollama native API
            return data.get('message', {}).get('content', '')

        # OpenAI-compatible API; `or [{}]` also covers an empty choices list
        choices = data.get('choices') or [{}]
        return choices[0].get('message', {}).get('content', '')


# --- Security (Air-Gapped): Network Isolation ---
import hashlib
import json
from datetime import datetime
from dataclasses import dataclass
from typing import Optional
import ipaddress
import logging


@dataclass
class AuditLogEntry:
    """One line of the audit log."""

    timestamp: str
    event_type: str
    session_id: Optional[str] = None
    model: Optional[str] = None
    service: Optional[str] = None
    query_hash: Optional[str] = None  # Hashed, not plaintext
    tokens_in: int = 0
    tokens_out: int = 0
    success: bool = True
    error: Optional[str] = None


class SecurityModule:
    """Security enforcement for air-gapped networks."""

    def __init__(self, config: "SecurityConfig"):
        """Parse the allowed networks and set up the audit logger."""
        self.config = config
        self._allowed_ips = self._parse_allowed_networks()
        self._logger = self._setup_audit_logger()

    def _parse_allowed_networks(self) -> list:
        """Parse allowed hosts and CIDRs.

        Returns a list of ``ipaddress`` network objects built from
        ``allowed_hosts`` (CIDR strings, single IPs, and the two known
        hostnames) followed by ``allowed_cidrs``. Other hostnames are
        ignored.
        """
        networks = []
        for host in self.config.allowed_hosts:
            if '/' in host:
                # CIDR notation
                networks.append(ipaddress.ip_network(host, strict=False))
                continue
            try:
                ip = ipaddress.ip_address(host)
            except ValueError:
                # Hostname like 'localhost'
                if host == 'localhost':
                    networks.append(ipaddress.ip_network("127.0.0.0/8"))
                elif host == 'host.docker.internal':
                    # Allow common Docker host IPs
                    networks.append(ipaddress.ip_network("172.17.0.0/16"))
            else:
                # Single host: use the family's full prefix length (/32 or
                # /128). A fixed "/32" is rejected for IPv6 addresses, which
                # silently dropped them from the allow-list.
                networks.append(ipaddress.ip_network(f"{ip}/{ip.max_prefixlen}"))

        for cidr in self.config.allowed_cidrs:
            networks.append(ipaddress.ip_network(cidr, strict=False))
        return networks

    def _setup_audit_logger(self) -> logging.Logger:
        """Setup audit logger.

        The 'llm-router-audit' logger is shared process-wide, so a file
        handler is attached only if none exists for the configured path;
        otherwise each new SecurityModule would duplicate every log line.
        """
        import os

        logger = logging.getLogger('llm-router-audit')
        logger.setLevel(logging.INFO)
        if self.config.audit_enabled:
            log_path = os.path.abspath(self.config.audit_log_path)
            already_attached = any(
                getattr(h, 'baseFilename', None) == log_path
                for h in logger.handlers
            )
            if not already_attached:
                handler = logging.FileHandler(self.config.audit_log_path)
                handler.setFormatter(logging.Formatter('%(message)s'))
                logger.addHandler(handler)
        return logger

    def validate_endpoint(self, url: str) -> bool:
        """Validate that endpoint is in allowed network.

        Returns True when external access is allowed, when the host is a
        loopback name/address, when its IP lies in an allowed network, or
        when it is the hostname 'host.docker.internal'. URLs that cannot be
        parsed or have no host are rejected.
        """
        if self.config.allow_external:
            return True

        from urllib.parse import urlparse

        try:
            host = urlparse(url).hostname
        except (ValueError, TypeError, AttributeError):
            return False
        if not host:
            return False

        # Check for localhost
        if host in ('localhost', '127.0.0.1', '::1'):
            return True

        # Check against allowed networks
        try:
            ip = ipaddress.ip_address(host)
        except ValueError:
            # Hostname - only allow specific ones
            return host in ('localhost', 'host.docker.internal')
        return any(ip in network for network in self._allowed_ips)

    def log_query(
        self,
        session_id: str,
        model: str,
        service: str,
        query: str,
        tokens_in: int,
        tokens_out: int,
        success: bool,
        error: Optional[str] = None,
    ):
        """Log query for audit trail.

        Writes one JSON line; the query is stored only as a truncated hash,
        and only when ``log_queries`` is enabled. No-op when auditing is off.
        """
        if not self.config.audit_enabled:
            return
        entry = AuditLogEntry(
            timestamp=datetime.now().isoformat(),
            event_type='query',
            session_id=session_id,
            model=model,
            service=service,
            query_hash=self._hash_content(query) if self.config.log_queries else None,
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            success=success,
            error=error,
        )
        self._logger.info(json.dumps(vars(entry)))

    def log_security_event(self, event_type: str, details: dict):
        """Log security-related event as one JSON line at WARNING level."""
        if not self.config.audit_enabled:
            return
        entry = {
            'timestamp': datetime.now().isoformat(),
            'event_type': f'security:{event_type}',
            **details,
        }
        self._logger.warning(json.dumps(entry))

    def _hash_content(self, content: str) -> str:
        """Hash content for audit logging (privacy).

        Returns the first 16 hex characters of the SHA-256 digest.
        """
        return hashlib.sha256(content.encode()).hexdigest()[:16]
# Security checklist for air-gapped deployment
AIR_GAPPED_CHECKLIST = """
## Air-Gapped Deployment Checklist

### Network
- [ ] Verify no external DNS resolution
- [ ] Block all egress traffic at firewall
- [ ] Whitelist only internal IP ranges
- [ ] Disable IPv6 if not needed

### Model Verification
- [ ] Pre-download all required models
- [ ] Generate SHA256 checksums for all models
- [ ] Store checksums in tamper-evident location
- [ ] Verify checksums before loading models

### Access Control
- [ ] Implement role-based access to LLM services
- [ ] Require authentication for all endpoints
- [ ] Use short-lived tokens for API access
- [ ] Log all access attempts

### Audit
- [ ] Enable comprehensive audit logging
- [ ] Log queries (hashed, not plaintext)
- [ ] Log model usage patterns
- [ ] Log all security events
- [ ] Implement log rotation and retention
"""


# --- Coding Agent Detection: Detect Active Coding Agent ---
import os
import sys
from dataclasses import dataclass
from typing import Optional


@dataclass
class CodingAgentInfo:
    """Identity of the coding agent that invoked the router."""

    name: str
    type: str
    version: Optional[str] = None
    config_path: Optional[str] = None


# Environment variable markers for different agents.
# Maps variable name -> (agent name, agent type); checked in this order.
AGENT_ENV_MARKERS = {
    # CLI-based agents
    'QWEN_CLI_VERSION': ('qwen-cli', 'cli'),
    'OPENCODE_SESSION': ('opencode', 'cli'),
    'AIDER_SESSION': ('aider', 'cli'),
    'CODEX_SESSION': ('codex', 'cli'),
    'GEMINI_CLI_SESSION': ('gemini-cli', 'cli'),
    # IDE extensions
    'CONTINUE_SESSION': ('continue', 'ide'),
    'CLINE_SESSION': ('cline', 'ide'),
    'ROO_CODE_SESSION': ('roo-code', 'ide'),
    'CURSOR_SESSION': ('cursor', 'ide'),
    # Local GUI apps
    'OPENWEBUI_SESSION': ('openwebui', 'gui'),
    'JAN_SESSION': ('jan', 'gui'),
    'AGNO_SESSION': ('agno', 'gui'),
    # Generic markers
    'LLM_AGENT': ('generic', 'unknown'),
}


def detect_coding_agent() -> CodingAgentInfo:
    """Detect which coding agent is invoking the router.

    Checks, in order: the environment markers in AGENT_ENV_MARKERS, the
    parent process name (only when psutil is installed), and the MCP_CLIENT
    environment variable. Returns an 'unknown' agent when nothing matches.
    """
    # Check environment variables
    for env_var, (name, agent_type) in AGENT_ENV_MARKERS.items():
        value = os.environ.get(env_var)
        if value:
            # A bare '1' is a presence flag, not a version string.
            return CodingAgentInfo(
                name=name,
                type=agent_type,
                version=value if value != '1' else None,
            )

    # Check process name / parent process
    try:
        import psutil
    except ImportError:
        psutil = None  # psutil not available

    if psutil is not None:
        try:
            parent_name = psutil.Process(os.getppid()).name().lower()
        except psutil.Error:
            # Parent exited or is not inspectable; detection is best-effort.
            parent_name = ''
        agent_process_names = {
            'qwen': 'qwen-cli',
            'aider': 'aider',
            'codex': 'codex',
            'continue': 'continue',
            'cursor': 'cursor',
        }
        for proc_name, agent_name in agent_process_names.items():
            if proc_name in parent_name:
                return CodingAgentInfo(name=agent_name, type='detected')

    # Check for MCP client markers
    mcp_client = os.environ.get('MCP_CLIENT')
    if mcp_client:
        return CodingAgentInfo(name=mcp_client, type='mcp')

    # Default: unknown
    return CodingAgentInfo(name='unknown', type='unknown')


def get_agent_specific_config(agent: CodingAgentInfo) -> dict:
    """Get agent-specific configuration overrides.

    Returns the override dict registered for ``agent.name``, or an empty
    dict for agents without overrides.
    """
    configs = {
        'qwen-cli': {
            'default_model_preference': 'qwen',
            'context_format': 'qwen',
        },
        'aider': {
            'default_model_preference': 'gpt',
            'context_format': 'openai',
        },
        'cursor': {
            'default_model_preference': 'claude',
            'context_format': 'anthropic',
        },
        'continue': {
            'supports_streaming': True,
            'context_format': 'openai',
        },
    }
    return configs.get(agent.name, {})


# --- Complete Router Implementation ---
class LocalLLMRouter:
    """
    Complete Local LLM Router with Serena integration.

    Usage:
        router = LocalLLMRouter(workspace="/path/to/project")
        await router.initialize()
        response = await router.route("Implement a binary search function")
        print(response)
    """

    def __init__(
        self,
        workspace: str,
        config: "RouterConfig" = None,
        session_id: str = None,
    ):
        """Record settings; components are created later by ``initialize``.

        Args:
            workspace: Project root handed to Serena.
            config: Router configuration; DEFAULT_CONFIG when omitted.
            session_id: Session identifier; a random one when omitted.
        """
        self.workspace = workspace
        self.config = config or DEFAULT_CONFIG
        self.session_id = session_id or self._generate_session_id()

        # Components
        self.serena: Optional["SerenaMCP"] = None
        self.discovery: Optional["ServiceDiscovery"] = None
        self.context: Optional["ContextManager"] = None
        self.security: Optional["SecurityModule"] = None
        self.selector: Optional["ModelSelector"] = None
        self.fallback: Optional["FallbackExecutor"] = None

        # State
        self.os_info = detect_os()
        self.coding_agent = detect_coding_agent()
        self._initialized = False

    async def initialize(self):
        """Initialize all router components.

        Raises:
            RuntimeError: If service discovery finds no local LLM service.
        """
        # Security module
        self.security = SecurityModule(self.config.security)

        # Service discovery
        self.discovery = ServiceDiscovery(self.config.custom_endpoints)
        services = await self.discovery.discover_all()
        if not services:
            raise RuntimeError("No local LLM services available")

        # Model selector
        all_models = [model.id for service in services for model in service.models]
        self.selector = ModelSelector(all_models)

        # Context manager
        context_config = self.config.context
        self.context = ContextManager(
            session_id=self.session_id,
            system_prompt=self._build_system_prompt(),
            compaction_threshold=context_config.compaction_threshold,
            compaction_target=context_config.compaction_target,
            preserve_recent=context_config.preserve_recent_messages,
        )

        # Serena MCP (if enabled). A failed start is logged and the router
        # continues without Serena.
        if self.config.serena_enabled:
            self.serena = SerenaMCP(self.workspace)
            try:
                await self.serena.start()
            except Exception as e:
                logging.warning(f"Serena MCP failed to start: {e}")
                self.serena = None

        # Fallback executor
        self.fallback = FallbackExecutor(self.discovery, self.context, self.config)
        self._initialized = True

    async def route(self, query: str, file_context: dict = None) -> str:
        """
        Route query to appropriate LLM.

        Args:
            query: The user's query
            file_context: Optional dict with 'file', 'position' for code context

        Returns:
            LLM response string

        Raises:
            RuntimeError: If no suitable model is available or every
                fallback level fails.
        """
        if not self._initialized:
            await self.initialize()

        # Step 1: Classify task
        classification = classify_task(query)

        # Step 2: Serena first (if code-related)
        serena_context = {}
        if self.serena and (classification.requires_serena or file_context):
            serena_context = await self._gather_serena_context(
                query, file_context, classification
            )

        # Step 3: Build enriched query
        enriched_query = self._build_enriched_query(query, serena_context)

        # Step 4: Select model (query size estimated at 4 chars per token)
        estimated_query_tokens = len(query) // 4
        model = self.selector.select(
            classification.category,
            required_context=self.context.context.total_tokens + estimated_query_tokens,
        )
        if not model:
            raise RuntimeError("No suitable model available")

        # Step 5: Update context manager with selected model
        self.context.set_model(model)

        # Step 6: Check context and compact if needed
        model_capability = MODEL_DATABASE.get(model)
        if model_capability:
            self.context.check_and_compact(model_capability.context_window)

        # Step 7: Execute with fallback
        result = await self.fallback.execute_with_fallback(
            enriched_query, classification.category
        )

        # Step 8: Log for audit (done before raising so failures are recorded)
        self.security.log_query(
            session_id=self.session_id,
            model=result.model or model,
            service=result.service or 'unknown',
            query=query,
            tokens_in=estimated_query_tokens,
            tokens_out=len(result.response or '') // 4,
            success=result.success,
            error=result.error,
        )
        if not result.success:
            raise RuntimeError(f"Query failed: {result.error}")

        # Step 9: Update context with response
        self.context.add_message('user', query)
        self.context.add_message('assistant', result.response)

        # Step 10: Apply edits via Serena if needed
        if self.serena and file_context and contains_code_edit(result.response):
            await self._apply_serena_edits(result.response, file_context)

        return result.response

    async def _gather_serena_context(
        self,
        query: str,
        file_context: dict,
        classification: "ClassificationResult",
    ) -> dict:
        """Gather code context from Serena.

        Returns a dict that may contain 'hover', 'references' and
        'diagnostics'. Serena errors are logged and whatever was gathered
        before the failure is returned.
        """
        context = {}
        if not file_context:
            return context

        file = file_context.get('file')
        position = file_context.get('position', {})
        line = position.get('line', 0)
        char = position.get('character', 0)
        query_lower = query.lower()

        try:
            # Always get hover info
            context['hover'] = await self.serena.get_hover_info(file, line, char)

            # Get references for refactoring tasks
            if 'refactor' in query_lower or 'rename' in query_lower:
                context['references'] = await self.serena.get_references(
                    file, line, char
                )

            # Get diagnostics for analysis
            if classification.category == TaskCategory.ANALYSIS:
                context['diagnostics'] = await self.serena.get_diagnostics(file)
        except Exception as e:
            logging.warning(f"Serena context gathering failed: {e}")

        return context

    def _build_enriched_query(self, query: str, serena_context: dict) -> str:
        """Build query enriched with Serena context."""
        return build_enriched_query(query, serena_context)

    async def _apply_serena_edits(self, response: str, file_context: dict):
        """Apply code edits from response via Serena (no-op without edits)."""
        edits = parse_code_edits(response)
        if edits:
            await self.serena.apply_edit(file_context['file'], edits)

    def _build_system_prompt(self) -> str:
        """Build system prompt with router context."""
        return f"""You are a coding assistant running in a local, air-gapped environment.

Environment:
- OS: {self.os_info.platform} ({self.os_info.arch})
- Coding Agent: {self.coding_agent.name}
- Serena LSP: {'enabled' if self.config.serena_enabled else 'disabled'}

Guidelines:
- Provide concise, accurate code
- Use Serena's semantic information when provided
- Respect security constraints (no external calls)
- Focus on the specific task at hand
"""

    def _generate_session_id(self) -> str:
        """Generate unique session ID (first 8 characters of a UUID4)."""
        import uuid

        return str(uuid.uuid4())[:8]
# Utility functions
def contains_code_edit(response: str) -> bool:
    """Check if response contains code edits.

    Heuristic: True when the response contains a Markdown code fence or one
    of a few common definition keywords.
    """
    markers = ['```', 'def ', 'class ', 'function ', 'const ', 'let ', 'var ']
    return any(marker in response for marker in markers)


def parse_code_edits(response: str) -> list:
    """Parse code edits from response.

    Extracts the body of every fenced Markdown code block.

    Returns:
        A list of ``{'content': <stripped block text>}`` dicts in document
        order; empty when the response has no fenced block.
    """
    # Simple implementation - extract code blocks
    import re

    # The opening fence may carry any info string on its line (e.g. "c++",
    # "objective-c"), not only word characters. Without the fences the
    # pattern matches an empty string at every newline.
    code_blocks = re.findall(r'```[^\n`]*\n(.*?)```', response, re.DOTALL)
    return [{'content': block.strip()} for block in code_blocks]
Resources
- Serena MCP: https://github.com/oraios/serena
- Serena Documentation: https://github.com/oraios/serena#user-guide
- Ollama API: https://github.com/ollama/ollama/blob/main/docs/api.md
- LM Studio: https://lmstudio.ai/docs/developer
- Jan AI: https://jan.ai/docs/desktop/api-server
- OpenWebUI: https://docs.openwebui.com/
- LocalAI: https://localai.io/basics/getting_started/