# models_config.py
# Optimized for an NVIDIA T4 medium instance (16GB VRAM) with 4-bit quantization.
# A usage sketch at the bottom of this file shows how these settings are consumed.
LLM_CONFIG = {
    "primary_provider": "huggingface",
    "models": {
        "reasoning_primary": {
            "model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras",  # Cerebras deployment
            "task": "general_reasoning",
            "max_tokens": 10000,
            "temperature": 0.7,
            "cost_per_token": 0.000015,
            "fallback": "Qwen/Qwen2.5-7B-Instruct",  # Fallback to Qwen if Llama unavailable
            "is_chat_model": True,
            "use_4bit_quantization": True,  # Enable 4-bit quantization for 16GB T4
            "use_8bit_quantization": False
        },
        "embedding_specialist": {
            "model_id": "intfloat/e5-large-v2",  # Upgraded: 1024-dim embeddings (vs 384), much better semantic understanding
            "task": "embeddings",
            "vector_dimensions": 1024,
            "purpose": "semantic_similarity",
            "cost_advantage": "90%_cheaper_than_primary",
            "is_chat_model": False
        },
        "classification_specialist": {
            "model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras",  # Cerebras deployment for classification
            "task": "intent_classification",
            "max_length": 512,
            "specialization": "fast_inference",
            "latency_target": "<100ms",
            "is_chat_model": True,
            "use_4bit_quantization": True
        },
        "safety_checker": {
            "model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras",  # Cerebras deployment for safety
            "task": "content_moderation",
            "confidence_threshold": 0.85,
            "purpose": "bias_detection",
            "is_chat_model": True,
            "use_4bit_quantization": True
        }
    },
    "routing_logic": {
        "strategy": "task_based_routing",
        "fallback_chain": ["primary", "fallback", "degraded_mode"],
        "load_balancing": "round_robin_with_health_check"
    },
    "quantization_settings": {
        "default_4bit": True,  # Enable 4-bit quantization by default for T4 16GB
        "default_8bit": False,
        "bnb_4bit_compute_dtype": "float16",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4"
    }
}
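

# --- Usage sketch (illustrative) ---
# A minimal example of how this config could be consumed, assuming the
# transformers + bitsandbytes + accelerate stack: the quantization_settings
# keys map directly onto transformers.BitsAndBytesConfig, and the local
# fallback model is loaded in 4-bit so it fits in the T4's 16GB of VRAM.
# The helper names below (load_quantized_fallback, resolve_model_for_task)
# are hypothetical, not part of any library API.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def load_quantized_fallback(config: dict = LLM_CONFIG):
    """Load the reasoning fallback model with the configured 4-bit settings."""
    q = config["quantization_settings"]
    bnb = BitsAndBytesConfig(
        load_in_4bit=q["default_4bit"],
        bnb_4bit_compute_dtype=getattr(torch, q["bnb_4bit_compute_dtype"]),
        bnb_4bit_use_double_quant=q["bnb_4bit_use_double_quant"],
        bnb_4bit_quant_type=q["bnb_4bit_quant_type"],
    )
    model_id = config["models"]["reasoning_primary"]["fallback"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb,
        device_map="auto",  # let accelerate place layers on the T4
    )
    return model, tokenizer


def resolve_model_for_task(task: str, config: dict = LLM_CONFIG) -> str:
    """First hop of the task_based_routing strategy: match on each entry's 'task'."""
    for spec in config["models"].values():
        if spec["task"] == task:
            return spec["model_id"]
    raise KeyError(f"no model configured for task {task!r}")


# Example: resolve_model_for_task("embeddings") -> "intfloat/e5-large-v2"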