diff --git a/llama_proxy_man/config.yaml b/llama_proxy_man/config.yaml
index 8aba5d6..c658f7a 100644
--- a/llama_proxy_man/config.yaml
+++ b/llama_proxy_man/config.yaml
@@ -5,30 +5,37 @@ models:
   - name: "tabby-code"
     port: 18080
     internal_port: 28080
-    vram_usage: 8.25G
-    ram_usage: 2.6G
+    autostart: true
+    vram_usage: 26.7G # Coder-32B + draft 0.5B
+    ram_usage: 3G # Coder-32B + draft 0.5B
+    # vram_usage: 8.25G # Coder-7B
+    # ram_usage: 2.6G # Coder-7B
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf
-      gpu-layers: 9999
+      # host: 0.0.0.0
       flash-attn: true
       ctx-size: 32768
-      host: 0.0.0.0
+      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
+      gpu-layers-draft: 9999
+      draft-max: 16
+      draft-min: 5
   - name: "tabby-embeddings"
     port: 18081
     internal_port: 28081
-    vram_usage: 1G
-    ram_usage: 2G
+    vram_usage: 0.4G
+    ram_usage: 2.5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
       model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
-      gpu-layers: 9999
+      gpu-layers: -1
       flash-attn: true
-      host: 0.0.0.0
+      # host: 0.0.0.0
       embeddings: true
   - name: "big-chat"
     port: 18082
@@ -43,4 +50,21 @@ models:
       gpu-layers: 9999
       ctx-size: 16384
       flash-attn: true
-      host: 0.0.0.0
+      # host: 0.0.0.0
+  - name: "bigger-chat"
+    port: 18083
+    internal_port: 28083
+    vram_usage: 29G
+    ram_usage: 4G
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
+    args:
+      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
+      gpu-layers: 9999
+      flash-attn: true
+      # ctx-size: 8192
+      ctx-size: 16384
+      # host: 0.0.0.0
+      cache-type-k: q8_0
+      cache-type-v: q8_0
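
Assuming llama_proxy_man forwards each `args` key verbatim as the matching llama-server long flag and launches the server on `internal_port` (both assumptions about the proxy's spawn logic, not shown in this diff), the updated "tabby-code" entry would expand to roughly the command below. The model-draft / gpu-layers-draft / draft-max / draft-min flags enable speculative decoding, with the 0.5B model proposing 5-16 draft tokens per step for the 32B target:

    # hypothetical expansion of the "tabby-code" entry above
    CUDA_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=11.0.0 \
    llama-server \
      --port 28080 \
      --flash-attn \
      --ctx-size 32768 \
      --model /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf \
      --gpu-layers 9999 \
      --model-draft /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf \
      --gpu-layers-draft 9999 \
      --draft-max 16 \
      --draft-min 5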