diff --git a/llama_proxy_man/config.yaml b/llama_proxy_man/config.yaml index a19d518..e8048ca 100644 --- a/llama_proxy_man/config.yaml +++ b/llama_proxy_man/config.yaml @@ -6,7 +6,7 @@ model_specs: port: 18080 # internal_port: 28080 # Optional autostart: true - vram_usage: 26.7G # Coder-32B + draft 0.5B + vram_usage: 27G # Coder-32B + draft 0.5B ram_usage: 3G # Coder-32B + draft 0.5B # vram_usage: 8.25G # Coder-7B # ram_usage: 2.6G # Coder-7B @@ -19,8 +19,10 @@ model_specs: ctx-size: 32768 model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf gpu-layers: 9999 - model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf + model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Instruct-Q8_0.gguf gpu-layers-draft: 9999 + cache-type-k: q8_0 + cache-type-v: q8_0 draft-max: 16 draft-min: 5 - name: "tabby-embeddings" @@ -31,12 +33,12 @@ model_specs: CUDA_VISIBLE_DEVICES: 0 HSA_OVERRIDE_GFX_VERSION: '11.0.0' args: - model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf + model: /media/SanDisk/ai/models_live/nomic-embed-text-v1.f32.gguf gpu-layers: -1 flash-attn: true # host: 0.0.0.0 embeddings: true - - name: "big-chat" + - name: "chat" port: 18082 vram_usage: 26.5G ram_usage: 2.5G @@ -46,63 +48,79 @@ model_specs: args: ctx-size: 16384 flash-attn: true - model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf + model: /media/SanDisk/ai/models_live/mistralai_Mistral-Small-3.1-24B-Instruct-2503-IQ4_XS.gguf gpu-layers: 9999 - model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf + model-draft: /media/SanDisk/ai/models_live/Mistral-Small-3.1-DRAFT-0.5B.Q8_0.gguf gpu-layers-draft: 9999 - # draft-max: 16 - # draft-min: 5 - - name: "bigger-chat" - port: 18085 - vram_usage: 29G - ram_usage: 5G - env: - CUDA_VISIBLE_DEVICES: 0 - HSA_OVERRIDE_GFX_VERSION: '11.0.0' - args: - ctx-size: 8192 - flash-attn: true - cache-type-k: q8_0 - cache-type-v: q8_0 - model: 
/media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf - gpu-layers: 9999 - model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf - gpu-layers-draft: 0 - # draft-max: 16 - # draft-min: 5 - - name: "bigger-chat-2" - port: 18083 - vram_usage: 29G - ram_usage: 5G - env: - CUDA_VISIBLE_DEVICES: 0 - HSA_OVERRIDE_GFX_VERSION: '11.0.0' - args: - ctx-size: 8192 - flash-attn: true - cache-type-k: q8_0 - cache-type-v: q8_0 - model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf - gpu-layers: 9999 - # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf - # gpu-layers-draft: 0 - # draft-max: 16 - # draft-min: 5 - - name: "deep-think" - port: 18084 - vram_usage: 29G - ram_usage: 5G - env: - CUDA_VISIBLE_DEVICES: 0 - HSA_OVERRIDE_GFX_VERSION: '11.0.0' - args: - ctx-size: 32768 - flash-attn: true - # cache-type-k: q8_0 - # cache-type-v: q8_0 - model: /media/SanDisk/ai/models_live/QwQ-32B-Preview-IQ4_XS.gguf - gpu-layers: 9999 - # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf - # gpu-layers-draft: 0 - # draft-max: 16 - # draft-min: 5 + draft-max: 16 + draft-min: 4 + # - name: "big-chat" + # port: 18082 + # vram_usage: 26.5G + # ram_usage: 2.5G + # env: + # CUDA_VISIBLE_DEVICES: 0 + # HSA_OVERRIDE_GFX_VERSION: '11.0.0' + # args: + # ctx-size: 16384 + # flash-attn: true + # model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf + # gpu-layers: 9999 + # model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf + # gpu-layers-draft: 9999 + # # draft-max: 16 + # # draft-min: 5 + # - name: "bigger-chat" + # port: 18085 + # vram_usage: 29G + # ram_usage: 5G + # env: + # CUDA_VISIBLE_DEVICES: 0 + # HSA_OVERRIDE_GFX_VERSION: '11.0.0' + # args: + # ctx-size: 8192 + # flash-attn: true + # cache-type-k: q8_0 + # cache-type-v: q8_0 + # model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf + # gpu-layers: 9999 + # 
model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf + # gpu-layers-draft: 0 + # # draft-max: 16 + # # draft-min: 5 + # - name: "bigger-chat-2" + # port: 18083 + # vram_usage: 29G + # ram_usage: 5G + # env: + # CUDA_VISIBLE_DEVICES: 0 + # HSA_OVERRIDE_GFX_VERSION: '11.0.0' + # args: + # ctx-size: 8192 + # flash-attn: true + # cache-type-k: q8_0 + # cache-type-v: q8_0 + # model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf + # gpu-layers: 9999 + # # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf + # # gpu-layers-draft: 0 + # # draft-max: 16 + # # draft-min: 5 + # - name: "deep-think" + # port: 18084 + # vram_usage: 29G + # ram_usage: 5G + # env: + # CUDA_VISIBLE_DEVICES: 0 + # HSA_OVERRIDE_GFX_VERSION: '11.0.0' + # args: + # ctx-size: 32768 + # flash-attn: true + # cache-type-k: q8_0 + # cache-type-v: q8_0 + # model: /media/SanDisk/ai/models_live/Qwen_QwQ-32B-IQ4_XS.gguf + # gpu-layers: 9999 + # # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf + # # gpu-layers-draft: 0 + # # draft-max: 16 + # # draft-min: 5