diff --git a/llama_proxy_man/config.yaml b/llama_proxy_man/config.yaml
index 25fb678..ee89000 100644
--- a/llama_proxy_man/config.yaml
+++ b/llama_proxy_man/config.yaml
@@ -53,7 +53,7 @@ models:
       # draft-max: 16
       # draft-min: 5
   - name: "bigger-chat"
-    port: 18083
+    port: 18085
     vram_usage: 29G
     ram_usage: 5G
     env:
@@ -70,3 +70,39 @@ models:
       gpu-layers-draft: 0
       # draft-max: 16
       # draft-min: 5
+  - name: "bigger-chat-2"
+    port: 18083
+    vram_usage: 29G
+    ram_usage: 5G
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
+    args:
+      ctx-size: 8192
+      flash-attn: true
+      cache-type-k: q8_0
+      cache-type-v: q8_0
+      model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf
+      gpu-layers: 9999
+      # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
+      # gpu-layers-draft: 0
+      # draft-max: 16
+      # draft-min: 5
+  - name: "deep-think"
+    port: 18084
+    vram_usage: 29G
+    ram_usage: 5G
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
+    args:
+      ctx-size: 32768
+      flash-attn: true
+      # cache-type-k: q8_0
+      # cache-type-v: q8_0
+      model: /media/SanDisk/ai/models_live/QwQ-32B-Preview-IQ4_XS.gguf
+      gpu-layers: 9999
+      # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
+      # gpu-layers-draft: 0
+      # draft-max: 16
+      # draft-min: 5
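
A minimal sketch for smoke-testing the three ports touched by this change. It assumes llama_proxy_man forwards llama.cpp's OpenAI-compatible /v1/chat/completions route on each model's configured port on localhost; the host, route, and timeout are assumptions, not shown in this diff.

import json
import urllib.request

# Ports taken from config.yaml above; the localhost host and the
# /v1/chat/completions route are assumed proxy behavior, not part of the diff.
PORTS = {"bigger-chat-2": 18083, "deep-think": 18084, "bigger-chat": 18085}

for name, port in PORTS.items():
    body = json.dumps({
        "messages": [{"role": "user", "content": "ping"}],
        "max_tokens": 8,
    }).encode()
    req = urllib.request.Request(
        f"http://localhost:{port}/v1/chat/completions",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    # Generous timeout: the proxy may need to load a large model on first hit.
    with urllib.request.urlopen(req, timeout=300) as resp:
        reply = json.load(resp)
    print(name, reply["choices"][0]["message"]["content"])

In particular, this should confirm that "bigger-chat" now answers on 18085 and that 18083 serves the new "bigger-chat-2" entry instead.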