---
# Model-server configuration: total host resources plus one spec per
# llama.cpp-style server instance (port, resource budget, env, CLI args).
# NOTE(review): structure reconstructed from a whitespace-mangled source;
# nesting of env/args under each spec inferred from key names — confirm
# against the consuming loader's schema.

# Total resources available on this host; per-spec *_usage values below
# should fit within these budgets.
system_resources:
  ram: "48G"
  vram: "30G"

model_specs:
  # Code-completion backend (Qwen2.5-Coder 32B with a 0.5B draft model
  # for speculative decoding).
  - name: "tabby-code"
    port: 18080
    # internal_port: 28080  # Optional
    autostart: "true"  # NOTE(review): only this spec sets autostart — presumably others default to off; verify
    vram_usage: "26.7G"  # Coder-32B + draft 0.5B
    ram_usage: "3G"  # Coder-32B + draft 0.5B
    # vram_usage: "8.25G"  # Coder-7B
    # ram_usage: "2.6G"  # Coder-7B
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
    args:
      # host: 0.0.0.0
      flash-attn: "true"
      ctx-size: "32768"
      model: "/media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf"
      gpu-layers: "9999"  # offload all layers to GPU
      model-draft: "/media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf"
      gpu-layers-draft: "9999"
      draft-max: "16"
      draft-min: "5"

  # Embedding backend for code indexing (nomic-embed-text).
  - name: "tabby-embeddings"
    port: 18081
    vram_usage: "0.4G"
    ram_usage: "2.5G"
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
    args:
      model: "/media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf"
      gpu-layers: "-1"
      flash-attn: "true"
      # host: 0.0.0.0
      embeddings: "true"

  # General chat (Qwen2.5 32B) with speculative decoding via a 0.5B draft.
  - name: "big-chat"
    port: 18082
    vram_usage: "26.5G"
    ram_usage: "2.5G"
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
    args:
      ctx-size: "16384"
      flash-attn: "true"
      model: "/media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf"
      gpu-layers: "9999"
      model-draft: "/media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf"
      gpu-layers-draft: "9999"
      # draft-max: "16"
      # draft-min: "5"

  # Large chat model (Nemotron 70B); q8_0 KV cache to fit in VRAM,
  # draft model kept on CPU (gpu-layers-draft: 0).
  - name: "bigger-chat"
    port: 18085
    vram_usage: "29G"
    ram_usage: "5G"
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
    args:
      ctx-size: "8192"
      flash-attn: "true"
      cache-type-k: "q8_0"
      cache-type-v: "q8_0"
      model: "/media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf"
      gpu-layers: "9999"
      model-draft: "/media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf"
      gpu-layers-draft: "0"
      # draft-max: "16"
      # draft-min: "5"

  # Large chat model (Llama 3.3 70B); speculative decoding disabled.
  - name: "bigger-chat-2"
    port: 18083
    vram_usage: "29G"
    ram_usage: "5G"
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
    args:
      ctx-size: "8192"
      flash-attn: "true"
      cache-type-k: "q8_0"
      cache-type-v: "q8_0"
      model: "/media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf"
      gpu-layers: "9999"
      # model-draft: "/media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf"
      # gpu-layers-draft: "0"
      # draft-max: "16"
      # draft-min: "5"

  # Reasoning model (QwQ 32B preview) with a long context window;
  # KV-cache quantization and speculative decoding disabled.
  - name: "deep-think"
    port: 18084
    vram_usage: "29G"
    ram_usage: "5G"
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
    args:
      ctx-size: "32768"
      flash-attn: "true"
      # cache-type-k: "q8_0"
      # cache-type-v: "q8_0"
      model: "/media/SanDisk/ai/models_live/QwQ-32B-Preview-IQ4_XS.gguf"
      gpu-layers: "9999"
      # model-draft: "/media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf"
      # gpu-layers-draft: "0"
      # draft-max: "16"
      # draft-min: "5"