---
# Multi-model llama.cpp server configuration.
# `hardware` declares the host budget; each entry under `models` is one
# server instance with its port, resource estimates, env, and llama.cpp args.

hardware:
  ram: 48G
  vram: 30G

models:
  # Code-completion model (Qwen2.5-Coder-32B with a 0.5B draft model for
  # speculative decoding).
  - name: "tabby-code"
    port: 18080
    # internal_port: 28080  # Optional
    autostart: true
    vram_usage: 26.7G  # Coder-32B + draft 0.5B
    ram_usage: 3G  # Coder-32B + draft 0.5B
    # vram_usage: 8.25G  # Coder-7B
    # ram_usage: 2.6G  # Coder-7B
    env:
      # Quoted so the value stays a string — env vars must be strings, and
      # an unquoted 0 would parse as an integer.
      CUDA_VISIBLE_DEVICES: '0'
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      # host: 0.0.0.0
      flash-attn: true
      ctx-size: 32768
      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
      gpu-layers-draft: 9999
      draft-max: 16
      draft-min: 5

  # Embeddings model for code indexing.
  - name: "tabby-embeddings"
    port: 18081
    vram_usage: 0.4G
    ram_usage: 2.5G
    env:
      CUDA_VISIBLE_DEVICES: '0'
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
      gpu-layers: -1
      flash-attn: true
      # host: 0.0.0.0
      embeddings: true

  # General chat model (Qwen2.5-32B with a 0.5B draft model).
  - name: "big-chat"
    port: 18082
    vram_usage: 26.5G
    ram_usage: 2.5G
    env:
      CUDA_VISIBLE_DEVICES: '0'
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: 16384
      flash-attn: true
      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
      gpu-layers-draft: 9999
      # draft-max: 16
      # draft-min: 5

  # Largest chat model (70B at IQ3_XXS); KV cache quantized to q8_0 and the
  # draft model kept on CPU (gpu-layers-draft: 0) to fit the 30G VRAM budget.
  - name: "bigger-chat"
    port: 18083
    vram_usage: 29G
    ram_usage: 5G
    env:
      CUDA_VISIBLE_DEVICES: '0'
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: 8192
      flash-attn: true
      cache-type-k: q8_0
      cache-type-v: q8_0
      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
      gpu-layers-draft: 0
      # draft-max: 16
      # draft-min: 5