system_resources:
  ram: 48G
  vram: 30G

model_specs:
  - name: "tabby-code"
    port: 18080
    # internal_port: 28080  # Optional
    autostart: true
    vram_usage: 27G  # Coder-32B + draft 0.5B
    ram_usage: 3G    # Coder-32B + draft 0.5B
    # vram_usage: 8.25G  # Coder-7B
    # ram_usage: 2.6G    # Coder-7B
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      # host: 0.0.0.0
      flash-attn: true
      ctx-size: 32768
      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Instruct-Q8_0.gguf
      gpu-layers-draft: 9999
      cache-type-k: q8_0
      cache-type-v: q8_0
      draft-max: 16
      draft-min: 5

  - name: "tabby-embeddings"
    port: 18081
    vram_usage: 0.4G
    ram_usage: 2.5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      model: /media/SanDisk/ai/models_live/nomic-embed-text-v1.f32.gguf
      gpu-layers: -1
      flash-attn: true
      # host: 0.0.0.0
      embeddings: true

  - name: "chat"
    port: 18082
    vram_usage: 26.5G
    ram_usage: 2.5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: 16384
      flash-attn: true
      model: /media/SanDisk/ai/models_live/Qwen_Qwen3-30B-A3B-Q5_K_S.gguf
      gpu-layers: 9999
      # model-draft: /media/SanDisk/ai/models_live/Qwen_Qwen3-0.6B-Q6_K.gguf
      # gpu-layers-draft: 9999
      # draft-max: 16
      # draft-min: 4

  # - name: "big-chat"
  #   port: 18082
  #   vram_usage: 26.5G
  #   ram_usage: 2.5G
  #   env:
  #     CUDA_VISIBLE_DEVICES: 0
  #     HSA_OVERRIDE_GFX_VERSION: '11.0.0'
  #   args:
  #     ctx-size: 16384
  #     flash-attn: true
  #     model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
  #     gpu-layers: 9999
  #     model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
  #     gpu-layers-draft: 9999
  #     # draft-max: 16
  #     # draft-min: 5

  # - name: "bigger-chat"
  #   port: 18085
  #   vram_usage: 29G
  #   ram_usage: 5G
  #   env:
  #     CUDA_VISIBLE_DEVICES: 0
  #     HSA_OVERRIDE_GFX_VERSION: '11.0.0'
  #   args:
  #     ctx-size: 8192
  #     flash-attn: true
  #     cache-type-k: q8_0
  #     cache-type-v: q8_0
  #     model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
  #     gpu-layers: 9999
  #     model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
  #     gpu-layers-draft: 0
  #     # draft-max: 16
  #     # draft-min: 5

  # - name: "bigger-chat-2"
  #   port: 18083
  #   vram_usage: 29G
  #   ram_usage: 5G
  #   env:
  #     CUDA_VISIBLE_DEVICES: 0
  #     HSA_OVERRIDE_GFX_VERSION: '11.0.0'
  #   args:
  #     ctx-size: 8192
  #     flash-attn: true
  #     cache-type-k: q8_0
  #     cache-type-v: q8_0
  #     model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf
  #     gpu-layers: 9999
  #     # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
  #     # gpu-layers-draft: 0
  #     # draft-max: 16
  #     # draft-min: 5

  # - name: "deep-think"
  #   port: 18084
  #   vram_usage: 29G
  #   ram_usage: 5G
  #   env:
  #     CUDA_VISIBLE_DEVICES: 0
  #     HSA_OVERRIDE_GFX_VERSION: '11.0.0'
  #   args:
  #     ctx-size: 32768
  #     flash-attn: true
  #     cache-type-k: q8_0
  #     cache-type-v: q8_0
  #     model: /media/SanDisk/ai/models_live/Qwen_QwQ-32B-IQ4_XS.gguf
  #     gpu-layers: 9999
  #     # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
  #     # gpu-layers-draft: 0
  #     # draft-max: 16
  #     # draft-min: 5
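
# For reference, a sketch of how the "tabby-code" entry above could translate into a
# llama.cpp `llama-server` invocation, assuming each key under `args` maps one-to-one
# onto the CLI flag of the same name and that booleans become bare flags (the exact
# launcher behaviour is an assumption here, not defined in this file):
#
#   CUDA_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=11.0.0 \
#   llama-server \
#     --port 18080 \
#     --flash-attn \
#     --ctx-size 32768 \
#     --model /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf \
#     --gpu-layers 9999 \
#     --model-draft /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Instruct-Q8_0.gguf \
#     --gpu-layers-draft 9999 \
#     --cache-type-k q8_0 --cache-type-v q8_0 \
#     --draft-max 16 --draft-min 5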