system_resources:
  ram: 48G
  vram: 30G

model_specs:
  - name: "tabby-code"
    port: 18080
    # internal_port: 28080 # Optional
    autostart: "true"
    vram_usage: "26.7G" # Coder-32B + draft 0.5B
    ram_usage: "3G" # Coder-32B + draft 0.5B
    # vram_usage: 8.25G # Coder-7B
    # ram_usage: 2.6G # Coder-7B
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      # host: 0.0.0.0
      flash-attn: "true"
      ctx-size: "32768"
      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
      gpu-layers: "9999"
      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
      gpu-layers-draft: "9999"
      draft-max: "16"
      draft-min: "5"

  - name: "tabby-embeddings"
    port: 18081
    vram_usage: "0.4G"
    ram_usage: "2.5G"
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      model: "/media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf"
      gpu-layers: "-1"
      flash-attn: "true"
      # host: 0.0.0.0
      embeddings: "true"

  - name: "big-chat"
    port: 18082
    vram_usage: 26.5G
    ram_usage: 2.5G
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: "16384"
      flash-attn: "true"
      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
      gpu-layers: "9999"
      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
      gpu-layers-draft: "9999"
      # draft-max: "16"
      # draft-min: "5"

  - name: "bigger-chat"
    port: 18085
    vram_usage: 29G
    ram_usage: 5G
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: "8192"
      flash-attn: "true"
      cache-type-k: q8_0
      cache-type-v: q8_0
      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
      gpu-layers: "9999"
      model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
      gpu-layers-draft: "0"
      # draft-max: "16"
      # draft-min: "5"

  - name: "bigger-chat-2"
    port: 18083
    vram_usage: 29G
    ram_usage: 5G
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: "8192"
      flash-attn: "true"
      cache-type-k: q8_0
      cache-type-v: q8_0
      model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf
      gpu-layers: "9999"
      # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
      # gpu-layers-draft: 0
      # draft-max: "16"
      # draft-min: "5"

  - name: "deep-think"
    port: 18084
    vram_usage: 29G
    ram_usage: 5G
    env:
      CUDA_VISIBLE_DEVICES: "0"
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: "32768"
      flash-attn: "true"
      # cache-type-k: q8_0
      # cache-type-v: q8_0
      model: /media/SanDisk/ai/models_live/QwQ-32B-Preview-IQ4_XS.gguf
      gpu-layers: "9999"
      # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
      # gpu-layers-draft: 0
      # draft-max: "16"
      # draft-min: "5"