Add speculative decoding config

This commit is contained in:
Tristan D. 2024-11-26 18:28:37 +01:00
parent 7e3990f7e9
commit 89dec115b7
Signed by: tristan
SSH key fingerprint: SHA256:3RU4RLOoM8oAjFU19f1W6t8uouZbA7GWkaSW6rjp1k8

View file

@ -5,30 +5,37 @@ models:
- name: "tabby-code"
port: 18080
internal_port: 28080
vram_usage: 8.25G
ram_usage: 2.6G
autostart: true
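vram_usage: 26.7G # Qwen2.5-Coder-32B (IQ4_XS) + Qwen2.5-Coder-0.5B (Q8_0) draft model
ram_usage: 3G # Qwen2.5-Coder-32B (IQ4_XS) + Qwen2.5-Coder-0.5B (Q8_0) draft model
# vram_usage: 8.25G # Qwen2.5-Coder-7B (Q6_K_L), previous model
# ram_usage: 2.6G # Qwen2.5-Coder-7B (Q6_K_L), previous model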
env:
CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args:
model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf
gpu-layers: 9999
# host: 0.0.0.0
flash-attn: true
ctx-size: 32768
host: 0.0.0.0
model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
gpu-layers: 9999
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
gpu-layers-draft: 9999
draft-max: 16
draft-min: 5
- name: "tabby-embeddings"
port: 18081
internal_port: 28081
vram_usage: 1G
ram_usage: 2G
vram_usage: 0.4G
ram_usage: 2.5G
env:
CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args:
model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
gpu-layers: 9999
gpu-layers: -1
flash-attn: true
host: 0.0.0.0
# host: 0.0.0.0
embeddings: true
- name: "big-chat"
port: 18082
@ -43,4 +50,21 @@ models:
gpu-layers: 9999
ctx-size: 16384
flash-attn: true
host: 0.0.0.0
# host: 0.0.0.0
- name: "bigger-chat"
port: 18083
internal_port: 28083
vram_usage: 29G
ram_usage: 4G
env:
CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args:
model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
gpu-layers: 9999
flash-attn: true
# ctx-size: 8192
ctx-size: 16384
# host: 0.0.0.0
cache-type-k: q8_0
cache-type-v: q8_0