Add spec decode config
This commit is contained in:
parent
7e3990f7e9
commit
89dec115b7
1 changed file with 34 additions and 10 deletions
|
@@ -5,30 +5,37 @@ models:
|
||||||
- name: "tabby-code"
|
- name: "tabby-code"
|
||||||
port: 18080
|
port: 18080
|
||||||
internal_port: 28080
|
internal_port: 28080
|
||||||
vram_usage: 8.25G
|
autostart: true
|
||||||
ram_usage: 2.6G
|
vram_usage: 26.7G # Coder-32B + draft 0.5B
|
||||||
|
ram_usage: 3G # Coder-32B + draft 0.5B
|
||||||
|
# vram_usage: 8.25G # Coder-7B
|
||||||
|
# ram_usage: 2.6G # Coder-7B
|
||||||
env:
|
env:
|
||||||
CUDA_VISIBLE_DEVICES: 0
|
CUDA_VISIBLE_DEVICES: 0
|
||||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||||
args:
|
args:
|
||||||
model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf
|
# host: 0.0.0.0
|
||||||
gpu-layers: 9999
|
|
||||||
flash-attn: true
|
flash-attn: true
|
||||||
ctx-size: 32768
|
ctx-size: 32768
|
||||||
host: 0.0.0.0
|
model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
|
||||||
|
gpu-layers: 9999
|
||||||
|
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
|
||||||
|
gpu-layers-draft: 9999
|
||||||
|
draft-max: 16
|
||||||
|
draft-min: 5
|
||||||
- name: "tabby-embeddings"
|
- name: "tabby-embeddings"
|
||||||
port: 18081
|
port: 18081
|
||||||
internal_port: 28081
|
internal_port: 28081
|
||||||
vram_usage: 1G
|
vram_usage: 0.4G
|
||||||
ram_usage: 2G
|
ram_usage: 2.5G
|
||||||
env:
|
env:
|
||||||
CUDA_VISIBLE_DEVICES: 0
|
CUDA_VISIBLE_DEVICES: 0
|
||||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||||
args:
|
args:
|
||||||
model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
|
model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
|
||||||
gpu-layers: 9999
|
gpu-layers: -1
|
||||||
flash-attn: true
|
flash-attn: true
|
||||||
host: 0.0.0.0
|
# host: 0.0.0.0
|
||||||
embeddings: true
|
embeddings: true
|
||||||
- name: "big-chat"
|
- name: "big-chat"
|
||||||
port: 18082
|
port: 18082
|
||||||
|
@@ -43,4 +50,21 @@ models:
|
||||||
gpu-layers: 9999
|
gpu-layers: 9999
|
||||||
ctx-size: 16384
|
ctx-size: 16384
|
||||||
flash-attn: true
|
flash-attn: true
|
||||||
host: 0.0.0.0
|
# host: 0.0.0.0
|
||||||
|
- name: "bigger-chat"
|
||||||
|
port: 18083
|
||||||
|
internal_port: 28083
|
||||||
|
vram_usage: 29G
|
||||||
|
ram_usage: 4G
|
||||||
|
env:
|
||||||
|
CUDA_VISIBLE_DEVICES: 0
|
||||||
|
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||||
|
args:
|
||||||
|
model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
|
||||||
|
gpu-layers: 9999
|
||||||
|
flash-attn: true
|
||||||
|
# ctx-size: 8192
|
||||||
|
ctx-size: 16384
|
||||||
|
# host: 0.0.0.0
|
||||||
|
cache-type-k: q8_0
|
||||||
|
cache-type-v: q8_0
|
||||||
|
|
Loading…
Add table
Reference in a new issue