Upgrade test cfg

This commit is contained in:
Tristan D. 2025-04-16 09:16:05 +02:00
parent 2e85273aed
commit b44e953e7b
Signed by: tristan
SSH key fingerprint: SHA256:3RU4RLOoM8oAjFU19f1W6t8uouZbA7GWkaSW6rjp1k8

View file

@ -6,7 +6,7 @@ model_specs:
port: 18080 port: 18080
# internal_port: 28080 # Optional # internal_port: 28080 # Optional
autostart: true autostart: true
vram_usage: 26.7G # Coder-32B + draft 0.5B vram_usage: 27G # Coder-32B + draft 0.5B
ram_usage: 3G # Coder-32B + draft 0.5B ram_usage: 3G # Coder-32B + draft 0.5B
# vram_usage: 8.25G # Coder-7B # vram_usage: 8.25G # Coder-7B
# ram_usage: 2.6G # Coder-7B # ram_usage: 2.6G # Coder-7B
@ -19,8 +19,10 @@ model_specs:
ctx-size: 32768 ctx-size: 32768
model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
gpu-layers: 9999 gpu-layers: 9999
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Instruct-Q8_0.gguf
gpu-layers-draft: 9999 gpu-layers-draft: 9999
cache-type-k: q8_0
cache-type-v: q8_0
draft-max: 16 draft-max: 16
draft-min: 5 draft-min: 5
- name: "tabby-embeddings" - name: "tabby-embeddings"
@ -31,12 +33,12 @@ model_specs:
CUDA_VISIBLE_DEVICES: 0 CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0' HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args: args:
model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf model: /media/SanDisk/ai/models_live/nomic-embed-text-v1.f32.gguf
gpu-layers: -1 gpu-layers: -1
flash-attn: true flash-attn: true
# host: 0.0.0.0 # host: 0.0.0.0
embeddings: true embeddings: true
- name: "big-chat" - name: "chat"
port: 18082 port: 18082
vram_usage: 26.5G vram_usage: 26.5G
ram_usage: 2.5G ram_usage: 2.5G
@ -46,63 +48,79 @@ model_specs:
args: args:
ctx-size: 16384 ctx-size: 16384
flash-attn: true flash-attn: true
model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf model: /media/SanDisk/ai/models_live/mistralai_Mistral-Small-3.1-24B-Instruct-2503-IQ4_XS.gguf
gpu-layers: 9999 gpu-layers: 9999
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf model-draft: /media/SanDisk/ai/models_live/Mistral-Small-3.1-DRAFT-0.5B.Q8_0.gguf
gpu-layers-draft: 9999 gpu-layers-draft: 9999
# draft-max: 16 draft-max: 16
# draft-min: 5 draft-min: 4
- name: "bigger-chat" # - name: "big-chat"
port: 18085 # port: 18082
vram_usage: 29G # vram_usage: 26.5G
ram_usage: 5G # ram_usage: 2.5G
env: # env:
CUDA_VISIBLE_DEVICES: 0 # CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0' # HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args: # args:
ctx-size: 8192 # ctx-size: 16384
flash-attn: true # flash-attn: true
cache-type-k: q8_0 # model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
cache-type-v: q8_0 # gpu-layers: 9999
model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf # model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
gpu-layers: 9999 # gpu-layers-draft: 9999
model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf # # draft-max: 16
gpu-layers-draft: 0 # # draft-min: 5
# draft-max: 16 # - name: "bigger-chat"
# draft-min: 5 # port: 18085
- name: "bigger-chat-2" # vram_usage: 29G
port: 18083 # ram_usage: 5G
vram_usage: 29G # env:
ram_usage: 5G # CUDA_VISIBLE_DEVICES: 0
env: # HSA_OVERRIDE_GFX_VERSION: '11.0.0'
CUDA_VISIBLE_DEVICES: 0 # args:
HSA_OVERRIDE_GFX_VERSION: '11.0.0' # ctx-size: 8192
args: # flash-attn: true
ctx-size: 8192 # cache-type-k: q8_0
flash-attn: true # cache-type-v: q8_0
cache-type-k: q8_0 # model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
cache-type-v: q8_0 # gpu-layers: 9999
model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
gpu-layers: 9999 # gpu-layers-draft: 0
# model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf # # draft-max: 16
# gpu-layers-draft: 0 # # draft-min: 5
# draft-max: 16 # - name: "bigger-chat-2"
# draft-min: 5 # port: 18083
- name: "deep-think" # vram_usage: 29G
port: 18084 # ram_usage: 5G
vram_usage: 29G # env:
ram_usage: 5G # CUDA_VISIBLE_DEVICES: 0
env: # HSA_OVERRIDE_GFX_VERSION: '11.0.0'
CUDA_VISIBLE_DEVICES: 0 # args:
HSA_OVERRIDE_GFX_VERSION: '11.0.0' # ctx-size: 8192
args: # flash-attn: true
ctx-size: 32768 # cache-type-k: q8_0
flash-attn: true # cache-type-v: q8_0
# cache-type-k: q8_0 # model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf
# cache-type-v: q8_0 # gpu-layers: 9999
model: /media/SanDisk/ai/models_live/QwQ-32B-Preview-IQ4_XS.gguf # # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
gpu-layers: 9999 # # gpu-layers-draft: 0
# model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf # # draft-max: 16
# gpu-layers-draft: 0 # # draft-min: 5
# draft-max: 16 # - name: "deep-think"
# draft-min: 5 # port: 18084
# vram_usage: 29G
# ram_usage: 5G
# env:
# CUDA_VISIBLE_DEVICES: 0
# HSA_OVERRIDE_GFX_VERSION: '11.0.0'
# args:
# ctx-size: 32768
# flash-attn: true
# cache-type-k: q8_0
# cache-type-v: q8_0
# model: /media/SanDisk/ai/models_live/Qwen_QwQ-32B-IQ4_XS.gguf
# gpu-layers: 9999
# # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
# # gpu-layers-draft: 0
# # draft-max: 16
# # draft-min: 5