Upgrade test cfg
This commit is contained in:
parent
2e85273aed
commit
b44e953e7b
1 changed file with 80 additions and 62 deletions
|
@ -6,7 +6,7 @@ model_specs:
|
|||
port: 18080
|
||||
# internal_port: 28080 # Optional
|
||||
autostart: true
|
||||
vram_usage: 26.7G # Coder-32B + draft 0.5B
|
||||
vram_usage: 27G # Coder-32B + draft 0.5B
|
||||
ram_usage: 3G # Coder-32B + draft 0.5B
|
||||
# vram_usage: 8.25G # Coder-7B
|
||||
# ram_usage: 2.6G # Coder-7B
|
||||
|
@ -19,8 +19,10 @@ model_specs:
|
|||
ctx-size: 32768
|
||||
model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
|
||||
gpu-layers: 9999
|
||||
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
|
||||
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Instruct-Q8_0.gguf
|
||||
gpu-layers-draft: 9999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
draft-max: 16
|
||||
draft-min: 5
|
||||
- name: "tabby-embeddings"
|
||||
|
@ -31,12 +33,12 @@ model_specs:
|
|||
CUDA_VISIBLE_DEVICES: 0
|
||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||
args:
|
||||
model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
|
||||
model: /media/SanDisk/ai/models_live/nomic-embed-text-v1.f32.gguf
|
||||
gpu-layers: -1
|
||||
flash-attn: true
|
||||
# host: 0.0.0.0
|
||||
embeddings: true
|
||||
- name: "big-chat"
|
||||
- name: "chat"
|
||||
port: 18082
|
||||
vram_usage: 26.5G
|
||||
ram_usage: 2.5G
|
||||
|
@ -46,63 +48,79 @@ model_specs:
|
|||
args:
|
||||
ctx-size: 16384
|
||||
flash-attn: true
|
||||
model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
|
||||
model: /media/SanDisk/ai/models_live/mistralai_Mistral-Small-3.1-24B-Instruct-2503-IQ4_XS.gguf
|
||||
gpu-layers: 9999
|
||||
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
|
||||
model-draft: /media/SanDisk/ai/models_live/Mistral-Small-3.1-DRAFT-0.5B.Q8_0.gguf
|
||||
gpu-layers-draft: 9999
|
||||
# draft-max: 16
|
||||
# draft-min: 5
|
||||
- name: "bigger-chat"
|
||||
port: 18085
|
||||
vram_usage: 29G
|
||||
ram_usage: 5G
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: 0
|
||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||
args:
|
||||
ctx-size: 8192
|
||||
flash-attn: true
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
|
||||
gpu-layers: 9999
|
||||
model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
|
||||
gpu-layers-draft: 0
|
||||
# draft-max: 16
|
||||
# draft-min: 5
|
||||
- name: "bigger-chat-2"
|
||||
port: 18083
|
||||
vram_usage: 29G
|
||||
ram_usage: 5G
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: 0
|
||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||
args:
|
||||
ctx-size: 8192
|
||||
flash-attn: true
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf
|
||||
gpu-layers: 9999
|
||||
# model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
|
||||
# gpu-layers-draft: 0
|
||||
# draft-max: 16
|
||||
# draft-min: 5
|
||||
- name: "deep-think"
|
||||
port: 18084
|
||||
vram_usage: 29G
|
||||
ram_usage: 5G
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: 0
|
||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||
args:
|
||||
ctx-size: 32768
|
||||
flash-attn: true
|
||||
draft-max: 16
|
||||
draft-min: 4
|
||||
# - name: "big-chat"
|
||||
# port: 18082
|
||||
# vram_usage: 26.5G
|
||||
# ram_usage: 2.5G
|
||||
# env:
|
||||
# CUDA_VISIBLE_DEVICES: 0
|
||||
# HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||
# args:
|
||||
# ctx-size: 16384
|
||||
# flash-attn: true
|
||||
# model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
|
||||
# gpu-layers: 9999
|
||||
# model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
|
||||
# gpu-layers-draft: 9999
|
||||
# # draft-max: 16
|
||||
# # draft-min: 5
|
||||
# - name: "bigger-chat"
|
||||
# port: 18085
|
||||
# vram_usage: 29G
|
||||
# ram_usage: 5G
|
||||
# env:
|
||||
# CUDA_VISIBLE_DEVICES: 0
|
||||
# HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||
# args:
|
||||
# ctx-size: 8192
|
||||
# flash-attn: true
|
||||
# cache-type-k: q8_0
|
||||
# cache-type-v: q8_0
|
||||
model: /media/SanDisk/ai/models_live/QwQ-32B-Preview-IQ4_XS.gguf
|
||||
gpu-layers: 9999
|
||||
# model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
|
||||
# gpu-layers: 9999
|
||||
# model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
|
||||
# gpu-layers-draft: 0
|
||||
# draft-max: 16
|
||||
# draft-min: 5
|
||||
# # draft-max: 16
|
||||
# # draft-min: 5
|
||||
# - name: "bigger-chat-2"
|
||||
# port: 18083
|
||||
# vram_usage: 29G
|
||||
# ram_usage: 5G
|
||||
# env:
|
||||
# CUDA_VISIBLE_DEVICES: 0
|
||||
# HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||
# args:
|
||||
# ctx-size: 8192
|
||||
# flash-attn: true
|
||||
# cache-type-k: q8_0
|
||||
# cache-type-v: q8_0
|
||||
# model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf
|
||||
# gpu-layers: 9999
|
||||
# # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
|
||||
# # gpu-layers-draft: 0
|
||||
# # draft-max: 16
|
||||
# # draft-min: 5
|
||||
# - name: "deep-think"
|
||||
# port: 18084
|
||||
# vram_usage: 29G
|
||||
# ram_usage: 5G
|
||||
# env:
|
||||
# CUDA_VISIBLE_DEVICES: 0
|
||||
# HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||
# args:
|
||||
# ctx-size: 32768
|
||||
# flash-attn: true
|
||||
# cache-type-k: q8_0
|
||||
# cache-type-v: q8_0
|
||||
# model: /media/SanDisk/ai/models_live/Qwen_QwQ-32B-IQ4_XS.gguf
|
||||
# gpu-layers: 9999
|
||||
# # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
|
||||
# # gpu-layers-draft: 0
|
||||
# # draft-max: 16
|
||||
# # draft-min: 5
|
||||
|
|
Loading…
Add table
Reference in a new issue