---
# redvault-ai/llama_proxy_man/config.yaml
#
# Configuration for llama_proxy_man: global resource budgets plus one entry
# per managed model server. The `args` mappings look like llama.cpp server
# CLI flags (flash-attn, ctx-size, gpu-layers, model-draft, ...) — confirm
# against the consuming binary.
system_resources:
  # Total RAM/VRAM budget the manager may allocate across all running models.
  ram: 48G
  vram: 30G
model_specs:
  # Coding model: Qwen2.5-Coder-32B with a 0.5B draft model for
  # speculative decoding.
  - name: "tabby-code"
    port: 18080
    # internal_port: 28080 # Optional
    autostart: true
    vram_usage: 26.7G  # Coder-32B + draft 0.5B
    ram_usage: 3G  # Coder-32B + draft 0.5B
    # vram_usage: 8.25G # Coder-7B
    # ram_usage: 2.6G # Coder-7B
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      # host: 0.0.0.0
      flash-attn: true
      ctx-size: 32768
      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
      gpu-layers-draft: 9999
      draft-max: 16
      draft-min: 5
  # Embedding model (nomic-embed-text), server run in embeddings mode.
  - name: "tabby-embeddings"
    port: 18081
    vram_usage: 0.4G
    ram_usage: 2.5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
      gpu-layers: -1
      flash-attn: true
      # host: 0.0.0.0
      embeddings: true
  # General chat: Qwen2.5-32B with a 0.5B draft model.
  - name: "big-chat"
    port: 18082
    vram_usage: 26.5G
    ram_usage: 2.5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: 16384
      flash-attn: true
      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
      gpu-layers-draft: 9999
      # draft-max: 16
      # draft-min: 5
  # 70B chat model; KV cache quantized to q8_0 to fit the VRAM budget,
  # draft model kept on CPU (gpu-layers-draft: 0).
  - name: "bigger-chat"
    port: 18085
    vram_usage: 29G
    ram_usage: 5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: 8192
      flash-attn: true
      cache-type-k: q8_0
      cache-type-v: q8_0
      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
      gpu-layers-draft: 0
      # draft-max: 16
      # draft-min: 5
  # Llama 3.3 70B variant; speculative decoding currently disabled.
  - name: "bigger-chat-2"
    port: 18083
    vram_usage: 29G
    ram_usage: 5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: 8192
      flash-attn: true
      cache-type-k: q8_0
      cache-type-v: q8_0
      model: /media/SanDisk/ai/models_live/Llama-3.3-70B-Instruct-IQ3_XXS.gguf
      gpu-layers: 9999
      # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
      # gpu-layers-draft: 0
      # draft-max: 16
      # draft-min: 5
  # Reasoning model (QwQ-32B-Preview) with a large context; KV-cache
  # quantization and speculative decoding disabled.
  - name: "deep-think"
    port: 18084
    vram_usage: 29G
    ram_usage: 5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      ctx-size: 32768
      flash-attn: true
      # cache-type-k: q8_0
      # cache-type-v: q8_0
      model: /media/SanDisk/ai/models_live/QwQ-32B-Preview-IQ4_XS.gguf
      gpu-layers: 9999
      # model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
      # gpu-layers-draft: 0
      # draft-max: 16
      # draft-min: 5