Compare commits

...

2 commits

Author SHA1 Message Date
f799554173
Bump 2025-05-02 02:58:07 +02:00
cdd0a84d67
Add gfx1030 2025-04-29 09:36:22 +02:00
3 changed files with 21 additions and 11 deletions

8
flake.lock generated
View file

@ -64,16 +64,16 @@
]
},
"locked": {
"lastModified": 1745868199,
"narHash": "sha256-kTy1oNcN4bNHDV7yqIOSTo/JUN4enHeqyUeVE/q4iHg=",
"lastModified": 1745909149,
"narHash": "sha256-TDGa+/P14TAS1DC9MsTilNQ3sw/mL08s4KYIlTwW850=",
"owner": "ggerganov",
"repo": "llama.cpp",
"rev": "eaea3253244dc4bbe07f6cd81325847ccc6cf93e",
"rev": "b6ce7430b7eb51f032152316880204e0a9c0470e",
"type": "github"
},
"original": {
"owner": "ggerganov",
"ref": "b5214",
"ref": "b5216",
"repo": "llama.cpp",
"type": "github"
}

View file

@ -19,7 +19,7 @@
flake-parts.url = "github:hercules-ci/flake-parts";
devshell.url = "github:numtide/devshell";
llama-cpp = {
url = "github:ggerganov/llama.cpp/b5214";
url = "github:ggerganov/llama.cpp/b5216";
inputs.nixpkgs.follows = "nixpkgs";
inputs.flake-parts.follows = "flake-parts";
};
@ -89,6 +89,7 @@
wayland
];
rocmTargets = [
"gfx1030"
"gfx1100"
"gfx1102"
"gfx1103"

View file

@ -3,9 +3,9 @@ system_resources:
vram: 30G
model_specs:
- name: "tabby-code"
port: 18080
port: 28080 # to make sure it's not used for now
# internal_port: 28080 # Optional
autostart: true
autostart: false
vram_usage: 27G # Coder-32B + draft 0.5B
ram_usage: 3G # Coder-32B + draft 0.5B
# vram_usage: 8.25G # Coder-7B
@ -21,12 +21,12 @@ model_specs:
gpu-layers: 9999
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Instruct-Q8_0.gguf
gpu-layers-draft: 9999
cache-type-k: q8_0
# cache-type-k: q8_0
cache-type-v: q8_0
draft-max: 16
draft-min: 5
- name: "tabby-embeddings"
port: 18081
port: 28081
vram_usage: 0.4G
ram_usage: 2.5G
env:
@ -46,10 +46,19 @@ model_specs:
CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args:
ctx-size: 16384
ctx-size: 65536
flash-attn: true
model: /media/SanDisk/ai/models_live/Qwen_Qwen3-30B-A3B-Q5_K_S.gguf
model: /media/SanDisk/ai/models_live/Qwen_Qwen3-30B-A3B-Q4_K_L.gguf
cache-type-k: f16
cache-type-v: f16
gpu-layers: 9999
rope-scaling: yarn
rope-scale: 4
yarn-orig-ctx: 32768
temp: 0.65
top-p: 0.9
top-k: 20
min-p: 0.0
# model-draft: /media/SanDisk/ai/models_live/Qwen_Qwen3-0.6B-Q6_K.gguf
# gpu-layers-draft: 9999
# draft-max: 16