Compare commits

..

No commits in common. "f7995541736082043bc446aadc770255cc6cb2ae" and "bd572a0ccd9e4ea7e98a07aad16b7df79d37a569" have entirely different histories.

3 changed files with 11 additions and 21 deletions

8
flake.lock generated
View file

@@ -64,16 +64,16 @@
]
},
"locked": {
"lastModified": 1745909149,
"narHash": "sha256-TDGa+/P14TAS1DC9MsTilNQ3sw/mL08s4KYIlTwW850=",
"lastModified": 1745868199,
"narHash": "sha256-kTy1oNcN4bNHDV7yqIOSTo/JUN4enHeqyUeVE/q4iHg=",
"owner": "ggerganov",
"repo": "llama.cpp",
"rev": "b6ce7430b7eb51f032152316880204e0a9c0470e",
"rev": "eaea3253244dc4bbe07f6cd81325847ccc6cf93e",
"type": "github"
},
"original": {
"owner": "ggerganov",
"ref": "b5216",
"ref": "b5214",
"repo": "llama.cpp",
"type": "github"
}

View file

@@ -19,7 +19,7 @@
flake-parts.url = "github:hercules-ci/flake-parts";
devshell.url = "github:numtide/devshell";
llama-cpp = {
url = "github:ggerganov/llama.cpp/b5216";
url = "github:ggerganov/llama.cpp/b5214";
inputs.nixpkgs.follows = "nixpkgs";
inputs.flake-parts.follows = "flake-parts";
};
@@ -89,7 +89,6 @@
wayland
];
rocmTargets = [
"gfx1030"
"gfx1100"
"gfx1102"
"gfx1103"

View file

@@ -3,9 +3,9 @@ system_resources:
vram: 30G
model_specs:
- name: "tabby-code"
port: 28080 # to make sure it's not used for now
port: 18080
# internal_port: 28080 # Optional
autostart: false
autostart: true
vram_usage: 27G # Coder-32B + draft 0.5B
ram_usage: 3G # Coder-32B + draft 0.5B
# vram_usage: 8.25G # Coder-7B
@@ -21,12 +21,12 @@ model_specs:
gpu-layers: 9999
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Instruct-Q8_0.gguf
gpu-layers-draft: 9999
# cache-type-k: q8_0
cache-type-k: q8_0
cache-type-v: q8_0
draft-max: 16
draft-min: 5
- name: "tabby-embeddings"
port: 28081
port: 18081
vram_usage: 0.4G
ram_usage: 2.5G
env:
@@ -46,19 +46,10 @@ model_specs:
CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args:
ctx-size: 65536
ctx-size: 16384
flash-attn: true
model: /media/SanDisk/ai/models_live/Qwen_Qwen3-30B-A3B-Q4_K_L.gguf
cache-type-k: f16
cache-type-v: f16
model: /media/SanDisk/ai/models_live/Qwen_Qwen3-30B-A3B-Q5_K_S.gguf
gpu-layers: 9999
rope-scaling: yarn
rope-scale: 4
yarn-orig-ctx: 32768
temp: 0.65
top-p: 0.9
top-k: 20
min-p: 0.0
# model-draft: /media/SanDisk/ai/models_live/Qwen_Qwen3-0.6B-Q6_K.gguf
# gpu-layers-draft: 9999
# draft-max: 16