diff --git a/Cargo.lock b/Cargo.lock
index 8882d76..c982d96 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -215,7 +215,7 @@ dependencies = [
 
 [[package]]
 name = "axum-controller"
-version = "0.2.2"
+version = "0.2.1"
 dependencies = [
  "axum-controller-macros",
  "axum-typed-routing",
@@ -223,7 +223,7 @@ dependencies = [
 
 [[package]]
 name = "axum-controller-macros"
-version = "0.2.2"
+version = "0.2.1"
 dependencies = [
  "prettyplease",
  "proc-macro2",
diff --git a/flake.lock b/flake.lock
index 4820b73..a505047 100644
--- a/flake.lock
+++ b/flake.lock
@@ -64,16 +64,16 @@
         ]
       },
       "locked": {
-        "lastModified": 1744745160,
-        "narHash": "sha256-tTfodLDvQslNA5irDmavbOnHsCyaQ3uQKxmsoKtrKdU=",
+        "lastModified": 1745868199,
+        "narHash": "sha256-kTy1oNcN4bNHDV7yqIOSTo/JUN4enHeqyUeVE/q4iHg=",
         "owner": "ggerganov",
         "repo": "llama.cpp",
-        "rev": "80f19b41869728eeb6a26569957b92a773a2b2c6",
+        "rev": "eaea3253244dc4bbe07f6cd81325847ccc6cf93e",
         "type": "github"
       },
       "original": {
         "owner": "ggerganov",
-        "ref": "b5142",
+        "ref": "b5214",
         "repo": "llama.cpp",
         "type": "github"
       }
diff --git a/flake.nix b/flake.nix
index eac9198..e8543e5 100644
--- a/flake.nix
+++ b/flake.nix
@@ -19,7 +19,7 @@
     flake-parts.url = "github:hercules-ci/flake-parts";
     devshell.url = "github:numtide/devshell";
     llama-cpp = {
-      url = "github:ggerganov/llama.cpp/b5142";
+      url = "github:ggerganov/llama.cpp/b5214";
       inputs.nixpkgs.follows = "nixpkgs";
       inputs.flake-parts.follows = "flake-parts";
     };
diff --git a/llama_proxy_man/config.yaml b/llama_proxy_man/config.yaml
index e8048ca..9666414 100644
--- a/llama_proxy_man/config.yaml
+++ b/llama_proxy_man/config.yaml
@@ -48,12 +48,12 @@ model_specs:
     args:
       ctx-size: 16384
       flash-attn: true
-      model: /media/SanDisk/ai/models_live/mistralai_Mistral-Small-3.1-24B-Instruct-2503-IQ4_XS
+      model: /media/SanDisk/ai/models_live/Qwen_Qwen3-30B-A3B-Q5_K_S.gguf
       gpu-layers: 9999
-      model-draft: /media/SanDisk/ai/models_live/Mistral-Small-3.1-DRAFT-0.5B.Q8_0.gguf
-      gpu-layers-draft: 9999
-      draft-max: 16
-      draft-min: 4
+      # model-draft: /media/SanDisk/ai/models_live/Qwen_Qwen3-0.6B-Q6_K.gguf
+      # gpu-layers-draft: 9999
+      # draft-max: 16
+      # draft-min: 4
 # - name: "big-chat"
 #   port: 18082
 #   vram_usage: 26.5G