Compare commits


No commits in common. "3b4655728dfada0bb1b1ae01165c84b38881409d" and "58b04b09d1da3f1d68f55e2d1565b169ceb6c279" have entirely different histories.

4 changed files with 52 additions and 85 deletions

flake.lock generated (52 lines changed)

@@ -5,11 +5,11 @@
       "nixpkgs": "nixpkgs"
     },
     "locked": {
-      "lastModified": 1728330715,
-      "narHash": "sha256-xRJ2nPOXb//u1jaBnDP56M7v5ldavjbtR6lfGqSvcKg=",
+      "lastModified": 1722113426,
+      "narHash": "sha256-Yo/3loq572A8Su6aY5GP56knpuKYRvM2a1meP9oJZCw=",
       "owner": "numtide",
       "repo": "devshell",
-      "rev": "dd6b80932022cea34a019e2bb32f6fa9e494dfef",
+      "rev": "67cce7359e4cd3c45296fb4aaf6a19e2a9c757ae",
       "type": "github"
     },
     "original": {
@@ -23,11 +23,11 @@
       "nixpkgs-lib": "nixpkgs-lib"
     },
     "locked": {
-      "lastModified": 1730504689,
-      "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
+      "lastModified": 1725234343,
+      "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
       "owner": "hercules-ci",
       "repo": "flake-parts",
-      "rev": "506278e768c2a08bec68eb62932193e341f55c90",
+      "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
       "type": "github"
     },
     "original": {
@@ -41,11 +41,11 @@
       "systems": "systems"
     },
     "locked": {
-      "lastModified": 1726560853,
-      "narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=",
+      "lastModified": 1710146030,
+      "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
       "owner": "numtide",
       "repo": "flake-utils",
-      "rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a",
+      "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
       "type": "github"
     },
     "original": {
@@ -64,16 +64,16 @@
       ]
     },
     "locked": {
-      "lastModified": 1732585640,
-      "narHash": "sha256-sZxUPkGSTpcGgaRoB6X0xqodZMcqayCtOSceZxc0FjU=",
+      "lastModified": 1728330908,
+      "narHash": "sha256-2N7yfI0N4Up+aYzq7++BqMXZhuPcQGskSuq0TUcK5V0=",
       "owner": "ggerganov",
       "repo": "llama.cpp",
-      "rev": "0eb4e12beebabae46d37b78742f4c5d4dbe52dc1",
+      "rev": "6374743747b14db4eb73ce82ae449a2978bc3b47",
       "type": "github"
     },
     "original": {
       "owner": "ggerganov",
-      "ref": "b4174",
+      "ref": "b3896",
       "repo": "llama.cpp",
       "type": "github"
     }
@@ -96,23 +96,23 @@
   },
   "nixpkgs-lib": {
     "locked": {
-      "lastModified": 1730504152,
-      "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
+      "lastModified": 1725233747,
+      "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
       "type": "tarball",
-      "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
+      "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
     },
     "original": {
       "type": "tarball",
-      "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
+      "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
     }
   },
   "nixpkgs_2": {
     "locked": {
-      "lastModified": 1730958623,
-      "narHash": "sha256-JwQZIGSYnRNOgDDoIgqKITrPVil+RMWHsZH1eE1VGN0=",
+      "lastModified": 1724208548,
+      "narHash": "sha256-8Aiur5lv2L8o9ErxHqS2F293MHiHCoRG8C4vCwhkeXo=",
       "owner": "NixOS",
       "repo": "nixpkgs",
-      "rev": "85f7e662eda4fa3a995556527c87b2524b691933",
+      "rev": "4c30668e1edb7348169407f218fa7c71a94b17f3",
       "type": "github"
     },
     "original": {
@@ -124,11 +124,11 @@
   },
   "nixpkgs_3": {
     "locked": {
-      "lastModified": 1728538411,
-      "narHash": "sha256-f0SBJz1eZ2yOuKUr5CA9BHULGXVSn6miBuUWdTyhUhU=",
+      "lastModified": 1718428119,
-      "narHash": "sha256-WdWDpNaq6u1IPtxtYHHWpl5BmabtpmLnMAx0RdJ/vo8=",
       "owner": "NixOS",
       "repo": "nixpkgs",
-      "rev": "b69de56fac8c2b6f8fd27f2eca01dcda8e0a4221",
+      "rev": "e6cea36f83499eb4e9cd184c8a8e823296b50ad5",
       "type": "github"
     },
     "original": {
@@ -170,11 +170,11 @@
       "nixpkgs": "nixpkgs_3"
     },
     "locked": {
-      "lastModified": 1731119076,
-      "narHash": "sha256-2eVhmocCZHJlFAz6Mt3EwPdFFVAtGgIySJc1EHQVxcc=",
+      "lastModified": 1724206841,
+      "narHash": "sha256-L8dKaX4T3k+TR2fEHCfGbH4UXdspovz/pj87iai9qmc=",
       "owner": "oxalica",
       "repo": "rust-overlay",
-      "rev": "23c4b3ba5f806fcf25d5a3b6b54fa0d07854c032",
+      "rev": "45e98fbd62c32e5927e952d2833fa1ba4fb35a61",
       "type": "github"
     },
     "original": {

flake.nix

@@ -2,12 +2,12 @@
   description = "A Nix-flake-based Rust development environment";
   nixConfig = {
     extra-substituters = [
-      # "https://nixcache.vlt81.de"
+      "https://nixcache.vlt81.de"
       "https://llama-cpp.cachix.org"
       "https://cuda-maintainers.cachix.org"
     ];
     extra-trusted-public-keys = [
-      # "nixcache.vlt81.de:nw0FfUpePtL6P3IMNT9X6oln0Wg9REZINtkkI9SisqQ="
+      "nixcache.vlt81.de:nw0FfUpePtL6P3IMNT9X6oln0Wg9REZINtkkI9SisqQ="
       "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
       "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
     ];
@@ -23,7 +23,7 @@
       flake = false;
     };
     llama-cpp = {
-      url = "github:ggerganov/llama.cpp/b4174";
+      url = "github:ggerganov/llama.cpp/b3896";
       inputs.nixpkgs.follows = "nixpkgs";
       inputs.flake-parts.follows = "flake-parts";
     };
@@ -49,9 +49,6 @@
           (final: prev: {
             customRustToolchain = prev.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml;
           })
-          (final: prev: {
-            nodejs-16_x = prev.nodePackages.nodejs; # needed for npmlock2nix
-          })
           (final: prev: {
             npmlock2nix = import npmlock2nix { pkgs = prev; };
           })
@@ -76,7 +73,7 @@
         };
         customNodeModules = pkgs.npmlock2nix.v2.node_modules {
           src = ./.;
-          nodejs = pkgs.nodejs_22;
+          nodejs = pkgs.nodejs_20;
         };
         buildInputs = with pkgs; [
           harfbuzz
@@ -124,8 +121,7 @@
           (lib.cmakeBool "GGML_NATIVE" true)
           (lib.cmakeBool "GGML_BLAS" false)
           (lib.cmakeBool "GGML_CUDA" false)
-          (lib.cmakeBool "GGML_HIP" true) # new one ? kinda undocumented ?
-          (lib.cmakeBool "GGML_HIPBLAS" true) # seems to be depr
+          (lib.cmakeBool "GGML_HIPBLAS" true)
           (lib.cmakeBool "GGML_METAL" false)
           (lib.cmakeBool "GGML_VULKAN" false)
           (lib.cmakeBool "GGML_STATIC" false)
@@ -135,7 +131,6 @@
           (lib.cmakeBool "GGML_AVX512" false)
           (lib.cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
           (lib.cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmTargets))
-          (lib.cmakeFeature "AMDGPU_TARGETS" (builtins.concatStringsSep ";" rocmTargets))
         ];
       });
     };
@@ -167,13 +162,13 @@
           cargo-outdated
           cargo-release
           calc
-          # jre8 # needed for xmlls
+          jre8 # needed for xmlls
           dart-sass
           fish
           inotify-tools
           leptosfmt
           mold
-          # nodejs_20
+          nodejs_20
           pkg-config
           rustywind
           sccache
@@ -184,15 +179,16 @@
         buildInputs = buildInputs;
         shellHook = ''
-          # setup node-modules
-          export NPM_LOCAL_PREFIX=${customNodeModules}/node_modules
-          (ln -s $NPM_LOCAL_PREFIX ./node_modules 2>/dev/null || unlink ./node_modules) && ln -s $NPM_LOCAL_PREFIX ./node_modules 2>/dev/null
+          # setup node-modules
+          export NPM_LOCAL_PREFIX=${customNodeModules}/node_modules
+          (ln -s $NPM_LOCAL_PREFIX ./node_modules 2>/dev/null || unlink ./node_modules) && ln -s $NPM_LOCAL_PREFIX ./node_modules 2>/dev/null
-          # export NIX_LD_LIBRARY_PATH=${pkgs.lib.makeLibraryPath buildInputs}:$NIX_LD_LIBRARY_PATH
+          # export NIX_LD_LIBRARY_PATH=${pkgs.lib.makeLibraryPath buildInputs}:$NIX_LD_LIBRARY_PATH
           export LD_LIBRARY_PATH="${pkgs.lib.makeLibraryPath buildInputs}"
-          export LEPTOS_SASS_VERSION=1.71.0
-          export LEPTOS_TAILWIND_VERSION=3.4.1
-          export MALLOC_CONF=thp:always,metadata_thp:always
+          export LEPTOS_SASS_VERSION=1.71.0
+          export LEPTOS_TAILWIND_VERSION=3.4.1
+          export MALLOC_CONF=thp:always,metadata_thp:always
         '';
       };
     });

config.yaml

@@ -5,37 +5,30 @@ models:
   - name: "tabby-code"
     port: 18080
     internal_port: 28080
-    autostart: true
-    vram_usage: 26.7G # Coder-32B + draft 0.5B
-    ram_usage: 3G # Coder-32B + draft 0.5B
-    # vram_usage: 8.25G # Coder-7B
-    # ram_usage: 2.6G # Coder-7B
+    vram_usage: 8.25G
+    ram_usage: 2.6G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      # host: 0.0.0.0
-      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
-      gpu-layers: 9999
-      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
-      gpu-layers-draft: 9999
-      draft-max: 16
-      draft-min: 5
+      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf
+      gpu-layers: 9999
       flash-attn: true
       ctx-size: 32768
+      host: 0.0.0.0
   - name: "tabby-embeddings"
     port: 18081
     internal_port: 28081
-    vram_usage: 0.4G
-    ram_usage: 2.5G
+    vram_usage: 1G
+    ram_usage: 2G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
       model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
-      gpu-layers: -1
+      gpu-layers: 9999
       flash-attn: true
-      # host: 0.0.0.0
+      host: 0.0.0.0
       embeddings: true
   - name: "big-chat"
     port: 18082
@@ -50,21 +43,4 @@
       gpu-layers: 9999
       ctx-size: 16384
       flash-attn: true
-      # host: 0.0.0.0
-  - name: "bigger-chat"
-    port: 18083
-    internal_port: 28083
-    vram_usage: 29G
-    ram_usage: 4G
-    env:
-      CUDA_VISIBLE_DEVICES: 0
-      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
-    args:
-      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
-      gpu-layers: 9999
-      flash-attn: true
-      # ctx-size: 8192
-      ctx-size: 16384
-      # host: 0.0.0.0
-      cache-type-k: q8_0
-      cache-type-v: q8_0
+      host: 0.0.0.0
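
main.rs (below) reads this file as config.yaml, passes each entry's env map to the spawned process, and forwards the args map as llama-server flags. A minimal sketch of structs that could deserialize the layout above (the field names, the optionality choices, and the serde_yaml dependency are assumptions for illustration, not the repo's actual definitions):

use std::collections::HashMap;
use serde::Deserialize;

// Hypothetical mirror of config.yaml; the real structs in main.rs may differ.
#[derive(Debug, Deserialize)]
struct Config {
    models: Vec<ModelConfig>,
}

#[derive(Debug, Deserialize)]
struct ModelConfig {
    name: String,
    port: u16,
    internal_port: u16,
    #[serde(default)]
    autostart: bool, // present in the old side of the diff above
    vram_usage: String, // e.g. "8.25G"; kept as text unless something parses it
    ram_usage: String,
    #[serde(default)]
    env: HashMap<String, serde_yaml::Value>, // values like 0 and '11.0.0' both appear
    #[serde(default)]
    args: HashMap<String, serde_yaml::Value>, // llama-server flags, e.g. gpu-layers
}

fn main() {
    let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
    let config: Config = serde_yaml::from_str(&config_str).expect("invalid config.yaml");
    println!("{} models configured", config.models.len());
}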

main.rs

@@ -84,10 +84,6 @@ struct SharedState {
 #[tokio::main]
 async fn main() {
-    // TODO add autostart of models based on config
-    // abstract starting logic out of handler for this to allow separate calls to start
-    // maybe add to SharedState & LLamaInstance ?
     initialize_logger();
     // Read and parse the YAML configuration
     let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
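
The TODO block dropped here had sketched pulling the spawn logic out of the request handler so an autostart pass at boot could reuse it. A self-contained sketch of that shape; SharedState's real fields and LLamaInstance are not shown in this diff, so everything here is hypothetical:

use std::{collections::HashMap, sync::Arc};
use tokio::sync::Mutex;

// Hypothetical: one entry point usable by both an autostart loop at boot
// and the request handler on first use; u32 stands in for LLamaInstance.
#[derive(Clone, Default)]
struct SharedState {
    instances: Arc<Mutex<HashMap<String, u32>>>,
}

impl SharedState {
    async fn ensure_started(&self, name: &str) -> u32 {
        let mut instances = self.instances.lock().await;
        *instances.entry(name.to_string()).or_insert_with(|| {
            // spawn llama-server here and store its handle
            0
        })
    }
}

#[tokio::main]
async fn main() {
    let state = SharedState::default();
    state.ensure_started("tabby-code").await; // autostart at boot
    state.ensure_started("tabby-code").await; // later call from a request is a no-op
}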
@@ -268,10 +264,9 @@ async fn handle_request(
     cmd.kill_on_drop(true);
     cmd.envs(model_config.env.clone());
     cmd.args(&args);
-    // TODO use openport crate via pick_random_unused_port for determining these
     cmd.arg("--port");
     cmd.arg(format!("{}", model_config.internal_port));
-    cmd.stdout(Stdio::null()).stderr(Stdio::null()); // TODO save output and allow retrieval via api
+    cmd.stdout(Stdio::null()).stderr(Stdio::null());
     tracing::info!("Starting llama-server with {:?}", cmd);
     let process = Arc::new(Mutex::new(
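
The other TODO removed above pointed at the openport crate for picking the internal port at runtime instead of trusting the fixed internal_port from config.yaml. A rough sketch of that idea, assuming openport exposes pick_random_unused_port() -> Option<u16> as the TODO implies (the surrounding command setup is abbreviated):

use std::process::Stdio;
use tokio::process::Command;

// Hypothetical follow-up to the removed TODO: ask the OS for a free port
// rather than reading a fixed internal_port from the config file.
fn spawn_llama_server(model_path: &str) -> std::io::Result<(tokio::process::Child, u16)> {
    let port = openport::pick_random_unused_port().expect("no free TCP port"); // assumed API
    let mut cmd = Command::new("llama-server");
    cmd.arg("--model").arg(model_path);
    cmd.arg("--port").arg(port.to_string());
    cmd.kill_on_drop(true);
    cmd.stdout(Stdio::null()).stderr(Stdio::null());
    Ok((cmd.spawn()?, port))
}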