Compare commits


3 commits

4 changed files with 85 additions and 52 deletions

flake.lock (generated)

@@ -5,11 +5,11 @@
         "nixpkgs": "nixpkgs"
       },
       "locked": {
-        "lastModified": 1722113426,
-        "narHash": "sha256-Yo/3loq572A8Su6aY5GP56knpuKYRvM2a1meP9oJZCw=",
+        "lastModified": 1728330715,
+        "narHash": "sha256-xRJ2nPOXb//u1jaBnDP56M7v5ldavjbtR6lfGqSvcKg=",
         "owner": "numtide",
         "repo": "devshell",
-        "rev": "67cce7359e4cd3c45296fb4aaf6a19e2a9c757ae",
+        "rev": "dd6b80932022cea34a019e2bb32f6fa9e494dfef",
         "type": "github"
       },
       "original": {
@@ -23,11 +23,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1725234343,
-        "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
+        "lastModified": 1730504689,
+        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
+        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
         "type": "github"
       },
       "original": {
@@ -41,11 +41,11 @@
         "systems": "systems"
       },
       "locked": {
-        "lastModified": 1710146030,
-        "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
+        "lastModified": 1726560853,
+        "narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=",
         "owner": "numtide",
         "repo": "flake-utils",
-        "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
+        "rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a",
         "type": "github"
       },
       "original": {
@@ -64,16 +64,16 @@
         ]
       },
       "locked": {
-        "lastModified": 1728330908,
-        "narHash": "sha256-2N7yfI0N4Up+aYzq7++BqMXZhuPcQGskSuq0TUcK5V0=",
+        "lastModified": 1732585640,
+        "narHash": "sha256-sZxUPkGSTpcGgaRoB6X0xqodZMcqayCtOSceZxc0FjU=",
         "owner": "ggerganov",
         "repo": "llama.cpp",
-        "rev": "6374743747b14db4eb73ce82ae449a2978bc3b47",
+        "rev": "0eb4e12beebabae46d37b78742f4c5d4dbe52dc1",
         "type": "github"
       },
       "original": {
         "owner": "ggerganov",
-        "ref": "b3896",
+        "ref": "b4174",
         "repo": "llama.cpp",
         "type": "github"
       }
@@ -96,23 +96,23 @@
       },
     "nixpkgs-lib": {
       "locked": {
-        "lastModified": 1725233747,
-        "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
+        "lastModified": 1730504152,
+        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
       },
       "original": {
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
       }
     },
     "nixpkgs_2": {
       "locked": {
-        "lastModified": 1724208548,
-        "narHash": "sha256-8Aiur5lv2L8o9ErxHqS2F293MHiHCoRG8C4vCwhkeXo=",
+        "lastModified": 1730958623,
+        "narHash": "sha256-JwQZIGSYnRNOgDDoIgqKITrPVil+RMWHsZH1eE1VGN0=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "4c30668e1edb7348169407f218fa7c71a94b17f3",
+        "rev": "85f7e662eda4fa3a995556527c87b2524b691933",
         "type": "github"
       },
       "original": {
@@ -124,11 +124,11 @@
       },
     "nixpkgs_3": {
       "locked": {
-        "lastModified": 1718428119,
-        "narHash": "sha256-WdWDpNaq6u1IPtxtYHHWpl5BmabtpmLnMAx0RdJ/vo8=",
+        "lastModified": 1728538411,
+        "narHash": "sha256-f0SBJz1eZ2yOuKUr5CA9BHULGXVSn6miBuUWdTyhUhU=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e6cea36f83499eb4e9cd184c8a8e823296b50ad5",
+        "rev": "b69de56fac8c2b6f8fd27f2eca01dcda8e0a4221",
         "type": "github"
       },
       "original": {
@@ -170,11 +170,11 @@
         "nixpkgs": "nixpkgs_3"
       },
       "locked": {
-        "lastModified": 1724206841,
-        "narHash": "sha256-L8dKaX4T3k+TR2fEHCfGbH4UXdspovz/pj87iai9qmc=",
+        "lastModified": 1731119076,
+        "narHash": "sha256-2eVhmocCZHJlFAz6Mt3EwPdFFVAtGgIySJc1EHQVxcc=",
         "owner": "oxalica",
         "repo": "rust-overlay",
-        "rev": "45e98fbd62c32e5927e952d2833fa1ba4fb35a61",
+        "rev": "23c4b3ba5f806fcf25d5a3b6b54fa0d07854c032",
         "type": "github"
       },
       "original": {

flake.nix

@@ -2,12 +2,12 @@
   description = "A Nix-flake-based Rust development environment";
   nixConfig = {
     extra-substituters = [
-      "https://nixcache.vlt81.de"
+      # "https://nixcache.vlt81.de"
       "https://llama-cpp.cachix.org"
       "https://cuda-maintainers.cachix.org"
     ];
     extra-trusted-public-keys = [
-      "nixcache.vlt81.de:nw0FfUpePtL6P3IMNT9X6oln0Wg9REZINtkkI9SisqQ="
+      # "nixcache.vlt81.de:nw0FfUpePtL6P3IMNT9X6oln0Wg9REZINtkkI9SisqQ="
       "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
       "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
     ];
@@ -23,7 +23,7 @@
       flake = false;
     };
     llama-cpp = {
-      url = "github:ggerganov/llama.cpp/b3896";
+      url = "github:ggerganov/llama.cpp/b4174";
       inputs.nixpkgs.follows = "nixpkgs";
       inputs.flake-parts.follows = "flake-parts";
     };
@@ -49,6 +49,9 @@
           (final: prev: {
             customRustToolchain = prev.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml;
           })
+          (final: prev: {
+            nodejs-16_x = prev.nodePackages.nodejs; # needed for npmlock2nix
+          })
           (final: prev: {
             npmlock2nix = import npmlock2nix { pkgs = prev; };
           })
@@ -73,7 +76,7 @@
         };
         customNodeModules = pkgs.npmlock2nix.v2.node_modules {
           src = ./.;
-          nodejs = pkgs.nodejs_20;
+          nodejs = pkgs.nodejs_22;
         };
         buildInputs = with pkgs; [
           harfbuzz
@@ -121,7 +124,8 @@
             (lib.cmakeBool "GGML_NATIVE" true)
             (lib.cmakeBool "GGML_BLAS" false)
             (lib.cmakeBool "GGML_CUDA" false)
-            (lib.cmakeBool "GGML_HIPBLAS" true)
+            (lib.cmakeBool "GGML_HIP" true) # the new flag? kinda undocumented
+            (lib.cmakeBool "GGML_HIPBLAS" true) # seems to be deprecated
             (lib.cmakeBool "GGML_METAL" false)
             (lib.cmakeBool "GGML_VULKAN" false)
             (lib.cmakeBool "GGML_STATIC" false)
@@ -131,6 +135,7 @@
             (lib.cmakeBool "GGML_AVX512" false)
             (lib.cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
             (lib.cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmTargets))
+            (lib.cmakeFeature "AMDGPU_TARGETS" (builtins.concatStringsSep ";" rocmTargets))
           ];
         });
       };
@@ -162,13 +167,13 @@
           cargo-outdated
           cargo-release
           calc
-          jre8 # needed for xmlls
+          # jre8 # needed for xmlls
           dart-sass
           fish
           inotify-tools
           leptosfmt
           mold
-          nodejs_20
+          # nodejs_20
           pkg-config
           rustywind
           sccache
@@ -179,16 +184,15 @@
         buildInputs = buildInputs;
         shellHook = ''
           # setup node-modules
           export NPM_LOCAL_PREFIX=${customNodeModules}/node_modules
           (ln -s $NPM_LOCAL_PREFIX ./node_modules 2>/dev/null || unlink ./node_modules) && ln -s $NPM_LOCAL_PREFIX ./node_modules 2>/dev/null
           # export NIX_LD_LIBRARY_PATH=${pkgs.lib.makeLibraryPath buildInputs}:$NIX_LD_LIBRARY_PATH
           export LD_LIBRARY_PATH="${pkgs.lib.makeLibraryPath buildInputs}"
           export LEPTOS_SASS_VERSION=1.71.0
           export LEPTOS_TAILWIND_VERSION=3.4.1
           export MALLOC_CONF=thp:always,metadata_thp:always
         '';
       };
     });
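
In short: the private nixcache.vlt81.de substituter and its key are commented out, and the llama.cpp pin moves from b3896 to b4174. Per the author's own comments, the build switches to the newer GGML_HIP CMake flag while keeping the apparently deprecated GGML_HIPBLAS for good measure, and AMDGPU_TARGETS is now set alongside CMAKE_HIP_ARCHITECTURES, presumably because different ROCm CMake paths read different variables. The new overlay aliases nodejs-16_x to the current Node from nodePackages, apparently to keep npmlock2nix evaluating now that nixpkgs no longer ships a Node 16 attribute; meanwhile node_modules builds move to nodejs_22, and the dev shell comments out jre8 and its pinned nodejs_20.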

config.yaml

@@ -5,30 +5,37 @@ models:
   - name: "tabby-code"
     port: 18080
     internal_port: 28080
-    vram_usage: 8.25G
-    ram_usage: 2.6G
+    autostart: true
+    vram_usage: 26.7G # Coder-32B + draft 0.5B
+    ram_usage: 3G # Coder-32B + draft 0.5B
+    # vram_usage: 8.25G # Coder-7B
+    # ram_usage: 2.6G # Coder-7B
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf
-      gpu-layers: 9999
+      # host: 0.0.0.0
       flash-attn: true
       ctx-size: 32768
-      host: 0.0.0.0
+      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-32B-Instruct-IQ4_XS.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-Coder-0.5B-Q8_0.gguf
+      gpu-layers-draft: 9999
+      draft-max: 16
+      draft-min: 5
   - name: "tabby-embeddings"
     port: 18081
     internal_port: 28081
-    vram_usage: 1G
-    ram_usage: 2G
+    vram_usage: 0.4G
+    ram_usage: 2.5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
       model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
-      gpu-layers: 9999
+      gpu-layers: -1
       flash-attn: true
-      host: 0.0.0.0
+      # host: 0.0.0.0
       embeddings: true
   - name: "big-chat"
     port: 18082
@@ -43,4 +50,21 @@ models:
       gpu-layers: 9999
       ctx-size: 16384
       flash-attn: true
-      host: 0.0.0.0
+      # host: 0.0.0.0
+  - name: "bigger-chat"
+    port: 18083
+    internal_port: 28083
+    vram_usage: 29G
+    ram_usage: 4G
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
+    args:
+      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
+      gpu-layers: 9999
+      flash-attn: true
+      # ctx-size: 8192
+      ctx-size: 16384
+      # host: 0.0.0.0
+      cache-type-k: q8_0
+      cache-type-v: q8_0
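
The headline change here is tabby-code moving from Coder-7B to Coder-32B with a 0.5B draft model for speculative decoding (llama-server's draft-max/draft-min bound how many tokens are drafted per step), plus a new bigger-chat entry that quantizes its KV cache to q8_0 to fit a 16k context. Each model's args map is handed to llama-server as CLI flags (the Rust side below does cmd.args(&args)), so flash-attn: true plausibly becomes --flash-attn and ctx-size: 32768 becomes --ctx-size 32768. A hedged sketch of that YAML-to-flag translation; the repo's actual conversion code is not shown in this diff:

// Hedged sketch, not the repo's actual code: flatten a model's `args`
// mapping into argv for llama-server. Booleans become bare toggles
// (`--flash-attn`), everything else becomes `--key value`.
fn args_to_flags(args: &serde_yaml::Mapping) -> Vec<String> {
    let mut flags = Vec::new();
    for (key, value) in args {
        let key = key.as_str().expect("arg keys are strings");
        match value {
            serde_yaml::Value::Bool(true) => flags.push(format!("--{key}")),
            serde_yaml::Value::Bool(false) => {} // omit disabled toggles
            serde_yaml::Value::String(s) => {
                flags.push(format!("--{key}"));
                flags.push(s.clone());
            }
            other => {
                flags.push(format!("--{key}"));
                flags.push(serde_yaml::to_string(other).expect("scalar").trim().to_string());
            }
        }
    }
    flags
}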

main.rs

@@ -84,6 +84,10 @@ struct SharedState {
 #[tokio::main]
 async fn main() {
+    // TODO add autostart of models based on config
+    //      abstract starting logic out of the handler to allow separate calls to start
+    //      maybe add to SharedState & LLamaInstance?
     initialize_logger();
     // Read and parse the YAML configuration
     let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
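
The autostart TODO pairs with the new autostart: true flag in config.yaml. A minimal sketch of that direction; ModelConfig, start_model, and autostart_models are hypothetical names, assuming the spawn logic gets factored out of handle_request as the comment suggests:

use std::collections::HashMap;

// Stand-in for the real config struct; only the fields needed here.
struct ModelConfig {
    autostart: bool,
    internal_port: u16,
}

// Hypothetical: the spawn logic from handle_request, factored out so it
// can be called from both the request path and startup.
async fn start_model(name: &str, cfg: &ModelConfig) -> std::io::Result<()> {
    println!("starting {name} on internal port {}", cfg.internal_port);
    Ok(())
}

// Walk the parsed config and bring up every model marked `autostart: true`.
async fn autostart_models(models: &HashMap<String, ModelConfig>) {
    for (name, cfg) in models {
        if cfg.autostart {
            if let Err(e) = start_model(name, cfg).await {
                eprintln!("autostart of {name} failed: {e}");
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let models = HashMap::from([(
        "tabby-code".to_string(),
        ModelConfig { autostart: true, internal_port: 28080 },
    )]);
    autostart_models(&models).await;
}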
@@ -264,9 +268,10 @@ async fn handle_request(
     cmd.kill_on_drop(true);
     cmd.envs(model_config.env.clone());
     cmd.args(&args);
+    // TODO use openport crate via pick_random_unused_port for determining these
     cmd.arg("--port");
     cmd.arg(format!("{}", model_config.internal_port));
-    cmd.stdout(Stdio::null()).stderr(Stdio::null());
+    cmd.stdout(Stdio::null()).stderr(Stdio::null()); // TODO save output and allow retrieval via api
     tracing::info!("Starting llama-server with {:?}", cmd);
     let process = Arc::new(Mutex::new(
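
The two TODOs in this hunk sketch naturally together: pick the internal port at spawn time instead of fixing it in config, and pipe the server's output instead of discarding it. A hedged sketch; it assumes the openport crate named in the TODO exposes pick_random_unused_port() returning Option<u16>, which is worth verifying:

use std::process::Stdio;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::Command;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Assumption: openport::pick_random_unused_port() -> Option<u16>,
    // as named in the TODO above.
    let port = openport::pick_random_unused_port().expect("no free port available");

    let mut cmd = Command::new("llama-server");
    cmd.kill_on_drop(true); // mirrors the existing code: child dies with the manager
    cmd.arg("--port");
    cmd.arg(port.to_string());
    // Capture output instead of discarding it with Stdio::null().
    cmd.stdout(Stdio::piped());
    cmd.stderr(Stdio::piped());

    let mut child = cmd.spawn()?;
    let stdout = child.stdout.take().expect("stdout was piped");

    // A real implementation would push these lines into SharedState behind a
    // Mutex so an API endpoint can return them; here they just go to stdout.
    let mut lines = BufReader::new(stdout).lines();
    while let Some(line) = lines.next_line().await? {
        println!("llama-server: {line}");
    }
    Ok(())
}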