From 03918657919a30f0602e2e01818e2b23f4483a91 Mon Sep 17 00:00:00 2001 From: Tristan Druyen Date: Tue, 8 Oct 2024 17:27:45 +0200 Subject: [PATCH] Optimize by stopping large models first --- Cargo.lock | 1 + flake.lock | 8 +++--- flake.nix | 5 ++-- llama_proxy_man/Cargo.toml | 1 + llama_proxy_man/config.yaml | 42 ++++++++++++++++++++++--------- llama_proxy_man/src/main.rs | 49 +++++++++++++++++++++++++++++-------- 6 files changed, 79 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dda78f3..d038a1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4176,6 +4176,7 @@ dependencies = [ "axum", "futures", "hyper", + "itertools 0.13.0", "pin-project-lite", "reqwest", "reqwest-middleware", diff --git a/flake.lock b/flake.lock index c136b12..ecc1bd8 100644 --- a/flake.lock +++ b/flake.lock @@ -64,16 +64,16 @@ ] }, "locked": { - "lastModified": 1725824628, - "narHash": "sha256-oiVEb+PMKumREdoV1vEzxfSWFlHNNMgxADgfpFsR8pE=", + "lastModified": 1728330908, + "narHash": "sha256-2N7yfI0N4Up+aYzq7++BqMXZhuPcQGskSuq0TUcK5V0=", "owner": "ggerganov", "repo": "llama.cpp", - "rev": "daa9623ab051a8162ae750b150b9522571b55f21", + "rev": "6374743747b14db4eb73ce82ae449a2978bc3b47", "type": "github" }, "original": { "owner": "ggerganov", - "ref": "b3707", + "ref": "b3896", "repo": "llama.cpp", "type": "github" } diff --git a/flake.nix b/flake.nix index edff2d0..75ac496 100644 --- a/flake.nix +++ b/flake.nix @@ -23,7 +23,7 @@ flake = false; }; llama-cpp = { - url = "github:ggerganov/llama.cpp/b3707"; + url = "github:ggerganov/llama.cpp/b3896"; inputs.nixpkgs.follows = "nixpkgs"; inputs.flake-parts.follows = "flake-parts"; }; @@ -127,7 +127,8 @@ (lib.cmakeBool "GGML_STATIC" false) (lib.cmakeBool "GGML_FMA" true) (lib.cmakeBool "GGML_F16C" true) - (lib.cmakeBool "GGML_AVX512" true) + (lib.cmakeBool "GGML_AVX2" true) + (lib.cmakeBool "GGML_AVX512" false) (lib.cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") (lib.cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmTargets)) ]; diff --git a/llama_proxy_man/Cargo.toml b/llama_proxy_man/Cargo.toml index e2a0ce2..23aa511 100644 --- a/llama_proxy_man/Cargo.toml +++ b/llama_proxy_man/Cargo.toml @@ -27,3 +27,4 @@ tower = { version = "0.4", features = ["tokio", "tracing"] } tower-http = { version = "0.5.2", features = ["trace"] } reqwest-retry = "0.6.1" reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] } +itertools = "0.13.0" diff --git a/llama_proxy_man/config.yaml b/llama_proxy_man/config.yaml index 667ed47..8aba5d6 100644 --- a/llama_proxy_man/config.yaml +++ b/llama_proxy_man/config.yaml @@ -1,26 +1,46 @@ hardware: ram: 64G - vram: 8G + vram: 30G models: - - port: 18080 + - name: "tabby-code" + port: 18080 internal_port: 28080 + vram_usage: 8.25G + ram_usage: 2.6G env: CUDA_VISIBLE_DEVICES: 0 HSA_OVERRIDE_GFX_VERSION: '11.0.0' args: - model: /home/tristand/Downloads/models/Phi-3.5-mini-instruct-Q6_K_L.gguf + model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf gpu-layers: 9999 - ctx-size: 4096 - vram_usage: 6G - ram_usage: 500M - - port: 18081 + flash-attn: true + ctx-size: 32768 + host: 0.0.0.0 + - name: "tabby-embeddings" + port: 18081 internal_port: 28081 + vram_usage: 1G + ram_usage: 2G env: CUDA_VISIBLE_DEVICES: 0 HSA_OVERRIDE_GFX_VERSION: '11.0.0' args: - model: /home/tristand/Downloads/models/Phi-3.5-mini-instruct-Q6_K_L.gguf + model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf gpu-layers: 9999 - ctx-size: 
4096
-    vram_usage: 6G
-    ram_usage: 500M
+      flash-attn: true
+      host: 0.0.0.0
+      embeddings: true
+  - name: "big-chat"
+    port: 18082
+    internal_port: 28082
+    vram_usage: 26.5G
+    ram_usage: 2.5G
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
+    args:
+      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
+      gpu-layers: 9999
+      ctx-size: 16384
+      flash-attn: true
+      host: 0.0.0.0
diff --git a/llama_proxy_man/src/main.rs b/llama_proxy_man/src/main.rs
index 8d6ce5c..a0937b0 100644
--- a/llama_proxy_man/src/main.rs
+++ b/llama_proxy_man/src/main.rs
@@ -9,6 +9,7 @@ use axum::{
     Router,
 };
 use futures;
+use itertools::Itertools;
 use reqwest::Client;
 use serde::Deserialize;
 use std::{collections::HashMap, net::SocketAddr, process::Stdio, sync::Arc};
@@ -55,6 +56,7 @@ struct Hardware {
 
 #[derive(Debug, Deserialize, Clone)]
 struct ModelConfig {
+    name: String,
     port: u16,
     internal_port: u16,
     env: HashMap<String, String>,
@@ -63,13 +65,14 @@ struct ModelConfig {
     ram_usage: String,
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 struct LlamaInstance {
     config: ModelConfig,
     process: Arc<Mutex<Child>>,
     // busy: bool,
 }
 
+#[derive(Clone, Debug)]
 struct SharedState {
     total_ram: u64,
     total_vram: u64,
@@ -136,7 +139,7 @@ async fn main() {
 
             let addr = SocketAddr::from(([0, 0, 0, 0], model_config.port));
 
-            println!("Listening on port {}", model_config.port);
+            tracing::info!(msg = "Listening", ?model_config);
 
             let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
             axum::serve(listener, app.into_make_service())
@@ -198,22 +201,39 @@ async fn handle_request(
         instance.to_owned()
     } else {
         // Check resources
-        if state.used_ram + model_ram_usage > state.total_ram
-            || state.used_vram + model_vram_usage > state.total_vram
+        tracing::info!(msg = "Current state", ?state);
+        if ((state.used_ram + model_ram_usage) > state.total_ram)
+            || ((state.used_vram + model_vram_usage) > state.total_vram)
         {
             // Stop other instances
            let mut to_remove = Vec::new();
-            for (port, instance) in state.instances.clone() {
+            // TODO: Actual smart stopping logic
+            // - search for the smallest single model to stop to get enough room
+            // - if not possible, search for the smallest number of models to stop
+            //   with the lowest amount of "overshoot"
+            let instances_by_size =
+                state
+                    .instances
+                    .clone()
+                    .into_iter()
+                    .sorted_by(|(_, el_a), (_, el_b)| {
+                        Ord::cmp(
+                            &parse_size(el_b.config.vram_usage.as_str()),
+                            &parse_size(el_a.config.vram_usage.as_str()),
+                        )
+                    });
+            for (port, instance) in instances_by_size {
                 // if !instance.busy {
                 tracing::info!("Stopping instance on port {}", port);
                 let mut process = instance.process.lock().await;
                 process.kill().await.ok();
+                to_remove.push(port);
                 state.used_ram -= parse_size(&instance.config.ram_usage).unwrap_or(0);
                 state.used_vram -= parse_size(&instance.config.vram_usage).unwrap_or(0);
-                to_remove.push(port);
                 if state.used_ram + model_ram_usage <= state.total_ram
                     && state.used_vram + model_vram_usage <= state.total_vram
                 {
+                    tracing::info!("Should have enough RAM now");
                     break;
                 }
                 // }
@@ -222,16 +242,25 @@
                 tracing::info!("Removing instance on port {}", port);
                 state.instances.remove(&port);
             }
+        } else {
+            tracing::info!("Already enough resources free");
         }
 
         // Start new instance
         let args = model_config
             .args
             .iter()
-            .flat_map(|(k, v)| vec![format!("--{}", k), v.clone()])
+            .flat_map(|(k, v)| {
+                if v == "true" {
+                    vec![format!("--{}", k)]
+                } else {
+                    vec![format!("--{}", k), v.clone()]
+                }
+            })
             .collect::<Vec<_>>();
 
         let mut cmd = Command::new("llama-server");
+        cmd.kill_on_drop(true);
         cmd.envs(model_config.env.clone());
         cmd.args(&args);
         cmd.arg("--port");
@@ -273,7 +302,7 @@ async fn handle_request(
             .with(reqwest_retry::RetryTransientMiddleware::new_with_policy(
                 retry_policy,
             ))
-        .build();
+            .build();
 
         let uri = format!(
             "http://127.0.0.1:{}{}",
@@ -342,7 +371,7 @@ fn parse_size(size_str: &str) -> Option<u64> {
     let mut unit = String::new();
 
     for c in size_str.chars() {
-        if c.is_digit(10) {
+        if c.is_digit(10) || c == '.' {
             num.push(c);
         } else {
             unit.push(c);
@@ -355,7 +384,7 @@ fn parse_size(size_str: &str) -> Option<u64> {
         "g" | "gb" => 1024 * 1024 * 1024,
         "m" | "mb" => 1024 * 1024,
         "k" | "kb" => 1024,
-        _ => 1,
+        _ => panic!("Invalid Size"),
     };
 
     let res = (num * multiplier as f64) as u64;
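
Note on the TODO in handle_request: one possible shape for the smarter stopping logic is sketched below. This is not part of the patch; the helper name pick_instances_to_stop and the HashMap<u16, u64> of per-port VRAM usage are illustrative (such a map could be derived from state.instances via parse_size on each config's vram_usage). The idea: prefer the single smallest instance whose VRAM usage alone covers the deficit, otherwise fall back to the largest-first eviction this patch already does. The second TODO bullet (minimizing overshoot across multiple stops) would still need a small search on top of this.

use std::collections::HashMap;

use itertools::Itertools;

/// Sketch: choose which running instances to stop so that at least `needed`
/// bytes of VRAM become free. Prefers the single smallest instance that covers
/// the deficit on its own; otherwise stops instances largest-first (matching
/// the sort order introduced in this patch) until enough VRAM is freed.
fn pick_instances_to_stop(vram_by_port: &HashMap<u16, u64>, needed: u64) -> Vec<u16> {
    // Smallest single instance whose VRAM usage alone covers the deficit.
    if let Some((&port, _)) = vram_by_port
        .iter()
        .filter(|(_, &vram)| vram >= needed)
        .min_by_key(|(_, &vram)| vram)
    {
        return vec![port];
    }

    // Otherwise stop the largest instances first until the deficit is covered,
    // mirroring the sorted eviction loop in handle_request above.
    let mut freed = 0u64;
    let mut to_stop = Vec::new();
    for (&port, &vram) in vram_by_port.iter().sorted_by(|(_, a), (_, b)| b.cmp(a)) {
        to_stop.push(port);
        freed += vram;
        if freed >= needed {
            break;
        }
    }
    to_stop
}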