Optimize by stopping large models first

Tristan D. 2024-10-08 17:27:45 +02:00
parent 6885baa679
commit 0391865791
Signed by: tristan
SSH key fingerprint: SHA256:3RU4RLOoM8oAjFU19f1W6t8uouZbA7GWkaSW6rjp1k8
6 changed files with 79 additions and 27 deletions

Cargo.lock (generated)

@@ -4176,6 +4176,7 @@ dependencies = [
  "axum",
  "futures",
  "hyper",
+ "itertools 0.13.0",
  "pin-project-lite",
  "reqwest",
  "reqwest-middleware",

flake.lock (generated)

@@ -64,16 +64,16 @@
       ]
     },
     "locked": {
-      "lastModified": 1725824628,
-      "narHash": "sha256-oiVEb+PMKumREdoV1vEzxfSWFlHNNMgxADgfpFsR8pE=",
+      "lastModified": 1728330908,
+      "narHash": "sha256-2N7yfI0N4Up+aYzq7++BqMXZhuPcQGskSuq0TUcK5V0=",
       "owner": "ggerganov",
       "repo": "llama.cpp",
-      "rev": "daa9623ab051a8162ae750b150b9522571b55f21",
+      "rev": "6374743747b14db4eb73ce82ae449a2978bc3b47",
       "type": "github"
     },
     "original": {
       "owner": "ggerganov",
-      "ref": "b3707",
+      "ref": "b3896",
       "repo": "llama.cpp",
       "type": "github"
     }

flake.nix

@@ -23,7 +23,7 @@
       flake = false;
     };
     llama-cpp = {
-      url = "github:ggerganov/llama.cpp/b3707";
+      url = "github:ggerganov/llama.cpp/b3896";
       inputs.nixpkgs.follows = "nixpkgs";
       inputs.flake-parts.follows = "flake-parts";
     };
@@ -127,7 +127,8 @@
           (lib.cmakeBool "GGML_STATIC" false)
           (lib.cmakeBool "GGML_FMA" true)
           (lib.cmakeBool "GGML_F16C" true)
-          (lib.cmakeBool "GGML_AVX512" true)
+          (lib.cmakeBool "GGML_AVX2" true)
+          (lib.cmakeBool "GGML_AVX512" false)
           (lib.cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
           (lib.cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmTargets))
         ];
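
The build now targets AVX2 and explicitly disables AVX512, matching hardware that lacks the wider vector unit. A minimal way to confirm the host actually reports the instruction sets toggled here (an illustrative sketch, assuming an x86_64 machine; uses only std's runtime feature detection):

fn main() {
    // Report whether this CPU supports the instruction sets toggled above.
    println!("avx2:    {}", is_x86_feature_detected!("avx2"));
    println!("avx512f: {}", is_x86_feature_detected!("avx512f"));
    println!("fma:     {}", is_x86_feature_detected!("fma"));
    println!("f16c:    {}", is_x86_feature_detected!("f16c"));
}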

Cargo.toml

@@ -27,3 +27,4 @@ tower = { version = "0.4", features = ["tokio", "tracing"] }
 tower-http = { version = "0.5.2", features = ["trace"] }
 reqwest-retry = "0.6.1"
 reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
+itertools = "0.13.0"
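
itertools is added for Itertools::sorted_by, which the eviction loop below uses to order running instances by descending VRAM. The same pattern in miniature (a sketch with made-up (port, vram) pairs):

use itertools::Itertools;

fn main() {
    // Largest VRAM first, exactly how eviction candidates are ordered.
    let order: Vec<(u16, u64)> = [(18080, 8), (18081, 1), (18082, 26)]
        .into_iter()
        .sorted_by(|a, b| Ord::cmp(&b.1, &a.1))
        .collect();
    assert_eq!(order, vec![(18082, 26), (18080, 8), (18081, 1)]);
}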

model config (YAML)

@@ -1,26 +1,46 @@
 hardware:
   ram: 64G
-  vram: 8G
+  vram: 30G
 models:
-  - port: 18080
+  - name: "tabby-code"
+    port: 18080
     internal_port: 28080
+    vram_usage: 8.25G
+    ram_usage: 2.6G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /home/tristand/Downloads/models/Phi-3.5-mini-instruct-Q6_K_L.gguf
+      model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf
       gpu-layers: 9999
-      ctx-size: 4096
-    vram_usage: 6G
-    ram_usage: 500M
-  - port: 18081
+      flash-attn: true
+      ctx-size: 32768
+      host: 0.0.0.0
+  - name: "tabby-embeddings"
+    port: 18081
     internal_port: 28081
+    vram_usage: 1G
+    ram_usage: 2G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /home/tristand/Downloads/models/Phi-3.5-mini-instruct-Q6_K_L.gguf
+      model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
       gpu-layers: 9999
-      ctx-size: 4096
-    vram_usage: 6G
-    ram_usage: 500M
+      flash-attn: true
+      host: 0.0.0.0
+      embeddings: true
+  - name: "big-chat"
+    port: 18082
+    internal_port: 28082
+    vram_usage: 26.5G
+    ram_usage: 2.5G
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
+    args:
+      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
+      gpu-layers: 9999
+      ctx-size: 16384
+      flash-attn: true
+      host: 0.0.0.0
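
A quick sanity check of the new budgets (same arithmetic as the proxy's parse_size, where 1G is 1024³ bytes): with both tabby models resident, a request for big-chat overshoots the 30G card, and stopping the single largest resident instance is enough. As a runnable sketch:

fn main() {
    const GIB: f64 = 1024.0 * 1024.0 * 1024.0; // parse_size's "g" multiplier
    let (total, tabby_code, tabby_embeddings, big_chat) =
        (30.0 * GIB, 8.25 * GIB, 1.0 * GIB, 26.5 * GIB);

    // 8.25G + 1G + 26.5G = 35.75G > 30G: something has to be evicted.
    assert!(tabby_code + tabby_embeddings + big_chat > total);
    // Largest-first: dropping tabby-code leaves 1G + 26.5G = 27.5G <= 30G.
    assert!(tabby_embeddings + big_chat <= total);
}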

server source (Rust)

@@ -9,6 +9,7 @@ use axum::{
     Router,
 };
 use futures;
+use itertools::Itertools;
 use reqwest::Client;
 use serde::Deserialize;
 use std::{collections::HashMap, net::SocketAddr, process::Stdio, sync::Arc};
@@ -55,6 +56,7 @@ struct Hardware {
 #[derive(Debug, Deserialize, Clone)]
 struct ModelConfig {
+    name: String,
     port: u16,
     internal_port: u16,
     env: HashMap<String, String>,
@@ -63,13 +65,14 @@ struct ModelConfig {
     ram_usage: String,
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 struct LlamaInstance {
     config: ModelConfig,
     process: Arc<Mutex<Child>>,
     // busy: bool,
 }
 
+#[derive(Clone, Debug)]
 struct SharedState {
     total_ram: u64,
     total_vram: u64,
@@ -136,7 +139,7 @@ async fn main() {
         let addr = SocketAddr::from(([0, 0, 0, 0], model_config.port));
 
-        println!("Listening on port {}", model_config.port);
+        tracing::info!(msg = "Listening", ?model_config);
 
         let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
         axum::serve(listener, app.into_make_service())
@@ -198,22 +201,39 @@ async fn handle_request(
         instance.to_owned()
     } else {
         // Check resources
-        if state.used_ram + model_ram_usage > state.total_ram
-            || state.used_vram + model_vram_usage > state.total_vram
+        tracing::info!(msg = "Current state", ?state);
+        if ((state.used_ram + model_ram_usage) > state.total_ram)
+            || ((state.used_vram + model_vram_usage) > state.total_vram)
         {
             // Stop other instances
             let mut to_remove = Vec::new();
-            for (port, instance) in state.instances.clone() {
+            // TODO Actual smart stopping logic
+            // - search for smallest single model to stop to get enough room
+            // - if not possible search for smallest number of models to stop with
+            //   lowest amount of "overshoot"
+            let instances_by_size = state
+                .instances
+                .clone()
+                .into_iter()
+                .sorted_by(|(_, el_a), (_, el_b)| {
+                    Ord::cmp(
+                        &parse_size(el_b.config.vram_usage.as_str()),
+                        &parse_size(el_a.config.vram_usage.as_str()),
+                    )
+                });
+            for (port, instance) in instances_by_size {
                 // if !instance.busy {
                 tracing::info!("Stopping instance on port {}", port);
                 let mut process = instance.process.lock().await;
                 process.kill().await.ok();
-                to_remove.push(port);
                 state.used_ram -= parse_size(&instance.config.ram_usage).unwrap_or(0);
                 state.used_vram -= parse_size(&instance.config.vram_usage).unwrap_or(0);
+                to_remove.push(port);
 
                 if state.used_ram + model_ram_usage <= state.total_ram
                     && state.used_vram + model_vram_usage <= state.total_vram
                 {
+                    tracing::info!("Should have enough ram now");
                     break;
                 }
                 // }
@@ -222,16 +242,25 @@ async fn handle_request(
                 tracing::info!("Removing instance on port {}", port);
                 state.instances.remove(&port);
             }
-        }
+        } else {
+            tracing::info!("Already enough res free");
+        }
 
         // Start new instance
         let args = model_config
            .args
            .iter()
-            .flat_map(|(k, v)| vec![format!("--{}", k), v.clone()])
+            .flat_map(|(k, v)| {
+                if v == "true" {
+                    vec![format!("--{}", k)]
+                } else {
+                    vec![format!("--{}", k), v.clone()]
+                }
+            })
            .collect::<Vec<_>>();
 
         let mut cmd = Command::new("llama-server");
+        cmd.kill_on_drop(true);
         cmd.envs(model_config.env.clone());
         cmd.args(&args);
         cmd.arg("--port");
@@ -273,7 +302,7 @@ async fn handle_request(
             .with(reqwest_retry::RetryTransientMiddleware::new_with_policy(
                 retry_policy,
             ))
             .build();
 
         let uri = format!(
             "http://127.0.0.1:{}{}",
@@ -342,7 +371,7 @@ fn parse_size(size_str: &str) -> Option<u64> {
     let mut unit = String::new();
 
     for c in size_str.chars() {
-        if c.is_digit(10) {
+        if c.is_digit(10) || c == '.' {
             num.push(c);
         } else {
             unit.push(c);
@@ -355,7 +384,7 @@ fn parse_size(size_str: &str) -> Option<u64> {
         "g" | "gb" => 1024 * 1024 * 1024,
         "m" | "mb" => 1024 * 1024,
         "k" | "kb" => 1024,
-        _ => 1,
+        _ => panic!("Invalid Size"),
     };
 
     let res = (num * multiplier as f64) as u64;
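
The TODO in the eviction loop outlines a smarter policy than plain largest-first: stop the smallest single instance that frees enough VRAM, and only otherwise fall back to stopping big ones. A rough sketch of that first rule (pick_victim and the (port, vram_bytes) slice are hypothetical stand-ins for state.instances plus parse_size, not the committed code):

/// Smallest single instance whose VRAM covers the shortfall; if none does,
/// fall back to the largest so the caller can loop until the request fits.
fn pick_victim(instances: &[(u16, u64)], shortfall: u64) -> Option<u16> {
    instances
        .iter()
        .filter(|(_, vram)| *vram >= shortfall)
        .min_by_key(|(_, vram)| *vram)
        .or_else(|| instances.iter().max_by_key(|(_, vram)| *vram))
        .map(|(port, _)| *port)
}

fn main() {
    // Need 5G back: the 8G instance is the smallest that covers it alone.
    assert_eq!(pick_victim(&[(18080, 8), (18081, 1)], 5), Some(18080));
    // Need 10G: nothing covers it alone, so start with the largest.
    assert_eq!(pick_victim(&[(18080, 8), (18081, 1)], 10), Some(18080));
}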