Optimize by stopping large models first

This commit is contained in:
Tristan D. 2024-10-08 17:27:45 +02:00
parent 6885baa679
commit 0391865791
Signed by: tristan
SSH key fingerprint: SHA256:3RU4RLOoM8oAjFU19f1W6t8uouZbA7GWkaSW6rjp1k8
6 changed files with 79 additions and 27 deletions

1
Cargo.lock generated
View file

@ -4176,6 +4176,7 @@ dependencies = [
"axum",
"futures",
"hyper",
"itertools 0.13.0",
"pin-project-lite",
"reqwest",
"reqwest-middleware",

8
flake.lock generated
View file

@ -64,16 +64,16 @@
]
},
"locked": {
"lastModified": 1725824628,
"narHash": "sha256-oiVEb+PMKumREdoV1vEzxfSWFlHNNMgxADgfpFsR8pE=",
"lastModified": 1728330908,
"narHash": "sha256-2N7yfI0N4Up+aYzq7++BqMXZhuPcQGskSuq0TUcK5V0=",
"owner": "ggerganov",
"repo": "llama.cpp",
"rev": "daa9623ab051a8162ae750b150b9522571b55f21",
"rev": "6374743747b14db4eb73ce82ae449a2978bc3b47",
"type": "github"
},
"original": {
"owner": "ggerganov",
"ref": "b3707",
"ref": "b3896",
"repo": "llama.cpp",
"type": "github"
}

View file

@ -23,7 +23,7 @@
flake = false;
};
llama-cpp = {
url = "github:ggerganov/llama.cpp/b3707";
url = "github:ggerganov/llama.cpp/b3896";
inputs.nixpkgs.follows = "nixpkgs";
inputs.flake-parts.follows = "flake-parts";
};
@ -127,7 +127,8 @@
(lib.cmakeBool "GGML_STATIC" false)
(lib.cmakeBool "GGML_FMA" true)
(lib.cmakeBool "GGML_F16C" true)
(lib.cmakeBool "GGML_AVX512" true)
(lib.cmakeBool "GGML_AVX2" true)
(lib.cmakeBool "GGML_AVX512" false)
(lib.cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
(lib.cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmTargets))
];

View file

@ -27,3 +27,4 @@ tower = { version = "0.4", features = ["tokio", "tracing"] }
tower-http = { version = "0.5.2", features = ["trace"] }
reqwest-retry = "0.6.1"
reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
itertools = "0.13.0"

View file

@ -1,26 +1,46 @@
hardware:
ram: 64G
vram: 8G
vram: 30G
models:
- port: 18080
- name: "tabby-code"
port: 18080
internal_port: 28080
vram_usage: 8.25G
ram_usage: 2.6G
env:
CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args:
model: /home/tristand/Downloads/models/Phi-3.5-mini-instruct-Q6_K_L.gguf
model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf
gpu-layers: 9999
ctx-size: 4096
vram_usage: 6G
ram_usage: 500M
- port: 18081
flash-attn: true
ctx-size: 32768
host: 0.0.0.0
- name: "tabby-embeddings"
port: 18081
internal_port: 28081
vram_usage: 1G
ram_usage: 2G
env:
CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args:
model: /home/tristand/Downloads/models/Phi-3.5-mini-instruct-Q6_K_L.gguf
model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
gpu-layers: 9999
ctx-size: 4096
vram_usage: 6G
ram_usage: 500M
flash-attn: true
host: 0.0.0.0
embeddings: true
- name: "big-chat"
port: 18082
internal_port: 28082
vram_usage: 26.5G
ram_usage: 2.5G
env:
CUDA_VISIBLE_DEVICES: 0
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
args:
model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
gpu-layers: 9999
ctx-size: 16384
flash-attn: true
host: 0.0.0.0

View file

@ -9,6 +9,7 @@ use axum::{
Router,
};
use futures;
use itertools::Itertools;
use reqwest::Client;
use serde::Deserialize;
use std::{collections::HashMap, net::SocketAddr, process::Stdio, sync::Arc};
@ -55,6 +56,7 @@ struct Hardware {
#[derive(Debug, Deserialize, Clone)]
struct ModelConfig {
name: String,
port: u16,
internal_port: u16,
env: HashMap<String, String>,
@ -63,13 +65,14 @@ struct ModelConfig {
ram_usage: String,
}
#[derive(Clone)]
#[derive(Clone, Debug)]
struct LlamaInstance {
config: ModelConfig,
process: Arc<Mutex<Child>>,
// busy: bool,
}
#[derive(Clone, Debug)]
struct SharedState {
total_ram: u64,
total_vram: u64,
@ -136,7 +139,7 @@ async fn main() {
let addr = SocketAddr::from(([0, 0, 0, 0], model_config.port));
println!("Listening on port {}", model_config.port);
tracing::info!(msg = "Listening", ?model_config);
let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
axum::serve(listener, app.into_make_service())
@ -198,22 +201,39 @@ async fn handle_request(
instance.to_owned()
} else {
// Check resources
if state.used_ram + model_ram_usage > state.total_ram
|| state.used_vram + model_vram_usage > state.total_vram
tracing::info!(msg = "Current state", ?state);
if ((state.used_ram + model_ram_usage) > state.total_ram)
|| ((state.used_vram + model_vram_usage) > state.total_vram)
{
// Stop other instances
let mut to_remove = Vec::new();
for (port, instance) in state.instances.clone() {
// TODO Actual smart stopping logic
// - search for smallest single model to stop to get enough room
// - if not possible search for smallest number of models to stop with lowest
// amount of "overshoot"
let instances_by_size =
state
.instances
.clone()
.into_iter()
.sorted_by(|(_, el_a), (_, el_b)| {
Ord::cmp(
&parse_size(el_b.config.vram_usage.as_str()),
&parse_size(el_a.config.vram_usage.as_str()),
)
});
for (port, instance) in instances_by_size {
// if !instance.busy {
tracing::info!("Stopping instance on port {}", port);
let mut process = instance.process.lock().await;
process.kill().await.ok();
to_remove.push(port);
state.used_ram -= parse_size(&instance.config.ram_usage).unwrap_or(0);
state.used_vram -= parse_size(&instance.config.vram_usage).unwrap_or(0);
to_remove.push(port);
if state.used_ram + model_ram_usage <= state.total_ram
&& state.used_vram + model_vram_usage <= state.total_vram
{
tracing::info!("Should have enough ram now");
break;
}
// }
@ -222,16 +242,25 @@ async fn handle_request(
tracing::info!("Removing instance on port {}", port);
state.instances.remove(&port);
}
} else {
tracing::info!("Already enough res free");
}
// Start new instance
let args = model_config
.args
.iter()
.flat_map(|(k, v)| vec![format!("--{}", k), v.clone()])
.flat_map(|(k, v)| {
if v == "true" {
vec![format!("--{}", k)]
} else {
vec![format!("--{}", k), v.clone()]
}
})
.collect::<Vec<_>>();
let mut cmd = Command::new("llama-server");
cmd.kill_on_drop(true);
cmd.envs(model_config.env.clone());
cmd.args(&args);
cmd.arg("--port");
@ -273,7 +302,7 @@ async fn handle_request(
.with(reqwest_retry::RetryTransientMiddleware::new_with_policy(
retry_policy,
))
.build();
.build();
let uri = format!(
"http://127.0.0.1:{}{}",
@ -342,7 +371,7 @@ fn parse_size(size_str: &str) -> Option<u64> {
let mut unit = String::new();
for c in size_str.chars() {
if c.is_digit(10) {
if c.is_digit(10) || c == '.' {
num.push(c);
} else {
unit.push(c);
@ -355,7 +384,7 @@ fn parse_size(size_str: &str) -> Option<u64> {
"g" | "gb" => 1024 * 1024 * 1024,
"m" | "mb" => 1024 * 1024,
"k" | "kb" => 1024,
_ => 1,
_ => panic!("Invalid Size"),
};
let res = (num * multiplier as f64) as u64;