Optimize by stopping large models first
This commit is contained in:
parent
6885baa679
commit
0391865791
6 changed files with 79 additions and 27 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -4176,6 +4176,7 @@ dependencies = [
|
||||||
"axum",
|
"axum",
|
||||||
"futures",
|
"futures",
|
||||||
"hyper",
|
"hyper",
|
||||||
|
"itertools 0.13.0",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"reqwest-middleware",
|
"reqwest-middleware",
|
||||||
|
|
8
flake.lock
generated
8
flake.lock
generated
|
@ -64,16 +64,16 @@
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1725824628,
|
"lastModified": 1728330908,
|
||||||
"narHash": "sha256-oiVEb+PMKumREdoV1vEzxfSWFlHNNMgxADgfpFsR8pE=",
|
"narHash": "sha256-2N7yfI0N4Up+aYzq7++BqMXZhuPcQGskSuq0TUcK5V0=",
|
||||||
"owner": "ggerganov",
|
"owner": "ggerganov",
|
||||||
"repo": "llama.cpp",
|
"repo": "llama.cpp",
|
||||||
"rev": "daa9623ab051a8162ae750b150b9522571b55f21",
|
"rev": "6374743747b14db4eb73ce82ae449a2978bc3b47",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"owner": "ggerganov",
|
"owner": "ggerganov",
|
||||||
"ref": "b3707",
|
"ref": "b3896",
|
||||||
"repo": "llama.cpp",
|
"repo": "llama.cpp",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
flake = false;
|
flake = false;
|
||||||
};
|
};
|
||||||
llama-cpp = {
|
llama-cpp = {
|
||||||
url = "github:ggerganov/llama.cpp/b3707";
|
url = "github:ggerganov/llama.cpp/b3896";
|
||||||
inputs.nixpkgs.follows = "nixpkgs";
|
inputs.nixpkgs.follows = "nixpkgs";
|
||||||
inputs.flake-parts.follows = "flake-parts";
|
inputs.flake-parts.follows = "flake-parts";
|
||||||
};
|
};
|
||||||
|
@ -127,7 +127,8 @@
|
||||||
(lib.cmakeBool "GGML_STATIC" false)
|
(lib.cmakeBool "GGML_STATIC" false)
|
||||||
(lib.cmakeBool "GGML_FMA" true)
|
(lib.cmakeBool "GGML_FMA" true)
|
||||||
(lib.cmakeBool "GGML_F16C" true)
|
(lib.cmakeBool "GGML_F16C" true)
|
||||||
(lib.cmakeBool "GGML_AVX512" true)
|
(lib.cmakeBool "GGML_AVX2" true)
|
||||||
|
(lib.cmakeBool "GGML_AVX512" false)
|
||||||
(lib.cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
|
(lib.cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
|
||||||
(lib.cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmTargets))
|
(lib.cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmTargets))
|
||||||
];
|
];
|
||||||
|
|
|
@ -27,3 +27,4 @@ tower = { version = "0.4", features = ["tokio", "tracing"] }
|
||||||
tower-http = { version = "0.5.2", features = ["trace"] }
|
tower-http = { version = "0.5.2", features = ["trace"] }
|
||||||
reqwest-retry = "0.6.1"
|
reqwest-retry = "0.6.1"
|
||||||
reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
|
reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
|
||||||
|
itertools = "0.13.0"
|
||||||
|
|
|
@ -1,26 +1,46 @@
|
||||||
hardware:
|
hardware:
|
||||||
ram: 64G
|
ram: 64G
|
||||||
vram: 8G
|
vram: 30G
|
||||||
models:
|
models:
|
||||||
- port: 18080
|
- name: "tabby-code"
|
||||||
|
port: 18080
|
||||||
internal_port: 28080
|
internal_port: 28080
|
||||||
|
vram_usage: 8.25G
|
||||||
|
ram_usage: 2.6G
|
||||||
env:
|
env:
|
||||||
CUDA_VISIBLE_DEVICES: 0
|
CUDA_VISIBLE_DEVICES: 0
|
||||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||||
args:
|
args:
|
||||||
model: /home/tristand/Downloads/models/Phi-3.5-mini-instruct-Q6_K_L.gguf
|
model: /media/SanDisk/ai/models_live/Qwen2.5-Coder-7B-Instruct-Q6_K_L.gguf
|
||||||
gpu-layers: 9999
|
gpu-layers: 9999
|
||||||
ctx-size: 4096
|
flash-attn: true
|
||||||
vram_usage: 6G
|
ctx-size: 32768
|
||||||
ram_usage: 500M
|
host: 0.0.0.0
|
||||||
- port: 18081
|
- name: "tabby-embeddings"
|
||||||
|
port: 18081
|
||||||
internal_port: 28081
|
internal_port: 28081
|
||||||
|
vram_usage: 1G
|
||||||
|
ram_usage: 2G
|
||||||
env:
|
env:
|
||||||
CUDA_VISIBLE_DEVICES: 0
|
CUDA_VISIBLE_DEVICES: 0
|
||||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||||
args:
|
args:
|
||||||
model: /home/tristand/Downloads/models/Phi-3.5-mini-instruct-Q6_K_L.gguf
|
model: /media/SanDisk/ai/models_live/nomic-embed-text-v1-f32.gguf
|
||||||
gpu-layers: 9999
|
gpu-layers: 9999
|
||||||
ctx-size: 4096
|
flash-attn: true
|
||||||
vram_usage: 6G
|
host: 0.0.0.0
|
||||||
ram_usage: 500M
|
embeddings: true
|
||||||
|
- name: "big-chat"
|
||||||
|
port: 18082
|
||||||
|
internal_port: 28082
|
||||||
|
vram_usage: 26.5G
|
||||||
|
ram_usage: 2.5G
|
||||||
|
env:
|
||||||
|
CUDA_VISIBLE_DEVICES: 0
|
||||||
|
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||||
|
args:
|
||||||
|
model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
|
||||||
|
gpu-layers: 9999
|
||||||
|
ctx-size: 16384
|
||||||
|
flash-attn: true
|
||||||
|
host: 0.0.0.0
|
||||||
|
|
|
@ -9,6 +9,7 @@ use axum::{
|
||||||
Router,
|
Router,
|
||||||
};
|
};
|
||||||
use futures;
|
use futures;
|
||||||
|
use itertools::Itertools;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use std::{collections::HashMap, net::SocketAddr, process::Stdio, sync::Arc};
|
use std::{collections::HashMap, net::SocketAddr, process::Stdio, sync::Arc};
|
||||||
|
@ -55,6 +56,7 @@ struct Hardware {
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Clone)]
|
#[derive(Debug, Deserialize, Clone)]
|
||||||
struct ModelConfig {
|
struct ModelConfig {
|
||||||
|
name: String,
|
||||||
port: u16,
|
port: u16,
|
||||||
internal_port: u16,
|
internal_port: u16,
|
||||||
env: HashMap<String, String>,
|
env: HashMap<String, String>,
|
||||||
|
@ -63,13 +65,14 @@ struct ModelConfig {
|
||||||
ram_usage: String,
|
ram_usage: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone, Debug)]
|
||||||
struct LlamaInstance {
|
struct LlamaInstance {
|
||||||
config: ModelConfig,
|
config: ModelConfig,
|
||||||
process: Arc<Mutex<Child>>,
|
process: Arc<Mutex<Child>>,
|
||||||
// busy: bool,
|
// busy: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
struct SharedState {
|
struct SharedState {
|
||||||
total_ram: u64,
|
total_ram: u64,
|
||||||
total_vram: u64,
|
total_vram: u64,
|
||||||
|
@ -136,7 +139,7 @@ async fn main() {
|
||||||
|
|
||||||
let addr = SocketAddr::from(([0, 0, 0, 0], model_config.port));
|
let addr = SocketAddr::from(([0, 0, 0, 0], model_config.port));
|
||||||
|
|
||||||
println!("Listening on port {}", model_config.port);
|
tracing::info!(msg = "Listening", ?model_config);
|
||||||
let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
|
let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
|
||||||
|
|
||||||
axum::serve(listener, app.into_make_service())
|
axum::serve(listener, app.into_make_service())
|
||||||
|
@ -198,22 +201,39 @@ async fn handle_request(
|
||||||
instance.to_owned()
|
instance.to_owned()
|
||||||
} else {
|
} else {
|
||||||
// Check resources
|
// Check resources
|
||||||
if state.used_ram + model_ram_usage > state.total_ram
|
tracing::info!(msg = "Current state", ?state);
|
||||||
|| state.used_vram + model_vram_usage > state.total_vram
|
if ((state.used_ram + model_ram_usage) > state.total_ram)
|
||||||
|
|| ((state.used_vram + model_vram_usage) > state.total_vram)
|
||||||
{
|
{
|
||||||
// Stop other instances
|
// Stop other instances
|
||||||
let mut to_remove = Vec::new();
|
let mut to_remove = Vec::new();
|
||||||
for (port, instance) in state.instances.clone() {
|
// TODO Actual smart stopping logic
|
||||||
|
// - search for smallest single model to stop to get enough room
|
||||||
|
// - if not possible, search for the smallest number of models to stop with the lowest
|
||||||
|
// amount of "overshoot"
|
||||||
|
let instances_by_size =
|
||||||
|
state
|
||||||
|
.instances
|
||||||
|
.clone()
|
||||||
|
.into_iter()
|
||||||
|
.sorted_by(|(_, el_a), (_, el_b)| {
|
||||||
|
Ord::cmp(
|
||||||
|
&parse_size(el_b.config.vram_usage.as_str()),
|
||||||
|
&parse_size(el_a.config.vram_usage.as_str()),
|
||||||
|
)
|
||||||
|
});
|
||||||
|
for (port, instance) in instances_by_size {
|
||||||
// if !instance.busy {
|
// if !instance.busy {
|
||||||
tracing::info!("Stopping instance on port {}", port);
|
tracing::info!("Stopping instance on port {}", port);
|
||||||
let mut process = instance.process.lock().await;
|
let mut process = instance.process.lock().await;
|
||||||
process.kill().await.ok();
|
process.kill().await.ok();
|
||||||
|
to_remove.push(port);
|
||||||
state.used_ram -= parse_size(&instance.config.ram_usage).unwrap_or(0);
|
state.used_ram -= parse_size(&instance.config.ram_usage).unwrap_or(0);
|
||||||
state.used_vram -= parse_size(&instance.config.vram_usage).unwrap_or(0);
|
state.used_vram -= parse_size(&instance.config.vram_usage).unwrap_or(0);
|
||||||
to_remove.push(port);
|
|
||||||
if state.used_ram + model_ram_usage <= state.total_ram
|
if state.used_ram + model_ram_usage <= state.total_ram
|
||||||
&& state.used_vram + model_vram_usage <= state.total_vram
|
&& state.used_vram + model_vram_usage <= state.total_vram
|
||||||
{
|
{
|
||||||
|
tracing::info!("Should have enough ram now");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// }
|
// }
|
||||||
|
@ -222,16 +242,25 @@ async fn handle_request(
|
||||||
tracing::info!("Removing instance on port {}", port);
|
tracing::info!("Removing instance on port {}", port);
|
||||||
state.instances.remove(&port);
|
state.instances.remove(&port);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
tracing::info!("Already enough res free");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start new instance
|
// Start new instance
|
||||||
let args = model_config
|
let args = model_config
|
||||||
.args
|
.args
|
||||||
.iter()
|
.iter()
|
||||||
.flat_map(|(k, v)| vec![format!("--{}", k), v.clone()])
|
.flat_map(|(k, v)| {
|
||||||
|
if v == "true" {
|
||||||
|
vec![format!("--{}", k)]
|
||||||
|
} else {
|
||||||
|
vec![format!("--{}", k), v.clone()]
|
||||||
|
}
|
||||||
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
let mut cmd = Command::new("llama-server");
|
let mut cmd = Command::new("llama-server");
|
||||||
|
cmd.kill_on_drop(true);
|
||||||
cmd.envs(model_config.env.clone());
|
cmd.envs(model_config.env.clone());
|
||||||
cmd.args(&args);
|
cmd.args(&args);
|
||||||
cmd.arg("--port");
|
cmd.arg("--port");
|
||||||
|
@ -273,7 +302,7 @@ async fn handle_request(
|
||||||
.with(reqwest_retry::RetryTransientMiddleware::new_with_policy(
|
.with(reqwest_retry::RetryTransientMiddleware::new_with_policy(
|
||||||
retry_policy,
|
retry_policy,
|
||||||
))
|
))
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
let uri = format!(
|
let uri = format!(
|
||||||
"http://127.0.0.1:{}{}",
|
"http://127.0.0.1:{}{}",
|
||||||
|
@ -342,7 +371,7 @@ fn parse_size(size_str: &str) -> Option<u64> {
|
||||||
let mut unit = String::new();
|
let mut unit = String::new();
|
||||||
|
|
||||||
for c in size_str.chars() {
|
for c in size_str.chars() {
|
||||||
if c.is_digit(10) {
|
if c.is_digit(10) || c == '.' {
|
||||||
num.push(c);
|
num.push(c);
|
||||||
} else {
|
} else {
|
||||||
unit.push(c);
|
unit.push(c);
|
||||||
|
@ -355,7 +384,7 @@ fn parse_size(size_str: &str) -> Option<u64> {
|
||||||
"g" | "gb" => 1024 * 1024 * 1024,
|
"g" | "gb" => 1024 * 1024 * 1024,
|
||||||
"m" | "mb" => 1024 * 1024,
|
"m" | "mb" => 1024 * 1024,
|
||||||
"k" | "kb" => 1024,
|
"k" | "kb" => 1024,
|
||||||
_ => 1,
|
_ => panic!("Invalid Size"),
|
||||||
};
|
};
|
||||||
|
|
||||||
let res = (num * multiplier as f64) as u64;
|
let res = (num * multiplier as f64) as u64;
|
||||||
|
|
Loading…
Add table
Reference in a new issue