From d33e4109c328829a2af36d30dce552e2d00a273b Mon Sep 17 00:00:00 2001
From: Tristan Druyen
Date: Wed, 27 Nov 2024 10:04:02 +0100
Subject: [PATCH] Add automatic open port picking

---
 Cargo.lock                  | 10 ++++++++++
 llama_proxy_man/Cargo.toml  |  1 +
 llama_proxy_man/config.yaml | 30 ++++++++++++++++--------------
 llama_proxy_man/src/main.rs | 32 ++++++++++++++++++++++++--------
 4 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b633a23..59b898b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4276,6 +4276,7 @@ dependencies = [
  "futures",
  "hyper",
  "itertools 0.13.0",
+ "openport",
  "pin-project-lite",
  "reqwest",
  "reqwest-middleware",
@@ -5075,6 +5076,15 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "openport"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "365c699f76305b3e62588961a288be10f0819ef1391f25870a69b35a213577cc"
+dependencies = [
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "openssl"
 version = "0.10.66"
diff --git a/llama_proxy_man/Cargo.toml b/llama_proxy_man/Cargo.toml
index 23aa511..f1e8ecf 100644
--- a/llama_proxy_man/Cargo.toml
+++ b/llama_proxy_man/Cargo.toml
@@ -28,3 +28,4 @@ tower-http = { version = "0.5.2", features = ["trace"] }
 reqwest-retry = "0.6.1"
 reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
 itertools = "0.13.0"
+openport = { version = "0.1.1", features = ["rand"] }
diff --git a/llama_proxy_man/config.yaml b/llama_proxy_man/config.yaml
index c658f7a..eb637cc 100644
--- a/llama_proxy_man/config.yaml
+++ b/llama_proxy_man/config.yaml
@@ -1,10 +1,10 @@
 hardware:
-  ram: 64G
+  ram: 48G
   vram: 30G
 models:
   - name: "tabby-code"
     port: 18080
-    internal_port: 28080
+    # internal_port: 28080 # Optional
     autostart: true
     vram_usage: 26.7G # Coder-32B + draft 0.5B
     ram_usage: 3G # Coder-32B + draft 0.5B
@@ -25,7 +25,6 @@ models:
       draft-min: 5
   - name: "tabby-embeddings"
     port: 18081
-    internal_port: 28081
     vram_usage: 0.4G
     ram_usage: 2.5G
     env:
@@ -39,32 +38,35 @@ models:
       embeddings: true
   - name: "big-chat"
     port: 18082
-    internal_port: 28082
     vram_usage: 26.5G
     ram_usage: 2.5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
-      gpu-layers: 9999
       ctx-size: 16384
       flash-attn: true
-      # host: 0.0.0.0
+      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
+      gpu-layers-draft: 9999
+      # draft-max: 16
+      # draft-min: 5
   - name: "bigger-chat"
     port: 18083
-    internal_port: 28083
     vram_usage: 29G
-    ram_usage: 4G
+    ram_usage: 5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
-      gpu-layers: 9999
-      flash-attn: true
-      # ctx-size: 8192
       ctx-size: 16384
-      # host: 0.0.0.0
+      flash-attn: true
       cache-type-k: q8_0
       cache-type-v: q8_0
+      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
+      gpu-layers-draft: 0
+      # draft-max: 16
+      # draft-min: 5
diff --git a/llama_proxy_man/src/main.rs b/llama_proxy_man/src/main.rs
index 53d0d74..cf08966 100644
--- a/llama_proxy_man/src/main.rs
+++ b/llama_proxy_man/src/main.rs
@@ -42,24 +42,40 @@ pub fn initialize_logger() {
     });
 }
 
-#[derive(Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize)]
 struct Config {
     hardware: Hardware,
     models: Vec<ModelConfig>,
 }
 
-#[derive(Debug, Deserialize)]
+impl Config {
+    // TODO split up into raw deser config and "parsed"/"processed" config which always has a port
+    fn pick_open_ports(self) -> Self {
+        let mut config = self.clone();
+        for model in &mut config.models {
+            if model.internal_port.is_none() {
+                model.internal_port = Some(
+                    openport::pick_random_unused_port()
+                        .expect(format!("No open port found for {:?}", model).as_str()),
+                );
+            }
+        }
+        config
+    }
+}
+
+#[derive(Clone, Debug, Deserialize)]
 struct Hardware {
     ram: String,
     vram: String,
 }
 
-#[derive(Debug, Deserialize, Clone)]
+#[derive(Clone, Debug, Deserialize)]
 struct ModelConfig {
     #[allow(dead_code)]
     name: String,
     port: u16,
-    internal_port: u16,
+    internal_port: Option<u16>,
     env: HashMap<String, serde_yaml::Value>,
     args: HashMap<String, serde_yaml::Value>,
     vram_usage: String,
@@ -91,7 +107,7 @@ async fn main() {
     initialize_logger();
     // Read and parse the YAML configuration
     let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
-    let config: Config = serde_yaml::from_str(&config_str).expect("Failed to parse config.yaml");
+    let config: Config = serde_yaml::from_str::<Config>(&config_str).expect("Failed to parse config.yaml").pick_open_ports();
 
     // Parse hardware resources
     let total_ram = parse_size(&config.hardware.ram).expect("Invalid RAM size in config");
@@ -270,7 +286,7 @@ async fn handle_request(
             cmd.args(&args);
             // TODO use openport crate via pick_random_unused_port for determining these
             cmd.arg("--port");
-            cmd.arg(format!("{}", model_config.internal_port));
+            cmd.arg(format!("{}", model_config.internal_port.expect("Unexpected empty port, should've been picked")));
             cmd.stdout(Stdio::null()).stderr(Stdio::null());
             // TODO save output and allow retrieval via api
             tracing::info!("Starting llama-server with {:?}", cmd);
@@ -295,7 +311,7 @@ async fn handle_request(
 
     // Wait for the instance to be ready
     is_llama_instance_running(&instance).await?;
-    wait_for_port(model_config.internal_port).await?;
+    wait_for_port(model_config.internal_port.expect("Unexpected empty port, should've been picked")).await?;
 
     // Proxy the request
     let retry_policy = reqwest_retry::policies::ExponentialBackoff::builder()
@@ -312,7 +328,7 @@ async fn handle_request(
 
     let uri = format!(
         "http://127.0.0.1:{}{}",
-        model_config.internal_port,
+        model_config.internal_port.expect("Unexpected empty port, should've been picked"),
         req.uri().path_and_query().map(|x| x.as_str()).unwrap_or("")
    );
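
Not part of the patch above: a minimal usage sketch of the fallback behaviour this change introduces. It assumes openport::pick_random_unused_port() returns Option<u16> (its use with .expect in the patch only implies an Option- or Result-like return), and resolve_internal_port is a hypothetical helper, not code from this repository.

    // Sketch: keep an explicit `internal_port` from config.yaml when present,
    // otherwise pick a random free port, mirroring Config::pick_open_ports.
    fn resolve_internal_port(configured: Option<u16>) -> u16 {
        configured.unwrap_or_else(|| {
            // Requires the openport crate with its "rand" feature, as added in Cargo.toml.
            openport::pick_random_unused_port().expect("no open port available")
        })
    }

    fn main() {
        println!("explicit: {}", resolve_internal_port(Some(28080)));
        println!("random:   {}", resolve_internal_port(None));
    }

In the patch itself this fallback runs once, right after the YAML is parsed (pick_open_ports()), so every later use of internal_port can expect a value to be present.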