From d33e4109c328829a2af36d30dce552e2d00a273b Mon Sep 17 00:00:00 2001
From: Tristan Druyen
Date: Wed, 27 Nov 2024 10:04:02 +0100
Subject: [PATCH] Add automatic open port picking

---
 Cargo.lock                  | 10 ++++++++++
 llama_proxy_man/Cargo.toml  |  1 +
 llama_proxy_man/config.yaml | 30 ++++++++++++++++--------------
 llama_proxy_man/src/main.rs | 32 ++++++++++++++++++++++++--------
 4 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b633a23..59b898b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4276,6 +4276,7 @@ dependencies = [
  "futures",
  "hyper",
  "itertools 0.13.0",
+ "openport",
  "pin-project-lite",
  "reqwest",
  "reqwest-middleware",
@@ -5075,6 +5076,15 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "openport"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "365c699f76305b3e62588961a288be10f0819ef1391f25870a69b35a213577cc"
+dependencies = [
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "openssl"
 version = "0.10.66"
diff --git a/llama_proxy_man/Cargo.toml b/llama_proxy_man/Cargo.toml
index 23aa511..f1e8ecf 100644
--- a/llama_proxy_man/Cargo.toml
+++ b/llama_proxy_man/Cargo.toml
@@ -28,3 +28,4 @@ tower-http = { version = "0.5.2", features = ["trace"] }
 reqwest-retry = "0.6.1"
 reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
 itertools = "0.13.0"
+openport = { version = "0.1.1", features = ["rand"] }
diff --git a/llama_proxy_man/config.yaml b/llama_proxy_man/config.yaml
index c658f7a..eb637cc 100644
--- a/llama_proxy_man/config.yaml
+++ b/llama_proxy_man/config.yaml
@@ -1,10 +1,10 @@
 hardware:
-  ram: 64G
+  ram: 48G
   vram: 30G
 models:
   - name: "tabby-code"
     port: 18080
-    internal_port: 28080
+    # internal_port: 28080 # Optional
     autostart: true
     vram_usage: 26.7G # Coder-32B + draft 0.5B
     ram_usage: 3G # Coder-32B + draft 0.5B
@@ -25,7 +25,6 @@ models:
       draft-min: 5
   - name: "tabby-embeddings"
     port: 18081
-    internal_port: 28081
     vram_usage: 0.4G
     ram_usage: 2.5G
     env:
@@ -39,32 +38,35 @@ models:
       embeddings: true
   - name: "big-chat"
     port: 18082
-    internal_port: 28082
     vram_usage: 26.5G
     ram_usage: 2.5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
-      gpu-layers: 9999
       ctx-size: 16384
       flash-attn: true
-      # host: 0.0.0.0
+      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
+      gpu-layers-draft: 9999
+      # draft-max: 16
+      # draft-min: 5
   - name: "bigger-chat"
     port: 18083
-    internal_port: 28083
     vram_usage: 29G
-    ram_usage: 4G
+    ram_usage: 5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
-      gpu-layers: 9999
-      flash-attn: true
-      # ctx-size: 8192
       ctx-size: 16384
-      # host: 0.0.0.0
+      flash-attn: true
       cache-type-k: q8_0
       cache-type-v: q8_0
+      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
+      gpu-layers-draft: 0
+      # draft-max: 16
+      # draft-min: 5
diff --git a/llama_proxy_man/src/main.rs b/llama_proxy_man/src/main.rs
index 53d0d74..cf08966 100644
--- a/llama_proxy_man/src/main.rs
+++ b/llama_proxy_man/src/main.rs
@@ -42,24 +42,40 @@ pub fn initialize_logger() {
     });
 }
 
-#[derive(Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize)]
 struct Config {
     hardware: Hardware,
     models: Vec<ModelConfig>,
 }
 
-#[derive(Debug, Deserialize)]
+impl Config {
+    // TODO split up into raw deser config and "parsed"/"processed" config which always has a port
+    fn pick_open_ports(self) -> Self {
+        let mut config = self.clone();
+        for model in &mut config.models {
+            if model.internal_port.is_none() {
+                model.internal_port = Some(
+                    openport::pick_random_unused_port()
+                        .expect(format!("No open port found for {:?}", model).as_str()),
+                );
+            }
+        }
+        config
+    }
+}
+
+#[derive(Clone, Debug, Deserialize)]
 struct Hardware {
     ram: String,
     vram: String,
 }
 
-#[derive(Debug, Deserialize, Clone)]
+#[derive(Clone, Debug, Deserialize)]
 struct ModelConfig {
     #[allow(dead_code)]
     name: String,
     port: u16,
-    internal_port: u16,
+    internal_port: Option<u16>,
     env: HashMap<String, serde_yaml::Value>,
     args: HashMap<String, serde_yaml::Value>,
     vram_usage: String,
@@ -91,7 +107,7 @@ async fn main() {
     initialize_logger();
     // Read and parse the YAML configuration
     let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
-    let config: Config = serde_yaml::from_str(&config_str).expect("Failed to parse config.yaml");
+    let config: Config = serde_yaml::from_str::<Config>(&config_str).expect("Failed to parse config.yaml").pick_open_ports();
 
     // Parse hardware resources
     let total_ram = parse_size(&config.hardware.ram).expect("Invalid RAM size in config");
@@ -270,7 +286,7 @@ async fn handle_request(
             cmd.args(&args);
             // TODO use openport crate via pick_random_unused_port for determining these
             cmd.arg("--port");
-            cmd.arg(format!("{}", model_config.internal_port));
+            cmd.arg(format!("{}", model_config.internal_port.expect("Unexpected empty port, should've been picked")));
             cmd.stdout(Stdio::null()).stderr(Stdio::null());
             // TODO save output and allow retrieval via api
             tracing::info!("Starting llama-server with {:?}", cmd);
@@ -295,7 +311,7 @@ async fn handle_request(
 
     // Wait for the instance to be ready
     is_llama_instance_running(&instance).await?;
-    wait_for_port(model_config.internal_port).await?;
+    wait_for_port(model_config.internal_port.expect("Unexpected empty port, should've been picked")).await?;
 
     // Proxy the request
     let retry_policy = reqwest_retry::policies::ExponentialBackoff::builder()
@@ -312,7 +328,7 @@ async fn handle_request(
 
     let uri = format!(
         "http://127.0.0.1:{}{}",
-        model_config.internal_port,
+        model_config.internal_port.expect("Unexpected empty port, should've been picked"),
         req.uri().path_and_query().map(|x| x.as_str()).unwrap_or("")
    );
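
Not part of the patch above: a minimal usage sketch of the fallback behaviour this change introduces. It assumes openport::pick_random_unused_port() returns Option<u16> (its use with .expect in the patch only implies an Option- or Result-like return), and resolve_internal_port is a hypothetical helper, not code from this repository.

    // Sketch: keep an explicit `internal_port` from config.yaml when present,
    // otherwise pick a random free port, mirroring Config::pick_open_ports.
    fn resolve_internal_port(configured: Option<u16>) -> u16 {
        configured.unwrap_or_else(|| {
            // Requires the openport crate with its "rand" feature, as added in Cargo.toml.
            openport::pick_random_unused_port().expect("no open port available")
        })
    }

    fn main() {
        println!("explicit: {}", resolve_internal_port(Some(28080)));
        println!("random:   {}", resolve_internal_port(None));
    }

In the patch itself this fallback runs once, right after the YAML is parsed (pick_open_ports()), so every later use of internal_port can expect a value to be present.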