Add automatic open port picking

2024-11-27 10:04:02 +01:00 · 2024-11-27 10:04:02 +01:00 · d33e4109c3
commit d33e4109c3
parent 3b4655728d
4 changed files with 51 additions and 22 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4276,6 +4276,7 @@ dependencies = [
 "futures",
 "hyper",
 "itertools 0.13.0",
 "openport",
 "pin-project-lite",
 "reqwest",
 "reqwest-middleware",
@ -5075,6 +5076,15 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 [[package]]
 name = "openport"
 version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "365c699f76305b3e62588961a288be10f0819ef1391f25870a69b35a213577cc"
 dependencies = [
 "rand 0.8.5",
 ]
 [[package]]
 name = "openssl"
 version = "0.10.66"
--- a/llama_proxy_man/Cargo.toml
+++ b/llama_proxy_man/Cargo.toml
@ -28,3 +28,4 @@ tower-http = { version = "0.5.2", features = ["trace"] }
 reqwest-retry = "0.6.1"
 reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
 itertools = "0.13.0"
 openport = { version = "0.1.1", features = ["rand"] }
--- a/llama_proxy_man/config.yaml
+++ b/llama_proxy_man/config.yaml
@ -1,10 +1,10 @@
 hardware:
-  ram: 64G
+  ram: 48G
  vram: 30G
 models:
  - name: "tabby-code"
    port: 18080
-    internal_port: 28080
+    # internal_port: 28080 # Optional
    autostart: true
    vram_usage: 26.7G   # Coder-32B + draft 0.5B
    ram_usage: 3G       # Coder-32B + draft 0.5B
@ -25,7 +25,6 @@ models:
      draft-min: 5
  - name: "tabby-embeddings"
    port: 18081
    internal_port: 28081
    vram_usage: 0.4G
    ram_usage: 2.5G
    env:
@ -39,32 +38,35 @@ models:
      embeddings: true
  - name: "big-chat"
    port: 18082
    internal_port: 28082
    vram_usage: 26.5G
    ram_usage: 2.5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
      gpu-layers: 9999
      ctx-size: 16384
      flash-attn: true
-      # host: 0.0.0.0
+      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
      gpu-layers-draft: 9999
      # draft-max: 16
      # draft-min: 5
  - name: "bigger-chat"
    port: 18083
    internal_port: 28083
    vram_usage: 29G
-    ram_usage: 4G
+    ram_usage: 5G
    env:
      CUDA_VISIBLE_DEVICES: 0
      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
    args:
      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
      gpu-layers: 9999
      flash-attn: true
      # ctx-size: 8192
      ctx-size: 16384
-      # host: 0.0.0.0
+      flash-attn: true
      cache-type-k: q8_0
      cache-type-v: q8_0
      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
      gpu-layers: 9999
      model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
      gpu-layers-draft: 0
      # draft-max: 16
      # draft-min: 5
--- a/llama_proxy_man/src/main.rs
+++ b/llama_proxy_man/src/main.rs
@ -42,24 +42,40 @@ pub fn initialize_logger() {
    });
 }
-#[derive(Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize)]
 /// Top-level proxy configuration, deserialized from `config.yaml`.
 struct Config {
    // Machine-wide memory budget (the `hardware:` section of the YAML).
    hardware: Hardware,
    // One entry per item in the YAML `models:` list.
    models: Vec<ModelConfig>,
 }
-#[derive(Debug, Deserialize)]
+impl Config {
    // TODO split up into raw deser config and "parsed"/"processed" config which always has a port
    /// Returns the config with every missing `internal_port` filled in.
    ///
    /// Models that already specify an `internal_port` are left untouched. For
    /// the others a random unused port is requested from the `openport` crate.
    ///
    /// The backing servers are only launched later, so a port that looks free
    /// right now may already be promised to another model. To avoid handing
    /// out the same port twice, every `port` / `internal_port` present in the
    /// config and every port picked here is treated as reserved.
    ///
    /// # Panics
    ///
    /// Panics with `No open port found for <model>` when no usable port could
    /// be found for a model after a bounded number of attempts.
    fn pick_open_ports(mut self) -> Self {
        // Upper bound on picks per model so a crowded port range cannot spin forever.
        const MAX_ATTEMPTS: usize = 16;

        // Ports already claimed by the configuration. They are not necessarily
        // bound yet, so the OS-level "unused" check alone cannot exclude them.
        let mut reserved: std::collections::HashSet<u16> = self
            .models
            .iter()
            .flat_map(|model| std::iter::once(model.port).chain(model.internal_port))
            .collect();

        for model in &mut self.models {
            if model.internal_port.is_some() {
                continue;
            }
            let port = (0..MAX_ATTEMPTS)
                .filter_map(|_| openport::pick_random_unused_port())
                // `insert` returns false for a port that is already reserved,
                // which rejects the candidate and triggers another pick.
                .find(|candidate| reserved.insert(*candidate))
                // The message is only formatted on the failure path.
                .unwrap_or_else(|| panic!("No open port found for {:?}", model));
            model.internal_port = Some(port);
        }
        self
    }
 }
 /// Total memory available on the host, as written in the `hardware:` section
 /// of `config.yaml`.
 #[derive(Clone, Debug, Deserialize)]
 struct Hardware {
    // Human-readable size string (e.g. "48G" in config.yaml); converted with
    // `parse_size` in `main`.
    ram: String,
    // Human-readable size string (e.g. "30G" in config.yaml).
    // NOTE(review): presumably parsed the same way as `ram` — confirm in `main`.
    vram: String,
 }
-#[derive(Debug, Deserialize, Clone)]
+#[derive(Clone, Debug, Deserialize)]
 struct ModelConfig {
    #[allow(dead_code)]
    name: String,
    port: u16,
-    internal_port: u16,
+    internal_port: Option<u16>,
    env: HashMap<String, String>,
    args: HashMap<String, String>,
    vram_usage: String,
@ -91,7 +107,7 @@ async fn main() {
    initialize_logger();
    // Read and parse the YAML configuration
    let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
-    let config: Config = serde_yaml::from_str(&config_str).expect("Failed to parse config.yaml");
+    let config: Config = serde_yaml::from_str::<Config>(&config_str).expect("Failed to parse config.yaml").pick_open_ports();
    // Parse hardware resources
    let total_ram = parse_size(&config.hardware.ram).expect("Invalid RAM size in config");
@ -270,7 +286,7 @@ async fn handle_request(
            cmd.args(&args);
            // `internal_port` is either set in config.yaml or picked at startup by `Config::pick_open_ports`
            cmd.arg("--port");
-            cmd.arg(format!("{}", model_config.internal_port));
+            cmd.arg(format!("{}", model_config.internal_port.expect("Unexpected empty port, should've been picked")));
            cmd.stdout(Stdio::null()).stderr(Stdio::null()); // TODO save output and allow retrieval via api
            tracing::info!("Starting llama-server with {:?}", cmd);
@ -295,7 +311,7 @@ async fn handle_request(
    // Wait for the instance to be ready
    is_llama_instance_running(&instance).await?;
-    wait_for_port(model_config.internal_port).await?;
+    wait_for_port(model_config.internal_port.expect("Unexpected empty port, should've been picked")).await?;
    // Proxy the request
    let retry_policy = reqwest_retry::policies::ExponentialBackoff::builder()
@ -312,7 +328,7 @@ async fn handle_request(
    let uri = format!(
        "http://127.0.0.1:{}{}",
-        model_config.internal_port,
+        model_config.internal_port.expect("Unexpected empty port, should've been picked"),
        req.uri().path_and_query().map(|x| x.as_str()).unwrap_or("")
    );