Add automatic open port picking

Tristan D. 2024-11-27 10:04:02 +01:00
parent 3b4655728d
commit d33e4109c3
Signed by: tristan
SSH key fingerprint: SHA256:3RU4RLOoM8oAjFU19f1W6t8uouZbA7GWkaSW6rjp1k8
4 changed files with 51 additions and 22 deletions

Cargo.lock (generated)

@@ -4276,6 +4276,7 @@ dependencies = [
  "futures",
  "hyper",
  "itertools 0.13.0",
+ "openport",
  "pin-project-lite",
  "reqwest",
  "reqwest-middleware",
@@ -5075,6 +5076,15 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"

+[[package]]
+name = "openport"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "365c699f76305b3e62588961a288be10f0819ef1391f25870a69b35a213577cc"
+dependencies = [
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "openssl"
 version = "0.10.66"

Cargo.toml

@@ -28,3 +28,4 @@ tower-http = { version = "0.5.2", features = ["trace"] }
 reqwest-retry = "0.6.1"
 reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
 itertools = "0.13.0"
+openport = { version = "0.1.1", features = ["rand"] }

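For context: the openport crate added here exposes pick_random_unused_port(), which returns an Option<u16> and, as used in this commit, needs the "rand" feature enabled above. A minimal standalone sketch (not part of this commit) of that call:

// Minimal sketch: pick a free TCP port the way the new pick_open_ports() helper does.
// Assumes openport = { version = "0.1.1", features = ["rand"] } as added in Cargo.toml above.
fn main() {
    // Returns Option<u16>; None means no free port was found.
    match openport::pick_random_unused_port() {
        Some(port) => println!("a llama-server instance could bind to 127.0.0.1:{port}"),
        None => eprintln!("no free port found"),
    }
}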
config.yaml

@@ -1,10 +1,10 @@
 hardware:
-  ram: 64G
+  ram: 48G
   vram: 30G
 models:
   - name: "tabby-code"
     port: 18080
-    internal_port: 28080
+    # internal_port: 28080 # Optional
     autostart: true
     vram_usage: 26.7G # Coder-32B + draft 0.5B
     ram_usage: 3G # Coder-32B + draft 0.5B
@@ -25,7 +25,6 @@ models:
       draft-min: 5
   - name: "tabby-embeddings"
     port: 18081
-    internal_port: 28081
     vram_usage: 0.4G
     ram_usage: 2.5G
     env:
@@ -39,32 +38,35 @@ models:
       embeddings: true
   - name: "big-chat"
     port: 18082
-    internal_port: 28082
     vram_usage: 26.5G
     ram_usage: 2.5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
-      gpu-layers: 9999
       ctx-size: 16384
       flash-attn: true
       # host: 0.0.0.0
+      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
+      gpu-layers-draft: 9999
+      # draft-max: 16
+      # draft-min: 5
   - name: "bigger-chat"
     port: 18083
-    internal_port: 28083
     vram_usage: 29G
-    ram_usage: 4G
+    ram_usage: 5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
-      gpu-layers: 9999
-      flash-attn: true
-      # ctx-size: 8192
-      ctx-size: 16384
-      # host: 0.0.0.0
+      flash-attn: true
+      cache-type-k: q8_0
+      cache-type-v: q8_0
+      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
+      gpu-layers-draft: 0
       # draft-max: 16
       # draft-min: 5

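The effect of the config change: internal_port is no longer required per model. A minimal sketch (a trimmed-down ModelConfig, not the commit's full struct) of how a commented-out internal_port deserializes to None, using the serde and serde_yaml dependencies the project already has:

// Sketch only: with internal_port declared as Option<u16>, omitting or commenting
// out the key in config.yaml yields None, which pick_open_ports() fills in later.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct ModelConfig {
    name: String,
    port: u16,
    internal_port: Option<u16>,
}

fn main() {
    let yaml = r#"
name: "tabby-embeddings"
port: 18081
# internal_port: 28081
"#;
    let model: ModelConfig = serde_yaml::from_str(yaml).expect("Failed to parse model entry");
    assert_eq!(model.internal_port, None);
    println!("{model:?}");
}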
src/main.rs

@@ -42,24 +42,40 @@ pub fn initialize_logger() {
     });
 }

-#[derive(Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize)]
 struct Config {
     hardware: Hardware,
     models: Vec<ModelConfig>,
 }

-#[derive(Debug, Deserialize)]
+impl Config {
+    // TODO split up into raw deser config and "parsed"/"processed" config which always has a port
+    fn pick_open_ports(self) -> Self {
+        let mut config = self.clone();
+        for model in &mut config.models {
+            if model.internal_port.is_none() {
+                model.internal_port = Some(
+                    openport::pick_random_unused_port()
+                        .expect(format!("No open port found for {:?}", model).as_str()),
+                );
+            }
+        }
+        config
+    }
+}
+
+#[derive(Clone, Debug, Deserialize)]
 struct Hardware {
     ram: String,
     vram: String,
 }

-#[derive(Debug, Deserialize, Clone)]
+#[derive(Clone, Debug, Deserialize)]
 struct ModelConfig {
     #[allow(dead_code)]
     name: String,
     port: u16,
-    internal_port: u16,
+    internal_port: Option<u16>,
     env: HashMap<String, String>,
     args: HashMap<String, String>,
     vram_usage: String,
@@ -91,7 +107,7 @@ async fn main() {
     initialize_logger();
     // Read and parse the YAML configuration
     let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
-    let config: Config = serde_yaml::from_str(&config_str).expect("Failed to parse config.yaml");
+    let config: Config = serde_yaml::from_str::<Config>(&config_str).expect("Failed to parse config.yaml").pick_open_ports();

     // Parse hardware resources
     let total_ram = parse_size(&config.hardware.ram).expect("Invalid RAM size in config");
@@ -270,7 +286,7 @@ async fn handle_request(
     cmd.args(&args);
     // TODO use openport crate via pick_random_unused_port for determining these
     cmd.arg("--port");
-    cmd.arg(format!("{}", model_config.internal_port));
+    cmd.arg(format!("{}", model_config.internal_port.expect("Unexpected empty port, should've been picked")));
     cmd.stdout(Stdio::null()).stderr(Stdio::null()); // TODO save output and allow retrieval via api

     tracing::info!("Starting llama-server with {:?}", cmd);
@@ -295,7 +311,7 @@

     // Wait for the instance to be ready
     is_llama_instance_running(&instance).await?;
-    wait_for_port(model_config.internal_port).await?;
+    wait_for_port(model_config.internal_port.expect("Unexpected empty port, should've been picked")).await?;

     // Proxy the request
     let retry_policy = reqwest_retry::policies::ExponentialBackoff::builder()
@@ -312,7 +328,7 @@

     let uri = format!(
         "http://127.0.0.1:{}{}",
-        model_config.internal_port,
+        model_config.internal_port.expect("Unexpected empty port, should've been picked"),
         req.uri().path_and_query().map(|x| x.as_str()).unwrap_or("")
     );
