Add automatic open port picking
This commit is contained in:
parent
3b4655728d
commit
d33e4109c3
4 changed files with 51 additions and 22 deletions
10
Cargo.lock
generated
10
Cargo.lock
generated
|
@ -4276,6 +4276,7 @@ dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"hyper",
|
"hyper",
|
||||||
"itertools 0.13.0",
|
"itertools 0.13.0",
|
||||||
|
"openport",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"reqwest-middleware",
|
"reqwest-middleware",
|
||||||
|
@ -5075,6 +5076,15 @@ version = "1.19.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "openport"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "365c699f76305b3e62588961a288be10f0819ef1391f25870a69b35a213577cc"
|
||||||
|
dependencies = [
|
||||||
|
"rand 0.8.5",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "openssl"
|
name = "openssl"
|
||||||
version = "0.10.66"
|
version = "0.10.66"
|
||||||
|
|
|
@ -28,3 +28,4 @@ tower-http = { version = "0.5.2", features = ["trace"] }
|
||||||
reqwest-retry = "0.6.1"
|
reqwest-retry = "0.6.1"
|
||||||
reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
|
reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
|
||||||
itertools = "0.13.0"
|
itertools = "0.13.0"
|
||||||
|
openport = { version = "0.1.1", features = ["rand"] }
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
hardware:
|
hardware:
|
||||||
ram: 64G
|
ram: 48G
|
||||||
vram: 30G
|
vram: 30G
|
||||||
models:
|
models:
|
||||||
- name: "tabby-code"
|
- name: "tabby-code"
|
||||||
port: 18080
|
port: 18080
|
||||||
internal_port: 28080
|
# internal_port: 28080 # Optional
|
||||||
autostart: true
|
autostart: true
|
||||||
vram_usage: 26.7G # Coder-32B + draft 0.5B
|
vram_usage: 26.7G # Coder-32B + draft 0.5B
|
||||||
ram_usage: 3G # Coder-32B + draft 0.5B
|
ram_usage: 3G # Coder-32B + draft 0.5B
|
||||||
|
@ -25,7 +25,6 @@ models:
|
||||||
draft-min: 5
|
draft-min: 5
|
||||||
- name: "tabby-embeddings"
|
- name: "tabby-embeddings"
|
||||||
port: 18081
|
port: 18081
|
||||||
internal_port: 28081
|
|
||||||
vram_usage: 0.4G
|
vram_usage: 0.4G
|
||||||
ram_usage: 2.5G
|
ram_usage: 2.5G
|
||||||
env:
|
env:
|
||||||
|
@ -39,32 +38,35 @@ models:
|
||||||
embeddings: true
|
embeddings: true
|
||||||
- name: "big-chat"
|
- name: "big-chat"
|
||||||
port: 18082
|
port: 18082
|
||||||
internal_port: 28082
|
|
||||||
vram_usage: 26.5G
|
vram_usage: 26.5G
|
||||||
ram_usage: 2.5G
|
ram_usage: 2.5G
|
||||||
env:
|
env:
|
||||||
CUDA_VISIBLE_DEVICES: 0
|
CUDA_VISIBLE_DEVICES: 0
|
||||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||||
args:
|
args:
|
||||||
model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
|
|
||||||
gpu-layers: 9999
|
|
||||||
ctx-size: 16384
|
ctx-size: 16384
|
||||||
flash-attn: true
|
flash-attn: true
|
||||||
# host: 0.0.0.0
|
model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
|
||||||
|
gpu-layers: 9999
|
||||||
|
model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
|
||||||
|
gpu-layers-draft: 9999
|
||||||
|
# draft-max: 16
|
||||||
|
# draft-min: 5
|
||||||
- name: "bigger-chat"
|
- name: "bigger-chat"
|
||||||
port: 18083
|
port: 18083
|
||||||
internal_port: 28083
|
|
||||||
vram_usage: 29G
|
vram_usage: 29G
|
||||||
ram_usage: 4G
|
ram_usage: 5G
|
||||||
env:
|
env:
|
||||||
CUDA_VISIBLE_DEVICES: 0
|
CUDA_VISIBLE_DEVICES: 0
|
||||||
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
HSA_OVERRIDE_GFX_VERSION: '11.0.0'
|
||||||
args:
|
args:
|
||||||
model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
|
|
||||||
gpu-layers: 9999
|
|
||||||
flash-attn: true
|
|
||||||
# ctx-size: 8192
|
|
||||||
ctx-size: 16384
|
ctx-size: 16384
|
||||||
# host: 0.0.0.0
|
flash-attn: true
|
||||||
cache-type-k: q8_0
|
cache-type-k: q8_0
|
||||||
cache-type-v: q8_0
|
cache-type-v: q8_0
|
||||||
|
model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
|
||||||
|
gpu-layers: 9999
|
||||||
|
model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
|
||||||
|
gpu-layers-draft: 0
|
||||||
|
# draft-max: 16
|
||||||
|
# draft-min: 5
|
||||||
|
|
|
@ -42,24 +42,40 @@ pub fn initialize_logger() {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Clone, Debug, Deserialize)]
|
||||||
struct Config {
|
struct Config {
|
||||||
hardware: Hardware,
|
hardware: Hardware,
|
||||||
models: Vec<ModelConfig>,
|
models: Vec<ModelConfig>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
impl Config {
|
||||||
|
// TODO split up into raw deser config and "parsed"/"processed" config which always has a port
|
||||||
|
fn pick_open_ports(self) -> Self {
|
||||||
|
let mut config = self.clone();
|
||||||
|
for model in &mut config.models {
|
||||||
|
if model.internal_port.is_none() {
|
||||||
|
model.internal_port = Some(
|
||||||
|
openport::pick_random_unused_port()
|
||||||
|
.expect(format!("No open port found for {:?}", model).as_str()),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
config
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Deserialize)]
|
||||||
struct Hardware {
|
struct Hardware {
|
||||||
ram: String,
|
ram: String,
|
||||||
vram: String,
|
vram: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Clone)]
|
#[derive(Clone, Debug, Deserialize)]
|
||||||
struct ModelConfig {
|
struct ModelConfig {
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
name: String,
|
name: String,
|
||||||
port: u16,
|
port: u16,
|
||||||
internal_port: u16,
|
internal_port: Option<u16>,
|
||||||
env: HashMap<String, String>,
|
env: HashMap<String, String>,
|
||||||
args: HashMap<String, String>,
|
args: HashMap<String, String>,
|
||||||
vram_usage: String,
|
vram_usage: String,
|
||||||
|
@ -91,7 +107,7 @@ async fn main() {
|
||||||
initialize_logger();
|
initialize_logger();
|
||||||
// Read and parse the YAML configuration
|
// Read and parse the YAML configuration
|
||||||
let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
|
let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
|
||||||
let config: Config = serde_yaml::from_str(&config_str).expect("Failed to parse config.yaml");
|
let config: Config = serde_yaml::from_str::<Config>(&config_str).expect("Failed to parse config.yaml").pick_open_ports();
|
||||||
|
|
||||||
// Parse hardware resources
|
// Parse hardware resources
|
||||||
let total_ram = parse_size(&config.hardware.ram).expect("Invalid RAM size in config");
|
let total_ram = parse_size(&config.hardware.ram).expect("Invalid RAM size in config");
|
||||||
|
@ -270,7 +286,7 @@ async fn handle_request(
|
||||||
cmd.args(&args);
|
cmd.args(&args);
|
||||||
// TODO use openport crate via pick_random_unused_port for determining these
|
// TODO use openport crate via pick_random_unused_port for determining these
|
||||||
cmd.arg("--port");
|
cmd.arg("--port");
|
||||||
cmd.arg(format!("{}", model_config.internal_port));
|
cmd.arg(format!("{}", model_config.internal_port.expect("Unexpected empty port, should've been picked")));
|
||||||
cmd.stdout(Stdio::null()).stderr(Stdio::null()); // TODO save output and allow retrieval via api
|
cmd.stdout(Stdio::null()).stderr(Stdio::null()); // TODO save output and allow retrieval via api
|
||||||
|
|
||||||
tracing::info!("Starting llama-server with {:?}", cmd);
|
tracing::info!("Starting llama-server with {:?}", cmd);
|
||||||
|
@ -295,7 +311,7 @@ async fn handle_request(
|
||||||
|
|
||||||
// Wait for the instance to be ready
|
// Wait for the instance to be ready
|
||||||
is_llama_instance_running(&instance).await?;
|
is_llama_instance_running(&instance).await?;
|
||||||
wait_for_port(model_config.internal_port).await?;
|
wait_for_port(model_config.internal_port.expect("Unexpected empty port, should've been picked")).await?;
|
||||||
|
|
||||||
// Proxy the request
|
// Proxy the request
|
||||||
let retry_policy = reqwest_retry::policies::ExponentialBackoff::builder()
|
let retry_policy = reqwest_retry::policies::ExponentialBackoff::builder()
|
||||||
|
@ -312,7 +328,7 @@ async fn handle_request(
|
||||||
|
|
||||||
let uri = format!(
|
let uri = format!(
|
||||||
"http://127.0.0.1:{}{}",
|
"http://127.0.0.1:{}{}",
|
||||||
model_config.internal_port,
|
model_config.internal_port.expect("Unexpected empty port, should've been picked"),
|
||||||
req.uri().path_and_query().map(|x| x.as_str()).unwrap_or("")
|
req.uri().path_and_query().map(|x| x.as_str()).unwrap_or("")
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue