Add automatic open port picking

parent 3b4655728d
commit d33e4109c3

4 changed files with 51 additions and 22 deletions
Cargo.lock (generated, 10 changes)
@@ -4276,6 +4276,7 @@ dependencies = [
  "futures",
  "hyper",
  "itertools 0.13.0",
+ "openport",
  "pin-project-lite",
  "reqwest",
  "reqwest-middleware",
@@ -5075,6 +5076,15 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "openport"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "365c699f76305b3e62588961a288be10f0819ef1391f25870a69b35a213577cc"
+dependencies = [
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "openssl"
 version = "0.10.66"
Cargo.toml

@@ -28,3 +28,4 @@ tower-http = { version = "0.5.2", features = ["trace"] }
 reqwest-retry = "0.6.1"
 reqwest-middleware = { version = "0.3.3", features = ["charset", "http2", "json", "multipart", "rustls-tls"] }
 itertools = "0.13.0"
+openport = { version = "0.1.1", features = ["rand"] }
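The new openport dependency (with its rand feature) supplies the free-port lookup used in main.rs below. As a minimal standalone sketch of that call, assuming only what the diff shows, namely that pick_random_unused_port() yields a port or fails in a way .expect() can handle:

// Standalone sketch of the openport call relied on by this commit; the exact
// return type is not shown in the diff, only that .expect() works on it.
fn main() {
    let port = openport::pick_random_unused_port().expect("no free port found");
    println!("picked free port {port}");
}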
config.yaml

@@ -1,10 +1,10 @@
 hardware:
-  ram: 64G
+  ram: 48G
   vram: 30G
 models:
   - name: "tabby-code"
     port: 18080
-    internal_port: 28080
+    # internal_port: 28080 # Optional
     autostart: true
     vram_usage: 26.7G # Coder-32B + draft 0.5B
     ram_usage: 3G # Coder-32B + draft 0.5B
@@ -25,7 +25,6 @@ models:
       draft-min: 5
   - name: "tabby-embeddings"
     port: 18081
-    internal_port: 28081
     vram_usage: 0.4G
     ram_usage: 2.5G
     env:
@@ -39,32 +38,35 @@ models:
       embeddings: true
   - name: "big-chat"
     port: 18082
-    internal_port: 28082
     vram_usage: 26.5G
     ram_usage: 2.5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
-      gpu-layers: 9999
       ctx-size: 16384
       flash-attn: true
-      # host: 0.0.0.0
+      model: /media/SanDisk/ai/models_live/Qwen2.5-32B-Instruct-Q5_K_L.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Qwen2.5-0.5B-Instruct-Q8_0.gguf
+      gpu-layers-draft: 9999
+      # draft-max: 16
+      # draft-min: 5
   - name: "bigger-chat"
     port: 18083
-    internal_port: 28083
     vram_usage: 29G
-    ram_usage: 4G
+    ram_usage: 5G
     env:
       CUDA_VISIBLE_DEVICES: 0
       HSA_OVERRIDE_GFX_VERSION: '11.0.0'
     args:
-      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
-      gpu-layers: 9999
-      flash-attn: true
-      # ctx-size: 8192
       ctx-size: 16384
-      # host: 0.0.0.0
+      flash-attn: true
       cache-type-k: q8_0
       cache-type-v: q8_0
+      model: /media/SanDisk/ai/models_live/Llama-3.1-Nemotron-70B-Instruct-HF-IQ3_XXS.gguf
+      gpu-layers: 9999
+      model-draft: /media/SanDisk/ai/models_live/Llama-3.2-1B-Instruct-Q8_0.gguf
+      gpu-layers-draft: 0
+      # draft-max: 16
+      # draft-min: 5
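With this commit, internal_port becomes optional in config.yaml: when the key is omitted (or commented out, as for tabby-code above), the proxy picks a free port at startup. A reduced sketch of why omitting the key works, using a cut-down ModelConfig with only the fields relevant here (the real struct also carries env, args and the usage fields):

// Reduced reproduction: serde deserializes a missing `internal_port` key to
// None, which pick_open_ports() later fills in.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct ModelConfig {
    name: String,
    port: u16,
    internal_port: Option<u16>,
}

fn main() {
    let yaml = "name: \"big-chat\"\nport: 18082\n";
    let model: ModelConfig = serde_yaml::from_str(yaml).expect("parse failed");
    assert!(model.internal_port.is_none()); // left for the proxy to pick
    println!("{model:?}");
}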
main.rs

@@ -42,24 +42,40 @@ pub fn initialize_logger() {
     });
 }
 
-#[derive(Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize)]
 struct Config {
     hardware: Hardware,
     models: Vec<ModelConfig>,
 }
 
-#[derive(Debug, Deserialize)]
+impl Config {
+    // TODO split up into raw deser config and "parsed"/"processed" config which always has a port
+    fn pick_open_ports(self) -> Self {
+        let mut config = self.clone();
+        for model in &mut config.models {
+            if model.internal_port.is_none() {
+                model.internal_port = Some(
+                    openport::pick_random_unused_port()
+                        .expect(format!("No open port found for {:?}", model).as_str()),
+                );
+            }
+        }
+        config
+    }
+}
+
+#[derive(Clone, Debug, Deserialize)]
 struct Hardware {
     ram: String,
     vram: String,
 }
 
-#[derive(Debug, Deserialize, Clone)]
+#[derive(Clone, Debug, Deserialize)]
 struct ModelConfig {
     #[allow(dead_code)]
     name: String,
     port: u16,
-    internal_port: u16,
+    internal_port: Option<u16>,
     env: HashMap<String, String>,
     args: HashMap<String, String>,
     vram_usage: String,
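The TODO on pick_open_ports already hints at a cleaner shape: deserialize into a raw struct where internal_port is optional, then convert into a processed struct where it is guaranteed. A sketch of that split, with hypothetical names (RawModelConfig, ResolvedModelConfig, resolve) that are not part of this commit:

// Hypothetical follow-up to the TODO above, not the committed design: the raw
// config mirrors the YAML, the resolved config always has a concrete port.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct RawModelConfig {
    name: String,
    port: u16,
    internal_port: Option<u16>, // may be absent in config.yaml
}

#[derive(Debug)]
struct ResolvedModelConfig {
    name: String,
    port: u16,
    internal_port: u16, // guaranteed after resolution
}

impl RawModelConfig {
    fn resolve(self) -> ResolvedModelConfig {
        let internal_port = self.internal_port.unwrap_or_else(|| {
            openport::pick_random_unused_port().expect("No open port found")
        });
        ResolvedModelConfig {
            name: self.name,
            port: self.port,
            internal_port,
        }
    }
}

Downstream code could then drop the repeated .expect("Unexpected empty port, ...") calls that appear later in this diff, since the resolved type would guarantee a port.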
@@ -91,7 +107,7 @@ async fn main() {
     initialize_logger();
     // Read and parse the YAML configuration
     let config_str = std::fs::read_to_string("config.yaml").expect("Failed to read config.yaml");
-    let config: Config = serde_yaml::from_str(&config_str).expect("Failed to parse config.yaml");
+    let config: Config = serde_yaml::from_str::<Config>(&config_str).expect("Failed to parse config.yaml").pick_open_ports();
 
     // Parse hardware resources
     let total_ram = parse_size(&config.hardware.ram).expect("Invalid RAM size in config");
@@ -270,7 +286,7 @@ async fn handle_request(
     cmd.args(&args);
     // TODO use openport crate via pick_random_unused_port for determining these
     cmd.arg("--port");
-    cmd.arg(format!("{}", model_config.internal_port));
+    cmd.arg(format!("{}", model_config.internal_port.expect("Unexpected empty port, should've been picked")));
     cmd.stdout(Stdio::null()).stderr(Stdio::null()); // TODO save output and allow retrieval via api
 
     tracing::info!("Starting llama-server with {:?}", cmd);
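The expect("Unexpected empty port, should've been picked") pattern introduced here recurs twice more below. A small hypothetical accessor (not in this commit) would keep the invariant and the message in one place:

// Hypothetical convenience, not part of the commit: centralize the
// "port must already be picked" expectation on the existing ModelConfig.
impl ModelConfig {
    fn picked_internal_port(&self) -> u16 {
        self.internal_port
            .expect("Unexpected empty port, should've been picked")
    }
}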
@@ -295,7 +311,7 @@ async fn handle_request(
 
     // Wait for the instance to be ready
     is_llama_instance_running(&instance).await?;
-    wait_for_port(model_config.internal_port).await?;
+    wait_for_port(model_config.internal_port.expect("Unexpected empty port, should've been picked")).await?;
 
     // Proxy the request
     let retry_policy = reqwest_retry::policies::ExponentialBackoff::builder()
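wait_for_port is called with the now-unwrapped port, but its body is outside this diff. A plausible sketch of such a readiness check, assuming tokio (which the async main implies); the retry count, delay, and return type here are assumptions, not the project's actual signature:

// Hypothetical readiness probe: retry a TCP connect to localhost until the
// picked port accepts connections or a timeout budget is exhausted.
use std::time::Duration;
use tokio::{net::TcpStream, time::sleep};

async fn wait_for_port(port: u16) -> Result<(), std::io::Error> {
    for _ in 0..50 {
        if TcpStream::connect(("127.0.0.1", port)).await.is_ok() {
            return Ok(());
        }
        sleep(Duration::from_millis(200)).await;
    }
    Err(std::io::Error::new(
        std::io::ErrorKind::TimedOut,
        format!("port {port} never opened"),
    ))
}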
@@ -312,7 +328,7 @@ async fn handle_request(
 
     let uri = format!(
         "http://127.0.0.1:{}{}",
-        model_config.internal_port,
+        model_config.internal_port.expect("Unexpected empty port, should've been picked"),
         req.uri().path_and_query().map(|x| x.as_str()).unwrap_or("")
     );
 