hf_text-generation-inference/launcher/src/main.rs

use clap::Parser;
use serde_json::Value;
use std::env;
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
use std::process::ExitCode;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::mpsc::TryRecvError;
use std::sync::Arc;
use std::sync::{mpsc, Mutex};
use std::thread;
use std::thread::sleep;
use std::time::{Duration, Instant};
use std::{fs, io};
use subprocess::{Popen, PopenConfig, PopenError, Redirection};

/// App Configuration
// Every field can be supplied as a `--long-flag` or through an environment
// variable (the `long` and `env` clap attributes). Plain `//` comments are
// used on the fields on purpose: `///` doc comments would be picked up by
// clap and change the generated help text.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    // Model identifier: passed as the positional argument of
    // `text-generation-server serve` and as `--tokenizer-name` to the router.
    #[clap(default_value = "bigscience/bloom-560m", long, env)]
    model_id: String,
    // Optional model revision, forwarded to the shards as `--revision`.
    #[clap(long, env)]
    revision: Option<String>,
    // Number of shard processes to start; `main` falls back to 1 when unset.
    #[clap(long, env)]
    num_shard: Option<usize>,
    // When set, `--quantize` is forwarded to every shard.
    #[clap(long, env)]
    quantize: bool,
    // Forwarded to the router as `--max-concurrent-requests`.
    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,
    // Forwarded to the router as `--max-input-length`.
    #[clap(default_value = "1000", long, env)]
    max_input_length: usize,
    // Forwarded to the router as `--max-batch-size`.
    #[clap(default_value = "32", long, env)]
    max_batch_size: usize,
    // Forwarded to the router as `--max-waiting-tokens`.
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,
    // Forwarded to the router as `--port` (also available as `-p`).
    #[clap(default_value = "3000", long, short, env)]
    port: u16,
    // Prefix of the shard Unix domain socket paths: the launcher watches
    // `<shard_uds_path>-<rank>` for readiness and points the router at
    // `<shard_uds_path>-0`.
    #[clap(default_value = "/tmp/text-generation-server", long, env)]
    shard_uds_path: String,
    // Exported to every shard as the `MASTER_ADDR` environment variable.
    #[clap(default_value = "0.0.0.0", long, env)]
    master_addr: String,
    // Exported to every shard as the `MASTER_PORT` environment variable.
    #[clap(default_value = "6000", long, env)]
    master_port: usize,
    // Selects the JSON tracing formatter for the launcher and forwards
    // `--json-output` to the router.
    #[clap(long, env)]
    json_output: bool,
}

fn main() -> ExitCode {
    // Pattern match configuration
    let Args {
        model_id,
        revision,
        num_shard,
        quantize,
        max_concurrent_requests,
        max_input_length,
        max_batch_size,
        max_waiting_tokens,
        port,
        shard_uds_path,
        master_addr,
        master_port,
        json_output,
    } = Args::parse();

    if json_output {
        tracing_subscriber::fmt().json().init();
    } else {
        tracing_subscriber::fmt().compact().init();
    }

    // By default we only have one master shard
    let num_shard = num_shard.unwrap_or(1);

    // Signal handler
    let running = Arc::new(AtomicBool::new(true));
    let r = running.clone();
    ctrlc::set_handler(move || {
        r.store(false, Ordering::SeqCst);
    })
    .expect("Error setting Ctrl-C handler");

    // Shared shutdown bool
    let shutdown = Arc::new(Mutex::new(false));
    // Shared shutdown channel
    // When shutting down, the main thread will wait for all senders to be dropped
    let (shutdown_sender, shutdown_receiver) = mpsc::channel();

    // Shared channel to track shard status
    let (status_sender, status_receiver) = mpsc::channel();

    // Start shard processes
    for rank in 0..num_shard {
        let model_id = model_id.clone();
        let revision = revision.clone();
        let uds_path = shard_uds_path.clone();
        let master_addr = master_addr.clone();
        let status_sender = status_sender.clone();
        let shutdown = shutdown.clone();
        let shutdown_sender = shutdown_sender.clone();
        thread::spawn(move || {
            shard_manager(
                model_id,
                revision,
                quantize,
                uds_path,
                rank,
                num_shard,
                master_addr,
                master_port,
                status_sender,
                shutdown,
                shutdown_sender,
            )
        });
    }
    drop(shutdown_sender);

    // Wait for shard to start
    let mut shard_ready = 0;
    while running.load(Ordering::SeqCst) {
        match status_receiver.try_recv() {
            Ok(ShardStatus::Ready) => {
                shard_ready += 1;
                if shard_ready == num_shard {
                    break;
                }
            }
            Err(TryRecvError::Empty) => {
                sleep(Duration::from_millis(100));
            }
            Ok(ShardStatus::Failed((rank, err))) => {
                tracing::error!("Shard {} failed to start:\n{}", rank, err);
                shutdown_shards(shutdown, &shutdown_receiver);
                return ExitCode::FAILURE;
            }
            Err(TryRecvError::Disconnected) => {
                tracing::error!("Shard status channel disconnected");
                shutdown_shards(shutdown, &shutdown_receiver);
                return ExitCode::FAILURE;
            }
        }
    }

    // We might have received a termination signal
    if !running.load(Ordering::SeqCst) {
        shutdown_shards(shutdown, &shutdown_receiver);
        return ExitCode::SUCCESS;
    }

    // All shard started
    // Start webserver
    tracing::info!("Starting Webserver");
    let mut argv = vec![
        "text-generation-router".to_string(),
        "--max-concurrent-requests".to_string(),
        max_concurrent_requests.to_string(),
        "--max-input-length".to_string(),
        max_input_length.to_string(),
        "--max-batch-size".to_string(),
        max_batch_size.to_string(),
        "--max-waiting-tokens".to_string(),
        max_waiting_tokens.to_string(),
        "--port".to_string(),
        port.to_string(),
        "--master-shard-uds-path".to_string(),
        format!("{}-0", shard_uds_path),
        "--tokenizer-name".to_string(),
        model_id,
    ];

    if json_output {
        argv.push("--json-output".to_string());
    }

    let mut webserver = match Popen::create(
        &argv,
        PopenConfig {
            stdout: Redirection::Pipe,
            stderr: Redirection::Pipe,
            // Needed for the shutdown procedure
            setpgid: true,
            ..Default::default()
        },
    ) {
        Ok(p) => p,
        Err(err) => {
            tracing::error!("Failed to start webserver: {}", err);
            if let PopenError::IoError(err) = err {
                if err.kind() == io::ErrorKind::NotFound {
                    tracing::error!("text-generation-router not found in PATH");
                    tracing::error!("Please install it with `make install-router`")
                }
            } else {
                tracing::error!("{}", err);
            }

            shutdown_shards(shutdown, &shutdown_receiver);
            return ExitCode::FAILURE;
        }
    };

    // Redirect STDOUT and STDERR to the console
    let webserver_stdout = webserver.stdout.take().unwrap();
    let webserver_stderr = webserver.stderr.take().unwrap();

    thread::spawn(move || {
        let stdout = BufReader::new(webserver_stdout);
        let stderr = BufReader::new(webserver_stderr);
        for line in stdout.lines() {
            println!("{}", line.unwrap());
        }
        for line in stderr.lines() {
            println!("{}", line.unwrap());
        }
    });

    // Default exit code
    let mut exit_code = ExitCode::SUCCESS;

    while running.load(Ordering::SeqCst) {
        if let Ok(ShardStatus::Failed((rank, err))) = status_receiver.try_recv() {
            tracing::error!("Shard {} failed:\n{}", rank, err);
            exit_code = ExitCode::FAILURE;
            break;
        };

        match webserver.poll() {
            Some(_) => {
                tracing::error!("Webserver Crashed");
                shutdown_shards(shutdown, &shutdown_receiver);
                return ExitCode::FAILURE;
            }
            None => {
                sleep(Duration::from_millis(100));
            }
        };
    }

    // Graceful termination
    webserver.terminate().unwrap();
    tracing::info!("Waiting for webserver to gracefully shutdown");
    webserver.wait_timeout(Duration::from_secs(90)).unwrap();
    tracing::info!("Webserver terminated");
    shutdown_shards(shutdown, &shutdown_receiver);

    exit_code
}

/// Status messages sent by the `shard_manager` threads to `main` over the
/// shard status channel.
#[derive(Debug)]
enum ShardStatus {
    /// The shard's Unix domain socket path exists: the shard is considered ready.
    Ready,
    /// The shard could not be started or its process exited.
    /// Carries the shard rank and an error description (the spawn error or
    /// the content of the shard's STDERR).
    Failed((usize, String)),
}

#[allow(clippy::too_many_arguments)]
fn shard_manager(
    model_id: String,
    revision: Option<String>,
    quantize: bool,
    uds_path: String,
    rank: usize,
    world_size: usize,
    master_addr: String,
    master_port: usize,
    status_sender: mpsc::Sender<ShardStatus>,
    shutdown: Arc<Mutex<bool>>,
    _shutdown_sender: mpsc::Sender<()>,
) {
    // Get UDS path
    let uds_string = format!("{}-{}", uds_path, rank);
    let uds = Path::new(&uds_string);
    // Clean previous runs
    fs::remove_file(uds).unwrap_or_default();

    // Process args
    let mut shard_argv = vec![
        "text-generation-server".to_string(),
        "serve".to_string(),
        model_id,
        "--uds-path".to_string(),
        uds_path,
        "--logger-level".to_string(),
        "ERROR".to_string(),
        "--json-output".to_string(),
    ];

    if world_size > 1 {
        shard_argv.push("--sharded".to_string());
    }

    if quantize {
        shard_argv.push("--quantize".to_string())
    }

    if let Some(revision) = revision {
        shard_argv.push("--revision".to_string());
        shard_argv.push(revision)
    }

    let mut env = vec![
        ("RANK".into(), rank.to_string().into()),
        ("WORLD_SIZE".into(), world_size.to_string().into()),
        ("MASTER_ADDR".into(), master_addr.into()),
        ("MASTER_PORT".into(), master_port.to_string().into()),
        ("SAFETENSORS_FAST_GPU".into(), "1".into()),
    ];

    // If the HUGGINGFACE_HUB_CACHE env var is set, pass it to the shard
    // Useful when running inside a docker container
    if let Ok(huggingface_hub_cache) = env::var("HUGGINGFACE_HUB_CACHE") {
        env.push(("HUGGINGFACE_HUB_CACHE".into(), huggingface_hub_cache.into()));
    };

    // If the WEIGHTS_CACHE_OVERRIDE env var is set, pass it to the shard
    // Useful when running inside a HuggingFace Inference Endpoint
    if let Ok(weights_cache_override) = env::var("WEIGHTS_CACHE_OVERRIDE") {
        env.push((
            "WEIGHTS_CACHE_OVERRIDE".into(),
            weights_cache_override.into(),
        ));
    };

    // If the CUDA_VISIBLE_DEVICES env var is set, pass it to the shard
    if let Ok(cuda_visible_devices) = env::var("CUDA_VISIBLE_DEVICES") {
        env.push(("CUDA_VISIBLE_DEVICES".into(), cuda_visible_devices.into()));
    };

    // Start process
    tracing::info!("Starting shard {}", rank);
    let mut p = match Popen::create(
        &shard_argv,
        PopenConfig {
            stdout: Redirection::Pipe,
            stderr: Redirection::Pipe,
            // Needed for the shutdown procedure
            setpgid: true,
            // NCCL env vars
            env: Some(env),
            ..Default::default()
        },
    ) {
        Ok(p) => p,
        Err(err) => {
            if let PopenError::IoError(ref err) = err {
                if err.kind() == io::ErrorKind::NotFound {
                    tracing::error!("text-generation-server not found in PATH");
                    tracing::error!("Please install it with `make install-server`")
                }
            }
            status_sender
                .send(ShardStatus::Failed((rank, err.to_string())))
                .unwrap();
            return;
        }
    };

    // Redirect STDOUT to the console
    let shard_stdout = p.stdout.take().unwrap();

    thread::spawn(move || {
        // Enter shard-manager tracing span
        let stdout = BufReader::new(shard_stdout);
        let _span = tracing::span!(tracing::Level::INFO, "shard-manager", rank = rank).entered();
        for line in stdout.lines() {
            // Parse loguru logs
            if let Ok(value) = serde_json::from_str::<Value>(&line.unwrap()) {
                if let Some(text) = value.get("text") {
                    // Format escaped newlines
                    tracing::error!("{}", text.to_string().replace("\\n", "\n"));
                }
            }
        }
    });

    let mut ready = false;
    let start_time = Instant::now();
    let mut wait_time = Instant::now();
    loop {
        // Process exited
        if p.poll().is_some() {
            let mut err = String::new();
            p.stderr.take().unwrap().read_to_string(&mut err).unwrap();
            status_sender
                .send(ShardStatus::Failed((rank, err)))
                .unwrap();
            return;
        }

        // We received a shutdown signal
        if *shutdown.lock().unwrap() {
            p.terminate().unwrap();
            let _ = p.wait_timeout(Duration::from_secs(90));
            tracing::info!("Shard {} terminated", rank);
            return;
        }

        // Shard is ready
        if uds.exists() && !ready {
            tracing::info!("Shard {} ready in {:?}", rank, start_time.elapsed());
            status_sender.send(ShardStatus::Ready).unwrap();
            ready = true;
        } else if !ready && wait_time.elapsed() > Duration::from_secs(10) {
            tracing::info!("Waiting for shard {} to be ready...", rank);
            wait_time = Instant::now();
        }
        sleep(Duration::from_millis(100));
    }
}

/// Asks every shard manager to stop and blocks until all of them have returned.
///
/// * `shutdown` - shared flag polled by the shard managers; set to `true` here.
/// * `shutdown_receiver` - receiving end of the shutdown channel; each shard
///   manager owns a sender that is dropped when it returns.
fn shutdown_shards(shutdown: Arc<Mutex<bool>>, shutdown_receiver: &mpsc::Receiver<()>) {
    tracing::info!("Shutting down shards");

    // Raise the flag watched by the shard managers. The lock guard is a
    // temporary, so the mutex is released at the end of this statement.
    *shutdown.lock().unwrap() = true;

    // `recv` only returns once every `shutdown_sender` has been dropped,
    // i.e. once all shard managers are done; its result carries no information.
    let _disconnected = shutdown_receiver.recv();
}
v0.1.0 2022-10-18 07:19:03 -06:00			`use clap::Parser;`
feat(server): Support SantaCoder (#26) 2023-01-20 04:24:39 -07:00			`use serde_json::Value;`
feat(server): Use safetensors Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2022-10-22 12:00:15 -06:00			`use std::env;`
v0.1.0 2022-10-18 07:19:03 -06:00			`use std::io::{BufRead, BufReader, Read};`
			`use std::path::Path;`
			`use std::process::ExitCode;`
			`use std::sync::atomic::{AtomicBool, Ordering};`
			`use std::sync::mpsc::TryRecvError;`
			`use std::sync::Arc;`
			`use std::sync::{mpsc, Mutex};`
			`use std::thread;`
			`use std::thread::sleep;`
			`use std::time::{Duration, Instant};`
			`use std::{fs, io};`
			`use subprocess::{Popen, PopenConfig, PopenError, Redirection};`

			`/// App Configuration`
			`#[derive(Parser, Debug)]`
			`#[clap(author, version, about, long_about = None)]`
			`struct Args {`
			`#[clap(default_value = "bigscience/bloom-560m", long, env)]`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`model_id: String,`
v0.1.0 2022-10-18 07:19:03 -06:00			`#[clap(long, env)]`
feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`revision: Option<String>,`
			`#[clap(long, env)]`
v0.1.0 2022-10-18 07:19:03 -06:00			`num_shard: Option<usize>,`
feat(server): Support bitsandbytes 2022-10-27 06:25:29 -06:00			`#[clap(long, env)]`
			`quantize: bool,`
v0.1.0 2022-10-18 07:19:03 -06:00			`#[clap(default_value = "128", long, env)]`
			`max_concurrent_requests: usize,`
			`#[clap(default_value = "1000", long, env)]`
			`max_input_length: usize,`
			`#[clap(default_value = "32", long, env)]`
			`max_batch_size: usize,`
feat(router): Add max_waiting_tokens 2022-10-21 08:40:05 -06:00			`#[clap(default_value = "20", long, env)]`
			`max_waiting_tokens: usize,`
v0.1.0 2022-10-18 07:19:03 -06:00			`#[clap(default_value = "3000", long, short, env)]`
			`port: u16,`
			`#[clap(default_value = "/tmp/text-generation-server", long, env)]`
			`shard_uds_path: String,`
fix(server): better handling of inference mode (#57) 2023-02-07 07:38:22 -07:00			`#[clap(default_value = "0.0.0.0", long, env)]`
v0.1.0 2022-10-18 07:19:03 -06:00			`master_addr: String,`
fix(server): better handling of inference mode (#57) 2023-02-07 07:38:22 -07:00			`#[clap(default_value = "6000", long, env)]`
v0.1.0 2022-10-18 07:19:03 -06:00			`master_port: usize,`
feat: Use json formatter by default in docker image 2022-11-02 10:29:56 -06:00			`#[clap(long, env)]`
			`json_output: bool,`
v0.1.0 2022-10-18 07:19:03 -06:00			`}`

			`fn main() -> ExitCode {`
			`// Pattern match configuration`
			`let Args {`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`model_id,`
feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`revision,`
v0.1.0 2022-10-18 07:19:03 -06:00			`num_shard,`
feat(server): Support bitsandbytes 2022-10-27 06:25:29 -06:00			`quantize,`
v0.1.0 2022-10-18 07:19:03 -06:00			`max_concurrent_requests,`
			`max_input_length,`
			`max_batch_size,`
feat(router): Add max_waiting_tokens 2022-10-21 08:40:05 -06:00			`max_waiting_tokens,`
v0.1.0 2022-10-18 07:19:03 -06:00			`port,`
			`shard_uds_path,`
			`master_addr,`
			`master_port,`
feat: Use json formatter by default in docker image 2022-11-02 10:29:56 -06:00			`json_output,`
v0.1.0 2022-10-18 07:19:03 -06:00			`} = Args::parse();`

feat: Use json formatter by default in docker image 2022-11-02 10:29:56 -06:00			`if json_output {`
			`tracing_subscriber::fmt().json().init();`
			`} else {`
			`tracing_subscriber::fmt().compact().init();`
			`}`

v0.1.0 2022-10-18 07:19:03 -06:00			`// By default we only have one master shard`
			`let num_shard = num_shard.unwrap_or(1);`

			`// Signal handler`
			`let running = Arc::new(AtomicBool::new(true));`
			`let r = running.clone();`
			`ctrlc::set_handler(move \|\| {`
			`r.store(false, Ordering::SeqCst);`
			`})`
			`.expect("Error setting Ctrl-C handler");`

			`// Shared shutdown bool`
			`let shutdown = Arc::new(Mutex::new(false));`
			`// Shared shutdown channel`
			`// When shutting down, the main thread will wait for all senders to be dropped`
			`let (shutdown_sender, shutdown_receiver) = mpsc::channel();`

			`// Shared channel to track shard status`
			`let (status_sender, status_receiver) = mpsc::channel();`

			`// Start shard processes`
			`for rank in 0..num_shard {`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`let model_id = model_id.clone();`
feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`let revision = revision.clone();`
v0.1.0 2022-10-18 07:19:03 -06:00			`let uds_path = shard_uds_path.clone();`
			`let master_addr = master_addr.clone();`
			`let status_sender = status_sender.clone();`
			`let shutdown = shutdown.clone();`
			`let shutdown_sender = shutdown_sender.clone();`
			`thread::spawn(move \|\| {`
			`shard_manager(`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`model_id,`
feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`revision,`
feat(server): Support bitsandbytes 2022-10-27 06:25:29 -06:00			`quantize,`
v0.1.0 2022-10-18 07:19:03 -06:00			`uds_path,`
			`rank,`
			`num_shard,`
			`master_addr,`
			`master_port,`
			`status_sender,`
			`shutdown,`
			`shutdown_sender,`
			`)`
			`});`
			`}`
			`drop(shutdown_sender);`

			`// Wait for shard to start`
			`let mut shard_ready = 0;`
			`while running.load(Ordering::SeqCst) {`
			`match status_receiver.try_recv() {`
			`Ok(ShardStatus::Ready) => {`
			`shard_ready += 1;`
			`if shard_ready == num_shard {`
			`break;`
			`}`
			`}`
			`Err(TryRecvError::Empty) => {`
			`sleep(Duration::from_millis(100));`
			`}`
			`Ok(ShardStatus::Failed((rank, err))) => {`
			`tracing::error!("Shard {} failed to start:\n{}", rank, err);`
			`shutdown_shards(shutdown, &shutdown_receiver);`
			`return ExitCode::FAILURE;`
			`}`
			`Err(TryRecvError::Disconnected) => {`
			`tracing::error!("Shard status channel disconnected");`
			`shutdown_shards(shutdown, &shutdown_receiver);`
			`return ExitCode::FAILURE;`
			`}`
			`}`
			`}`

			`// We might have received a termination signal`
			`if !running.load(Ordering::SeqCst) {`
			`shutdown_shards(shutdown, &shutdown_receiver);`
			`return ExitCode::SUCCESS;`
			`}`

			`// All shard started`
			`// Start webserver`
			`tracing::info!("Starting Webserver");`
feat: Use json formatter by default in docker image 2022-11-02 10:29:56 -06:00			`let mut argv = vec![`
			`"text-generation-router".to_string(),`
			`"--max-concurrent-requests".to_string(),`
			`max_concurrent_requests.to_string(),`
			`"--max-input-length".to_string(),`
			`max_input_length.to_string(),`
			`"--max-batch-size".to_string(),`
			`max_batch_size.to_string(),`
			`"--max-waiting-tokens".to_string(),`
			`max_waiting_tokens.to_string(),`
			`"--port".to_string(),`
			`port.to_string(),`
			`"--master-shard-uds-path".to_string(),`
			`format!("{}-0", shard_uds_path),`
			`"--tokenizer-name".to_string(),`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`model_id,`
feat: Use json formatter by default in docker image 2022-11-02 10:29:56 -06:00			`];`

			`if json_output {`
			`argv.push("--json-output".to_string());`
			`}`

v0.1.0 2022-10-18 07:19:03 -06:00			`let mut webserver = match Popen::create(`
feat: Use json formatter by default in docker image 2022-11-02 10:29:56 -06:00			`&argv,`
v0.1.0 2022-10-18 07:19:03 -06:00			`PopenConfig {`
			`stdout: Redirection::Pipe,`
			`stderr: Redirection::Pipe,`
			`// Needed for the shutdown procedure`
			`setpgid: true,`
			`..Default::default()`
			`},`
			`) {`
			`Ok(p) => p,`
			`Err(err) => {`
			`tracing::error!("Failed to start webserver: {}", err);`
			`if let PopenError::IoError(err) = err {`
			`if err.kind() == io::ErrorKind::NotFound {`
			`tracing::error!("text-generation-router not found in PATH");`
			tracing::error!("Please install it with `make install-router`")
			`}`
feat(server): Support bitsandbytes 2022-10-27 06:25:29 -06:00			`} else {`
			`tracing::error!("{}", err);`
v0.1.0 2022-10-18 07:19:03 -06:00			`}`

			`shutdown_shards(shutdown, &shutdown_receiver);`
			`return ExitCode::FAILURE;`
			`}`
			`};`

			`// Redirect STDOUT and STDERR to the console`
			`let webserver_stdout = webserver.stdout.take().unwrap();`
			`let webserver_stderr = webserver.stderr.take().unwrap();`

			`thread::spawn(move \|\| {`
			`let stdout = BufReader::new(webserver_stdout);`
			`let stderr = BufReader::new(webserver_stderr);`
			`for line in stdout.lines() {`
			`println!("{}", line.unwrap());`
			`}`
			`for line in stderr.lines() {`
			`println!("{}", line.unwrap());`
			`}`
			`});`

			`// Default exit code`
			`let mut exit_code = ExitCode::SUCCESS;`

			`while running.load(Ordering::SeqCst) {`
			`if let Ok(ShardStatus::Failed((rank, err))) = status_receiver.try_recv() {`
			`tracing::error!("Shard {} failed:\n{}", rank, err);`
			`exit_code = ExitCode::FAILURE;`
			`break;`
			`};`

			`match webserver.poll() {`
			`Some(_) => {`
			`tracing::error!("Webserver Crashed");`
			`shutdown_shards(shutdown, &shutdown_receiver);`
			`return ExitCode::FAILURE;`
			`}`
			`None => {`
			`sleep(Duration::from_millis(100));`
			`}`
			`};`
			`}`

			`// Graceful termination`
			`webserver.terminate().unwrap();`
			`tracing::info!("Waiting for webserver to gracefully shutdown");`
			`webserver.wait_timeout(Duration::from_secs(90)).unwrap();`
			`tracing::info!("Webserver terminated");`
			`shutdown_shards(shutdown, &shutdown_receiver);`

			`exit_code`
			`}`

			`#[derive(Debug)]`
			`enum ShardStatus {`
			`Ready,`
			`Failed((usize, String)),`
			`}`

			`#[allow(clippy::too_many_arguments)]`
			`fn shard_manager(`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`model_id: String,`
feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`revision: Option<String>,`
feat(server): Support bitsandbytes 2022-10-27 06:25:29 -06:00			`quantize: bool,`
v0.1.0 2022-10-18 07:19:03 -06:00			`uds_path: String,`
			`rank: usize,`
			`world_size: usize,`
			`master_addr: String,`
			`master_port: usize,`
			`status_sender: mpsc::Sender<ShardStatus>,`
			`shutdown: Arc<Mutex<bool>>,`
			`_shutdown_sender: mpsc::Sender<()>,`
			`) {`
			`// Get UDS path`
			`let uds_string = format!("{}-{}", uds_path, rank);`
			`let uds = Path::new(&uds_string);`
			`// Clean previous runs`
			`fs::remove_file(uds).unwrap_or_default();`

			`// Process args`
			`let mut shard_argv = vec![`
feat(server): Support all AutoModelForCausalLM on a best effort basis 2022-10-28 11:24:00 -06:00			`"text-generation-server".to_string(),`
v0.1.0 2022-10-18 07:19:03 -06:00			`"serve".to_string(),`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`model_id,`
v0.1.0 2022-10-18 07:19:03 -06:00			`"--uds-path".to_string(),`
			`uds_path,`
feat(launcher): Log server stdout (#19) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-01-05 04:01:23 -07:00			`"--logger-level".to_string(),`
			`"ERROR".to_string(),`
			`"--json-output".to_string(),`
v0.1.0 2022-10-18 07:19:03 -06:00			`];`

			`if world_size > 1 {`
			`shard_argv.push("--sharded".to_string());`
			`}`

feat(server): Support bitsandbytes 2022-10-27 06:25:29 -06:00			`if quantize {`
			`shard_argv.push("--quantize".to_string())`
			`}`

feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`if let Some(revision) = revision {`
			`shard_argv.push("--revision".to_string());`
			`shard_argv.push(revision)`
			`}`

feat(server): Use safetensors Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2022-10-22 12:00:15 -06:00			`let mut env = vec![`
fix(server): Minor refactorization using new_zeros (#24) - Fix some type hints, in particular base tokenizer class - Make use of `tensor.new_zero/empty` methods - Simplify env var string parsing in launcher 2023-01-17 01:10:22 -07:00			`("RANK".into(), rank.to_string().into()),`
			`("WORLD_SIZE".into(), world_size.to_string().into()),`
			`("MASTER_ADDR".into(), master_addr.into()),`
			`("MASTER_PORT".into(), master_port.to_string().into()),`
			`("SAFETENSORS_FAST_GPU".into(), "1".into()),`
feat(server): Use safetensors Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2022-10-22 12:00:15 -06:00			`];`

			`// If the HUGGINGFACE_HUB_CACHE env var is set, pass it to the shard`
			`// Useful when running inside a docker container`
			`if let Ok(huggingface_hub_cache) = env::var("HUGGINGFACE_HUB_CACHE") {`
feat(server): Support SantaCoder (#26) 2023-01-20 04:24:39 -07:00			`env.push(("HUGGINGFACE_HUB_CACHE".into(), huggingface_hub_cache.into()));`
feat(server): Use safetensors Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2022-10-22 12:00:15 -06:00			`};`
v0.1.0 2022-10-18 07:19:03 -06:00
feat(server): allow the server to use a local weight cache (#49) 2023-02-01 08:22:10 -07:00			`// If the WEIGHTS_CACHE_OVERRIDE env var is set, pass it to the shard`
			`// Useful when running inside a HuggingFace Inference Endpoint`
			`if let Ok(weights_cache_override) = env::var("WEIGHTS_CACHE_OVERRIDE") {`
feat(router): use background task to manage request queue (#52) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-02-02 06:59:27 -07:00			`env.push((`
			`"WEIGHTS_CACHE_OVERRIDE".into(),`
			`weights_cache_override.into(),`
			`));`
feat(server): allow the server to use a local weight cache (#49) 2023-02-01 08:22:10 -07:00			`};`

feat(launcher): Pass CUDA_VISIBLE_DEVICES to the shard 2022-11-04 11:31:08 -06:00			`// If the CUDA_VISIBLE_DEVICES env var is set, pass it to the shard`
			`if let Ok(cuda_visible_devices) = env::var("CUDA_VISIBLE_DEVICES") {`
feat(server): Support SantaCoder (#26) 2023-01-20 04:24:39 -07:00			`env.push(("CUDA_VISIBLE_DEVICES".into(), cuda_visible_devices.into()));`
feat(launcher): Pass CUDA_VISIBLE_DEVICES to the shard 2022-11-04 11:31:08 -06:00			`};`

v0.1.0 2022-10-18 07:19:03 -06:00			`// Start process`
			`tracing::info!("Starting shard {}", rank);`
			`let mut p = match Popen::create(`
			`&shard_argv,`
			`PopenConfig {`
			`stdout: Redirection::Pipe,`
			`stderr: Redirection::Pipe,`
			`// Needed for the shutdown procedure`
			`setpgid: true,`
			`// NCCL env vars`
feat(server): Use safetensors Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2022-10-22 12:00:15 -06:00			`env: Some(env),`
v0.1.0 2022-10-18 07:19:03 -06:00			`..Default::default()`
			`},`
			`) {`
			`Ok(p) => p,`
			`Err(err) => {`
			`if let PopenError::IoError(ref err) = err {`
			`if err.kind() == io::ErrorKind::NotFound {`
feat(server): Support all AutoModelForCausalLM on a best effort basis 2022-10-28 11:24:00 -06:00			`tracing::error!("text-generation-server not found in PATH");`
v0.1.0 2022-10-18 07:19:03 -06:00			tracing::error!("Please install it with `make install-server`")
			`}`
			`}`
			`status_sender`
			`.send(ShardStatus::Failed((rank, err.to_string())))`
			`.unwrap();`
			`return;`
			`}`
			`};`

feat(launcher): Log server stdout (#19) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-01-05 04:01:23 -07:00			`// Redirect STDOUT to the console`
			`let shard_stdout = p.stdout.take().unwrap();`

			`thread::spawn(move \|\| {`
			`// Enter shard-manager tracing span`
			`let stdout = BufReader::new(shard_stdout);`
			`let _span = tracing::span!(tracing::Level::INFO, "shard-manager", rank = rank).entered();`
			`for line in stdout.lines() {`
			`// Parse loguru logs`
			`if let Ok(value) = serde_json::from_str::<Value>(&line.unwrap()) {`
			`if let Some(text) = value.get("text") {`
			`// Format escaped newlines`
			`tracing::error!("{}", text.to_string().replace("\\n", "\n"));`
			`}`
			`}`
			`}`
			`});`

v0.1.0 2022-10-18 07:19:03 -06:00			`let mut ready = false;`
			`let start_time = Instant::now();`
feat(server): Use safetensors Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2022-10-22 12:00:15 -06:00			`let mut wait_time = Instant::now();`
v0.1.0 2022-10-18 07:19:03 -06:00			`loop {`
			`// Process exited`
			`if p.poll().is_some() {`
			`let mut err = String::new();`
			`p.stderr.take().unwrap().read_to_string(&mut err).unwrap();`
			`status_sender`
			`.send(ShardStatus::Failed((rank, err)))`
			`.unwrap();`
			`return;`
			`}`

			`// We received a shutdown signal`
			`if *shutdown.lock().unwrap() {`
			`p.terminate().unwrap();`
			`let _ = p.wait_timeout(Duration::from_secs(90));`
			`tracing::info!("Shard {} terminated", rank);`
			`return;`
			`}`

			`// Shard is ready`
			`if uds.exists() && !ready {`
			`tracing::info!("Shard {} ready in {:?}", rank, start_time.elapsed());`
			`status_sender.send(ShardStatus::Ready).unwrap();`
			`ready = true;`
feat(server): Support bitsandbytes 2022-10-27 06:25:29 -06:00			`} else if !ready && wait_time.elapsed() > Duration::from_secs(10) {`
			`tracing::info!("Waiting for shard {} to be ready...", rank);`
			`wait_time = Instant::now();`
v0.1.0 2022-10-18 07:19:03 -06:00			`}`
feat(server): Use safetensors Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2022-10-22 12:00:15 -06:00			`sleep(Duration::from_millis(100));`
v0.1.0 2022-10-18 07:19:03 -06:00			`}`
			`}`

			`fn shutdown_shards(shutdown: Arc<Mutex<bool>>, shutdown_receiver: &mpsc::Receiver<()>) {`
			`tracing::info!("Shutting down shards");`
			`// Update shutdown value to true`
			`// This will be picked up by the shard manager`
			`{`
			`let mut shutdown = shutdown.lock().unwrap();`
			`*shutdown = true;`
			`}`

			`// Wait for shards to shutdown`
			`// This will block till all shutdown_sender are dropped`
			`let _ = shutdown_receiver.recv();`
			`}`