Init
This commit is contained in:
commit
295831a481
|
@ -0,0 +1,37 @@
|
||||||
|
# BLOOM Inference
|
||||||
|
|
||||||
|
A Rust gRPC server for BLOOM Inference.
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
```shell
|
||||||
|
cd server
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
|
```shell
|
||||||
|
cd router
|
||||||
|
cargo build --release
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
```shell
|
||||||
|
python server/bloom_inference/main.py bigscience/bloom --num-gpus 8 --shard-directory /dev/shm/models
|
||||||
|
```
|
||||||
|
|
||||||
|
```shell
|
||||||
|
./router/target/release/router
|
||||||
|
```
|
||||||
|
|
||||||
|
## TODO:
|
||||||
|
|
||||||
|
- [ ] Improve model download
|
||||||
|
- Store "shardable" layers separately and layer by layer
|
||||||
|
- [ ] Add batching args to router CLI
|
||||||
|
- [ ] Add docstrings + comments everywhere as the codebase is fairly complicated
|
||||||
|
- [ ] Add tests
|
||||||
|
- [ ] Add shutdown logic in router and server
|
||||||
|
- [ ] Improve multi-processing logic in server
|
||||||
|
- [ ] Improve error handling everywhere
|
||||||
|
- [ ] Improve past key layer indexing?
|
|
@ -0,0 +1,83 @@
|
||||||
|
syntax = "proto3";

package generate.v1;

/// Text-generation service exposed by each model shard and consumed by the router.
service TextGeneration {
    /// Service discovery: lists the urls of all shards
    rpc ServiceDiscovery(Empty) returns (ServiceDiscoveryResponse) {}
    /// Empties batch cache
    rpc ClearCache(Empty) returns (Empty);
    /// Generate tokens for a batch without cache
    rpc Generate(Batch) returns (Response);
    /// Generate tokens for a batch with cache
    rpc GenerateWithCache(BatchCached) returns (Response);
}

message ServiceDiscoveryResponse {
    /// Urls of the shards (may carry a unix:// scheme)
    repeated string urls = 1;
}

/// Sampling parameters applied to the logits of one request.
message LogitsWarperParameters {
    float temperature = 1;
    uint32 top_k = 2;
    float top_p = 3;
    bool do_sample = 4;
}

message Request {
    /// Request ID
    uint64 id = 1;
    /// The generation context
    string inputs = 2;
    /// Logits Warper Parameters
    LogitsWarperParameters parameters = 3;
    /// Stopping criteria
    uint32 max_new_tokens = 4;
}

message Batch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests
    repeated Request requests = 2;
}

/// A batch whose state is already cached on the shards.
message BatchCached {
    /// Batch ID
    uint64 id = 1;
    /// Request ids within cache
    repeated uint64 request_ids = 2;
    /// Cache IDs
    repeated uint64 batch_cached_ids = 3;
    /// Batch size (sum of all batch sizes)
    uint32 total_batch_size = 4;
    /// Max sequence length
    uint32 max_sequence_length = 5;
}

message FinishedGeneration {
    /// ID of the original request
    uint64 id = 1;
    /// Output
    string output = 2;
}

message CacheEntry {
    /// Cache ID; same as batch ID
    uint64 id = 1;
    /// Requests present in cache entry
    repeated uint64 request_ids = 2;
    /// Sequence length
    uint32 sequence_length = 3;
}

message Response {
    /// Finished requests (optional)
    repeated FinishedGeneration finished = 1;
    /// Cache entry (optional)
    optional CacheEntry cache_entry = 2;
}


// Represent an empty message.
message Empty {}
|
|
@ -0,0 +1 @@
|
||||||
|
/target
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,28 @@
|
||||||
|
[package]
|
||||||
|
name = "bloom-inference"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
bloom-inference-client = { path = "client" }
|
||||||
|
futures = "0.3.24"
|
||||||
|
parking_lot = "0.12.1"
|
||||||
|
poem = "1.3.45"
|
||||||
|
serde = "1.0.145"
|
||||||
|
serde_json = "1.0.85"
|
||||||
|
tokenizers = "0.13.0"
|
||||||
|
tokio = { version = "1.21.1", features = ["rt-multi-thread", "parking_lot", "sync"] }
|
||||||
|
tracing = "0.1.36"
|
||||||
|
tracing-subscriber = "0.3.15"
|
||||||
|
|
||||||
|
[workspace]
|
||||||
|
members = [
|
||||||
|
"client",
|
||||||
|
]
|
||||||
|
|
||||||
|
[profile.release]
|
||||||
|
debug = 1
|
||||||
|
incremental = true
|
||||||
|
lto = "off"
|
|
@ -0,0 +1,19 @@
|
||||||
|
[package]
|
||||||
|
name = "bloom-inference-client"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
futures = "0.3.24"
|
||||||
|
#grpc-error-details = { path = "../../grpc-error-details" }
|
||||||
|
#grpc-metadata = { path = "../../grpc-metadata" }
|
||||||
|
prost = "^0.9"
|
||||||
|
thiserror = "1.0.37"
|
||||||
|
tokio = { version = "1.21.2", features = ["sync"] }
|
||||||
|
tonic = "^0.6"
|
||||||
|
tower = "^0.4"
|
||||||
|
tracing = "^0.1"
|
||||||
|
tracing-error = "^0.2"
|
||||||
|
|
||||||
|
[build-dependencies]
|
||||||
|
tonic-build = "0.6.2"
|
|
@ -0,0 +1,14 @@
|
||||||
|
use std::fs;
|
||||||
|
|
||||||
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
fs::create_dir("src/pb").unwrap_or(());
|
||||||
|
tonic_build::configure()
|
||||||
|
.build_client(true)
|
||||||
|
.build_server(false)
|
||||||
|
.out_dir("src/pb")
|
||||||
|
.include_file("mod.rs")
|
||||||
|
.compile(&["../../proto/generate.proto"], &["../../proto"])
|
||||||
|
.unwrap_or_else(|e| panic!("protobuf compilation failed: {}", e));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
|
@ -0,0 +1,104 @@
|
||||||
|
use crate::pb::generate::v1::text_generation_client::TextGenerationClient;
|
||||||
|
use crate::pb::generate::v1::*;
|
||||||
|
use crate::Result;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tonic::transport::{Channel, Uri};
|
||||||
|
use tower::timeout::Timeout;
|
||||||
|
use tracing::*;
|
||||||
|
|
||||||
|
/// BLOOM Inference gRPC client for a single shard.
#[derive(Clone)]
pub struct Client {
    // Generated tonic stub, wrapped so every request carries a timeout.
    stub: TextGenerationClient<Timeout<Channel>>,
}
|
||||||
|
|
||||||
|
impl Client {
|
||||||
|
/// Returns a client connected to the given url. Requests exceeding timeout will fail.
|
||||||
|
pub async fn connect(uri: Uri, timeout: Duration) -> Self {
|
||||||
|
let channel = Channel::builder(uri)
|
||||||
|
.connect()
|
||||||
|
.await
|
||||||
|
.expect("Transport error");
|
||||||
|
let timeout_channel = Timeout::new(channel, timeout);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
stub: TextGenerationClient::new(timeout_channel),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a client connected to the given unix socket. Requests exceeding timeout will fail.
|
||||||
|
pub async fn connect_uds(path: String, timeout: Duration) -> Self {
|
||||||
|
let channel = Channel::from_shared(format!("http://[::]:50051"))
|
||||||
|
.unwrap()
|
||||||
|
.connect_with_connector(tower::service_fn(move |_: Uri| {
|
||||||
|
tokio::net::UnixStream::connect(path.clone())
|
||||||
|
}))
|
||||||
|
.await
|
||||||
|
.expect("Transport error");
|
||||||
|
let timeout_channel = Timeout::new(channel, timeout);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
stub: TextGenerationClient::new(timeout_channel),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(skip(self))]
|
||||||
|
pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
|
||||||
|
let request = tonic::Request::new(Empty {});
|
||||||
|
let response = self
|
||||||
|
.stub
|
||||||
|
.service_discovery(request)
|
||||||
|
.instrument(info_span!("service_discovery"))
|
||||||
|
.await?;
|
||||||
|
let urls = response
|
||||||
|
.into_inner()
|
||||||
|
.urls
|
||||||
|
.into_iter()
|
||||||
|
.map(|url| match url.strip_prefix("unix://") {
|
||||||
|
None => url,
|
||||||
|
Some(stripped_url) => stripped_url.to_string(),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
Ok(urls)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(skip(self))]
|
||||||
|
pub async fn clear_cache(&mut self) -> Result<()> {
|
||||||
|
let request = tonic::Request::new(Empty {});
|
||||||
|
self.stub
|
||||||
|
.clear_cache(request)
|
||||||
|
.instrument(info_span!("clear_cache"))
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(skip(self))]
|
||||||
|
pub async fn generate(
|
||||||
|
&mut self,
|
||||||
|
request: Batch,
|
||||||
|
) -> Result<(Vec<FinishedGeneration>, Option<CacheEntry>)> {
|
||||||
|
let request = tonic::Request::new(request);
|
||||||
|
let response = self
|
||||||
|
.stub
|
||||||
|
.generate(request)
|
||||||
|
.instrument(info_span!("generate"))
|
||||||
|
.await?
|
||||||
|
.into_inner();
|
||||||
|
Ok((response.finished, response.cache_entry))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(skip(self))]
|
||||||
|
pub async fn generate_with_cache(
|
||||||
|
&mut self,
|
||||||
|
request: BatchCached,
|
||||||
|
) -> Result<(Vec<FinishedGeneration>, Option<CacheEntry>)> {
|
||||||
|
let request = tonic::Request::new(request);
|
||||||
|
let response = self
|
||||||
|
.stub
|
||||||
|
.generate_with_cache(request)
|
||||||
|
.instrument(info_span!("generate_with_cache"))
|
||||||
|
.await?
|
||||||
|
.into_inner();
|
||||||
|
Ok((response.finished, response.cache_entry))
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
//! BLOOM Inference gRPC client library

mod client;
mod pb;
mod sharded_client;

pub use client::Client;
pub use pb::generate::v1::{
    Batch, BatchCached, CacheEntry, FinishedGeneration, LogitsWarperParameters, Request,
};
pub use sharded_client::ShardedClient;
use thiserror::Error;
pub use tonic::transport::Uri;
use tonic::Status;

/// Error returned by every gRPC call in this crate.
#[derive(Error, Debug, Clone)]
#[error("Text generation client error: {msg:?}")]
pub struct ClientError {
    // Only the message is kept (not the tonic `Status`) so the error stays `Clone`.
    msg: String,
    // source: Status,
}

impl From<Status> for ClientError {
    fn from(err: Status) -> Self {
        Self {
            msg: err.to_string(),
            // source: err,
        }
    }
}

/// Convenience alias: every fallible API in this crate fails with [`ClientError`].
pub type Result<T> = std::result::Result<T, ClientError>;
|
|
@ -0,0 +1 @@
|
||||||
|
*.rs
|
|
@ -0,0 +1,106 @@
|
||||||
|
use crate::Result;
|
||||||
|
use crate::{Batch, BatchCached, CacheEntry, Client, FinishedGeneration};
|
||||||
|
use futures::future::join_all;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::sync::{broadcast, mpsc};
|
||||||
|
use tonic::transport::Uri;
|
||||||
|
|
||||||
|
/// Commands broadcast by `ShardedClient` to every per-shard task; each carries
/// the channel on which the shard's result must be sent back.
#[derive(Clone, Debug)]
enum Command {
    // Run generation on a new batch.
    Generate(
        Batch,
        mpsc::Sender<Result<(Vec<FinishedGeneration>, Option<CacheEntry>)>>,
    ),
    // Continue generation for a batch whose state is cached on the shards.
    GenerateWithCache(
        BatchCached,
        mpsc::Sender<Result<(Vec<FinishedGeneration>, Option<CacheEntry>)>>,
    ),
    // Drop every cache entry held by the shards.
    ClearCache(mpsc::Sender<Result<()>>),
}
|
||||||
|
|
||||||
|
/// Per-shard worker: forwards every broadcast command to its `Client` and
/// sends the result back on the command's reply channel.
async fn client_task(mut client: Client, mut request_subscriber: broadcast::Receiver<Command>) {
    // Exits when `recv` errors (sender dropped, or this receiver lagged).
    while let Ok(message) = request_subscriber.recv().await {
        match message {
            Command::Generate(batch, response_tx) => {
                let result = client.generate(batch).await;
                // The reply channel is bounded; once it is full (another shard
                // already answered) `try_send` fails and the reply is dropped.
                response_tx.try_send(result).unwrap_or(());
            }
            Command::GenerateWithCache(batch_cached, response_tx) => {
                let result = client.generate_with_cache(batch_cached).await;
                response_tx.try_send(result).unwrap_or(());
            }
            Command::ClearCache(response_tx) => {
                let result = client.clear_cache().await;
                response_tx.try_send(result).unwrap_or(());
            }
        };
    }
}
|
||||||
|
|
||||||
|
/// Client that fans every request out to all shards at once.
pub struct ShardedClient {
    // Broadcast side of the channel consumed by the per-shard `client_task`s.
    request_tx: broadcast::Sender<Command>,
}
|
||||||
|
|
||||||
|
impl ShardedClient {
|
||||||
|
fn new(mut clients: Vec<Client>) -> Self {
|
||||||
|
let (request_tx, _) = broadcast::channel(1);
|
||||||
|
|
||||||
|
for client in clients.drain(..) {
|
||||||
|
let request_subscriber = request_tx.subscribe();
|
||||||
|
tokio::spawn(client_task(client, request_subscriber));
|
||||||
|
}
|
||||||
|
|
||||||
|
Self { request_tx }
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn from_master_client(mut master_client: Client) -> Self {
|
||||||
|
let uris = master_client.service_discovery().await.unwrap();
|
||||||
|
let futures = uris
|
||||||
|
.into_iter()
|
||||||
|
.map(|path| Client::connect_uds(path, Duration::from_secs(5)));
|
||||||
|
let clients = join_all(futures).await;
|
||||||
|
Self::new(clients)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a client connected to the given url. Requests exceeding timeout will fail.
|
||||||
|
pub async fn connect(uri: Uri, timeout: Duration) -> Self {
|
||||||
|
let master_client = Client::connect(uri, timeout).await;
|
||||||
|
Self::from_master_client(master_client).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a client connected to the given unix socket. Requests exceeding timeout will fail.
|
||||||
|
pub async fn connect_uds(path: String, timeout: Duration) -> Self {
|
||||||
|
let master_client = Client::connect_uds(path, timeout).await;
|
||||||
|
Self::from_master_client(master_client).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn generate(
|
||||||
|
&self,
|
||||||
|
batch: Batch,
|
||||||
|
) -> Result<(Vec<FinishedGeneration>, Option<CacheEntry>)> {
|
||||||
|
let (response_tx, mut response_rx) = mpsc::channel(1);
|
||||||
|
self.request_tx
|
||||||
|
.send(Command::Generate(batch, response_tx))
|
||||||
|
.unwrap();
|
||||||
|
response_rx.recv().await.unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn generate_with_cache(
|
||||||
|
&self,
|
||||||
|
batch_cached: BatchCached,
|
||||||
|
) -> Result<(Vec<FinishedGeneration>, Option<CacheEntry>)> {
|
||||||
|
let (response_tx, mut response_rx) = mpsc::channel(1);
|
||||||
|
self.request_tx
|
||||||
|
.send(Command::GenerateWithCache(batch_cached, response_tx))
|
||||||
|
.unwrap();
|
||||||
|
response_rx.recv().await.unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn clear_cache(&self) -> Result<()> {
|
||||||
|
let (response_tx, mut response_rx) = mpsc::channel(1);
|
||||||
|
self.request_tx
|
||||||
|
.send(Command::ClearCache(response_tx))
|
||||||
|
.unwrap();
|
||||||
|
response_rx.recv().await.unwrap()
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,129 @@
|
||||||
|
/// This code is massively inspired by Tokio mini-redis
|
||||||
|
use crate::GenerateRequest;
|
||||||
|
use bloom_inference_client::{Batch, ClientError, LogitsWarperParameters, Request};
|
||||||
|
use parking_lot::RwLock;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::oneshot::Sender;
|
||||||
|
|
||||||
|
/// Request database shared between the HTTP handlers and the batching task.
///
/// Cloning is cheap: clones share the same `Arc`ed state.
#[derive(Debug, Clone)]
pub(crate) struct Db {
    pub shared: Arc<Shared>,
}
|
||||||
|
|
||||||
|
/// State shared by all `Db` clones, behind a read/write lock.
#[derive(Debug)]
pub struct Shared {
    state: RwLock<State>,
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
struct State {
    // Pending requests keyed by monotonically increasing request id, together
    // with the channel on which each result must be delivered.
    entries: BTreeMap<u64, (Request, Sender<Result<String, ClientError>>)>,

    /// Identifier to use for the next request. Ids only ever grow, so range
    /// scans over `entries` visit requests in arrival order.
    next_id: u64,

    // Identifier to use for the next batch.
    next_batch_id: u64,

    /// Id of the first request that has not been put in a batch yet;
    /// `next_batch` scans from here onward.
    next_batch_start_id: u64,
}
|
||||||
|
|
||||||
|
impl Db {
|
||||||
|
pub(crate) fn new() -> Self {
|
||||||
|
let shared = Arc::new(Shared {
|
||||||
|
state: RwLock::new(State {
|
||||||
|
entries: BTreeMap::new(),
|
||||||
|
next_id: 0,
|
||||||
|
next_batch_id: 0,
|
||||||
|
next_batch_start_id: 0,
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
Self { shared }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn append(&self, request: GenerateRequest, sender: Sender<Result<String, ClientError>>) {
|
||||||
|
let mut state = self.shared.state.write();
|
||||||
|
|
||||||
|
let id = state.next_id;
|
||||||
|
state.next_id += 1;
|
||||||
|
|
||||||
|
let parameters = Some(LogitsWarperParameters {
|
||||||
|
temperature: request.parameters.temperature,
|
||||||
|
top_k: request.parameters.top_k,
|
||||||
|
top_p: request.parameters.top_p,
|
||||||
|
do_sample: request.parameters.do_sample,
|
||||||
|
});
|
||||||
|
let request = Request {
|
||||||
|
id,
|
||||||
|
inputs: request.inputs,
|
||||||
|
parameters,
|
||||||
|
max_new_tokens: request.parameters.max_new_tokens,
|
||||||
|
};
|
||||||
|
state.entries.insert(id, (request, sender));
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn remove(&self, id: &u64) -> Option<(Request, Sender<Result<String, ClientError>>)> {
|
||||||
|
let mut state = self.shared.state.write();
|
||||||
|
state.entries.remove(id)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn len(&self) -> usize {
|
||||||
|
let state = self.shared.state.read();
|
||||||
|
state.entries.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next_requests(&self, max_size: usize) -> Option<(u64, Vec<Request>)> {
|
||||||
|
let state = self.shared.state.read();
|
||||||
|
|
||||||
|
let requests: Vec<Request> = state
|
||||||
|
.entries
|
||||||
|
.range(state.next_batch_start_id..)
|
||||||
|
.take(max_size)
|
||||||
|
.map(|(_, (request, _))| request.clone())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if requests.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
let last_id = requests.last().unwrap().id;
|
||||||
|
Some((last_id, requests))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn next_batch(&self, max_size: usize) -> Option<Batch> {
|
||||||
|
if let Some((last_id, requests)) = self.next_requests(max_size) {
|
||||||
|
let mut state = self.shared.state.write();
|
||||||
|
let batch = Batch {
|
||||||
|
id: state.next_batch_id,
|
||||||
|
requests,
|
||||||
|
};
|
||||||
|
state.next_batch_start_id = last_id + 1;
|
||||||
|
state.next_batch_id += 1;
|
||||||
|
return Some(batch);
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn next_batch_minimum_size(
|
||||||
|
&self,
|
||||||
|
min_size: usize,
|
||||||
|
max_size: usize,
|
||||||
|
) -> Option<Batch> {
|
||||||
|
if let Some((last_id, requests)) = self.next_requests(max_size) {
|
||||||
|
if requests.len() >= min_size {
|
||||||
|
let mut state = self.shared.state.write();
|
||||||
|
let batch = Batch {
|
||||||
|
id: state.next_batch_id,
|
||||||
|
requests,
|
||||||
|
};
|
||||||
|
state.next_batch_start_id = last_id + 1;
|
||||||
|
state.next_batch_id += 1;
|
||||||
|
return Some(batch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,130 @@
|
||||||
|
use crate::{Db, GenerateRequest};
|
||||||
|
use bloom_inference_client::{Batch, BatchCached, CacheEntry, ClientError, FinishedGeneration, ShardedClient};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::{oneshot, Notify};
|
||||||
|
|
||||||
|
// Maximum number of queued requests before new ones are rejected.
const MAX_LENGTH: usize = 128;

// Opaque inference failure surfaced to the HTTP layer (mapped to a 500 there).
pub struct InferError {}
|
||||||
|
|
||||||
|
/// Inference front-end handed to the HTTP handlers.
///
/// Cloning is cheap: clones share the request db and the batching-task handle.
#[derive(Clone)]
pub(crate) struct Infer {
    // Queue of pending requests, shared with the batching task.
    db: Db,
    shared: Arc<Shared>,
}
|
||||||
|
|
||||||
|
/// State shared between `Infer` handles and the background batching task.
struct Shared {
    // Notified every time a new request is appended to the db.
    batching_task: Notify,
}
|
||||||
|
|
||||||
|
impl Infer {
|
||||||
|
pub(crate) fn new(client: ShardedClient) -> Self {
|
||||||
|
let db = Db::new();
|
||||||
|
let shared = Arc::new(Shared {
|
||||||
|
batching_task: Notify::new(),
|
||||||
|
});
|
||||||
|
|
||||||
|
tokio::spawn(batching_task(client, db.clone(), shared.clone()));
|
||||||
|
|
||||||
|
Self { db, shared }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn infer(&self, request: GenerateRequest) -> Result<String, InferError> {
|
||||||
|
if self.db.len() > MAX_LENGTH {
|
||||||
|
return Err(InferError {});
|
||||||
|
}
|
||||||
|
let (request_tx, request_rx) = oneshot::channel();
|
||||||
|
self.db.append(request, request_tx);
|
||||||
|
self.shared.batching_task.notify_waiters();
|
||||||
|
match request_rx.await.unwrap() {
|
||||||
|
Ok(output) => Ok(output),
|
||||||
|
Err(_) => Err(InferError {})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Background task in charge of dynamic batching.
///
/// Waits to be notified by `Infer::infer`, drains the db into a first batch,
/// then keeps generating with the cached state, opportunistically merging in
/// newly arrived requests between rounds.
async fn batching_task(client: ShardedClient, db: Db, shared: Arc<Shared>) {
    loop {
        // Sleep until at least one request has been appended to the db.
        shared.batching_task.notified().await;

        if let Some(batch) = db.next_batch(32) {
            let mut cache_entry = infer_batch(batch, &client, &db).await;

            // Generation loop: runs until every request of the running batch
            // finished (i.e. the shards no longer return a cache entry).
            loop {
                if let Some(entry) = cache_entry {
                    let mut batch_cached_ids = vec![entry.id];
                    let mut total_batch_size = entry.request_ids.len();
                    let mut max_sequence_length = entry.sequence_length;
                    let mut request_ids = entry.request_ids;

                    // If the running batch got small, try to top it up with
                    // waiting requests — but only when at least 16 are queued,
                    // so the extra full-generation pass is worth it.
                    if total_batch_size <= 16 {
                        if let Some(batch) = db.next_batch_minimum_size(16, 48) {
                            let other_cache_entry = infer_batch(batch, &client, &db).await;

                            if let Some(entry) = other_cache_entry {
                                // Merge the new entry into the running batch.
                                batch_cached_ids.push(entry.id);
                                total_batch_size += entry.request_ids.len();
                                max_sequence_length =
                                    max_sequence_length.max(entry.sequence_length);
                                request_ids.extend(entry.request_ids.into_iter());
                            }
                        }
                    }

                    // `entry.request_ids` was moved out above; `entry.id` is
                    // still readable (partial move).
                    let batch_cached = BatchCached {
                        id: entry.id,
                        batch_cached_ids,
                        total_batch_size: total_batch_size as u32,
                        max_sequence_length,
                        request_ids,
                    };
                    cache_entry = infer_batch_cached(batch_cached, &client, &db).await;
                } else {
                    break;
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
async fn infer_batch_cached(batch: BatchCached, client: &ShardedClient, db: &Db) -> Option<CacheEntry> {
|
||||||
|
match client.generate_with_cache(batch.clone()).await {
|
||||||
|
Ok((finished, cache_entry)) => {
|
||||||
|
send_finished(finished, db);
|
||||||
|
cache_entry
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
println!("{:?}", err);
|
||||||
|
send_error(err, batch.request_ids, &db);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn infer_batch(batch: Batch, client: &ShardedClient, db: &Db) -> Option<CacheEntry> {
|
||||||
|
match client.generate(batch.clone()).await {
|
||||||
|
Ok((finished, cache_entry)) => {
|
||||||
|
send_finished(finished, db);
|
||||||
|
cache_entry
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
println!("{:?}", err);
|
||||||
|
send_error(err, batch.requests.into_iter().map(|req| req.id).collect(), &db);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fails every listed request with a clone of `error`.
fn send_error(error: ClientError, request_ids: Vec<u64>, db: &Db) {
    request_ids.into_iter().for_each(|id| {
        // The entry may already have been removed; a missing entry must not
        // panic and bring the whole batching task down.
        if let Some((_, response_tx)) = db.remove(&id) {
            // The receiver may have been dropped (caller went away): ignore.
            response_tx.send(Err(error.clone())).unwrap_or(());
        }
    });
}
|
||||||
|
|
||||||
|
/// Delivers each finished generation to the caller that queued it.
fn send_finished(finished: Vec<FinishedGeneration>, db: &Db) {
    finished.into_iter().for_each(|output| {
        // The entry may already have been removed; a missing entry must not
        // panic and bring the whole batching task down.
        if let Some((_, response_tx)) = db.remove(&output.id) {
            // The receiver may have been dropped (caller went away): ignore.
            response_tx.send(Ok(output.output)).unwrap_or(());
        }
    });
}
|
|
@ -0,0 +1,125 @@
|
||||||
|
use tokio::time::Instant;
|
||||||
|
|
||||||
|
use poem;
|
||||||
|
use poem::middleware::AddData;
|
||||||
|
use poem::web::Data;
|
||||||
|
use poem::{handler, listener::TcpListener, post, web::Json, EndpointExt, Result, Route, Server};
|
||||||
|
|
||||||
|
use bloom_inference_client::ShardedClient;
|
||||||
|
use serde::Deserialize;
|
||||||
|
use std::time::Duration;
|
||||||
|
use poem::http::StatusCode;
|
||||||
|
use tracing::instrument;
|
||||||
|
|
||||||
|
mod db;
|
||||||
|
|
||||||
|
use db::Db;
|
||||||
|
|
||||||
|
mod infer;
|
||||||
|
|
||||||
|
use infer::Infer;
|
||||||
|
|
||||||
|
/// User-tunable generation parameters; every field falls back to a serde
/// default when absent from the request body.
#[derive(Clone, Debug, Deserialize)]
struct GenerateParameters {
    #[serde(default = "default_temperature")]
    temperature: f32,
    #[serde(default = "default_top_k")]
    top_k: u32,
    #[serde(default = "default_top_p")]
    top_p: f32,
    #[serde(default = "default_do_sample")]
    do_sample: bool,
    #[serde(default = "default_max_new_tokens")]
    max_new_tokens: u32,
}

// Neutral temperature (no scaling of the logits).
fn default_temperature() -> f32 {
    1.0
}

// 0 presumably disables top-k filtering on the server — confirm shard semantics.
fn default_top_k() -> u32 {
    0
}

// 1.0 presumably disables nucleus (top-p) filtering — confirm shard semantics.
fn default_top_p() -> f32 {
    1.0
}

// Greedy decoding by default.
fn default_do_sample() -> bool {
    false
}

// Default number of tokens to generate per request.
fn default_max_new_tokens() -> u32 {
    20
}

/// Body of a `/generate` request.
#[derive(Clone, Debug, Deserialize)]
struct GenerateRequest {
    inputs: String,
    #[serde(default = "default_parameters")]
    parameters: GenerateParameters,
}

// Used when the whole `parameters` object is absent from the request body.
fn default_parameters() -> GenerateParameters {
    GenerateParameters {
        temperature: default_temperature(),
        top_k: default_top_k(),
        top_p: default_top_p(),
        do_sample: default_do_sample(),
        max_new_tokens: default_max_new_tokens(),
    }
}
|
||||||
|
|
||||||
|
#[handler]
|
||||||
|
#[instrument(skip(infer), fields(time, time_per_token))]
|
||||||
|
async fn generate(
|
||||||
|
infer: Data<&Infer>,
|
||||||
|
req: Json<GenerateRequest>,
|
||||||
|
) -> Result<Json<serde_json::Value>> {
|
||||||
|
let start = Instant::now();
|
||||||
|
|
||||||
|
let output = infer
|
||||||
|
.infer(GenerateRequest {
|
||||||
|
inputs: req.inputs.clone(),
|
||||||
|
parameters: req.parameters.clone(),
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
match output {
|
||||||
|
Ok(generated_text) => {
|
||||||
|
tracing::Span::current().record("time", format!("{:?}", start.elapsed()));
|
||||||
|
tracing::Span::current().record("time_per_token", format!("{:?}", start.elapsed() / req.parameters.max_new_tokens));
|
||||||
|
tracing::info!("response: {}", generated_text);
|
||||||
|
|
||||||
|
Ok(Json(serde_json::json!({
|
||||||
|
"generated_text": generated_text,
|
||||||
|
})))
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
Err(poem::Error::from_status(StatusCode::INTERNAL_SERVER_ERROR))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Router entry point: connects to the shards, then serves HTTP on port 3000.
#[tokio::main]
async fn main() -> Result<(), std::io::Error> {
    tracing_subscriber::fmt::init();

    // Connect to the master shard through its unix socket; the sharded client
    // then discovers and connects to the remaining shards itself.
    let sharded_client =
        ShardedClient::connect_uds("/tmp/bloom-inference-0".to_string(), Duration::from_secs(5))
            .await;
    // Start from a clean slate in case a previous router left cached state behind.
    sharded_client
        .clear_cache()
        .await
        .expect("Unable to clear cache");
    tracing::info!("Connected");

    let infer = Infer::new(sharded_client);

    let app = Route::new()
        .at("/generate", post(generate))
        .with(AddData::new(infer));
    Server::new(TcpListener::bind("127.0.0.1:3000"))
        .run(app)
        .await
}
|
Binary file not shown.
|
@ -0,0 +1,20 @@
|
||||||
|
# Generate the Python gRPC stubs from the shared proto definition, rewriting
# the absolute stub imports into relative ones so `bloom_inference.pb` works
# as a package.
gen-server:
	mkdir bloom_inference/pb || true
	python -m grpc_tools.protoc -I../proto --python_out=bloom_inference/pb --grpc_python_out=bloom_inference/pb ../proto/generate.proto
	find bloom_inference/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
	touch bloom_inference/pb/__init__.py

# Run the unit tests with coverage.
unit-tests:
	python -m pytest --cov=bloom_inference tests

# Same, but also emit a JUnit XML report (for CI).
unit-tests-reporting:
	python -m pytest --junitxml=report.xml --cov=bloom_inference tests

# Install with plain pip (generates the stubs first).
pip-install:
	pip install grpcio-tools
	make gen-server
	pip install .

# Install with poetry, then generate the stubs.
install:
	poetry install
	make gen-server
|
|
@ -0,0 +1,15 @@
|
||||||
|
# BLOOM Inference Python gRPC Server
|
||||||
|
|
||||||
|
A Python gRPC server for BLOOM Inference
|
||||||
|
|
||||||
|
## Local Install (with poetry)
|
||||||
|
|
||||||
|
```shell
|
||||||
|
make install
|
||||||
|
```
|
||||||
|
|
||||||
|
## Local Install (with pip)
|
||||||
|
|
||||||
|
```shell
|
||||||
|
make pip-install
|
||||||
|
```
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,48 @@
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict, Optional, List
|
||||||
|
|
||||||
|
from bloom_inference.pb import generate_pb2
|
||||||
|
from bloom_inference.utils import NextTokenChooser, StoppingCriteria
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CacheEntry:
    """Per-batch generation state kept between rounds on the server side.

    `to_pb` mirrors only the id/request/length fields into the protobuf
    message; the tensors never leave the server.
    """

    # Id of the batch this state belongs to.
    batch_id: int
    # Ids of the requests still being generated.
    request_ids: List[int]
    # Model inputs for the next forward pass, keyed by input name.
    input_ids: Dict[str, torch.Tensor]
    # Accumulated token ids per request (used below for sequence length).
    all_input_ids: List[torch.Tensor]
    # One sampling strategy per request.
    next_token_choosers: List[NextTokenChooser]
    # One stopping criterion per request.
    stopping_criterias: List[StoppingCriteria]

    def __len__(self):
        # A cache entry is "as long as" the number of live requests.
        return len(self.request_ids)

    def to_pb(self):
        """Serialize to a `generate_pb2.CacheEntry` protobuf message."""
        return generate_pb2.CacheEntry(
            id=self.batch_id,
            request_ids=self.request_ids,
            # Longest accumulated sequence across the live requests.
            sequence_length=max(len(entry) for entry in self.all_input_ids),
        )
|
||||||
|
|
||||||
|
|
||||||
|
class Cache:
    """In-memory store mapping batch ids to their `CacheEntry`."""

    def __init__(self):
        # Keyed by batch id. The annotation is quoted so the class does not
        # depend on `CacheEntry` being defined at evaluation time.
        self.cache: Dict[str, "CacheEntry"] = {}

    def pop(self, batch_id: str) -> Optional["CacheEntry"]:
        """Remove and return the entry for `batch_id`, or None if absent."""
        return self.cache.pop(batch_id, None)

    def set(self, entry: "CacheEntry"):
        """Store `entry` under its batch id; a None entry is ignored."""
        if entry is not None:
            self.cache[entry.batch_id] = entry

    def delete(self, batch_id: str):
        """Delete the entry for `batch_id`; raises KeyError if absent."""
        del self.cache[batch_id]

    def clear(self):
        """Drop every cached entry."""
        self.cache.clear()

    def __len__(self):
        # `len(dict)` directly; materializing the keys view was redundant.
        return len(self.cache)
|
|
@ -0,0 +1,30 @@
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from torch.distributed.launcher import launch_agent, LaunchConfig
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from bloom_inference.server import serve
|
||||||
|
|
||||||
|
|
||||||
|
def main(
    model_name: str,
    num_gpus: int = 1,
    shard_directory: Optional[Path] = None,
):
    """Launch the gRPC server: in-process for one GPU, elastic-launched otherwise."""
    if num_gpus == 1:
        # No sharding needed: serve directly in this process.
        serve(model_name, False, shard_directory)

    else:
        # Spawn one worker process per GPU on this single node; no restarts so
        # a crashed shard fails the whole launch instead of resyncing.
        config = LaunchConfig(
            min_nodes=1,
            max_nodes=1,
            nproc_per_node=num_gpus,
            rdzv_backend="c10d",
            max_restarts=0,
        )
        launch_agent(config, serve, [model_name, True, shard_directory])


if __name__ == "__main__":
    typer.run(main)
|
|
@ -0,0 +1,428 @@
|
||||||
|
import torch
|
||||||
|
import torch.distributed
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple, Optional, Dict
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
||||||
|
from transformers.modeling_utils import no_init_weights
|
||||||
|
|
||||||
|
from bloom_inference.cache import CacheEntry
|
||||||
|
from bloom_inference.pb import generate_pb2
|
||||||
|
from bloom_inference.shard_model import shard_model, match_suffix
|
||||||
|
from bloom_inference.utils import (
|
||||||
|
StoppingCriteria,
|
||||||
|
NextTokenChooser,
|
||||||
|
initialize_torch_distributed,
|
||||||
|
set_default_dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Batch:
    """A batch of concurrent generation requests and their decoding state.

    Attributes:
        batch_id: Unique id of this batch (mirrors protobuf `Batch.id`).
        request_ids: Ids of the individual requests in the batch.
        input_ids: Keyword arguments for the next model forward pass:
            `input_ids`, `attention_mask` and, after the first step,
            `past_key_values`.
        all_input_ids: Per-request tensor of all tokens seen so far.
        next_token_choosers: Per-request sampling/greedy token selection.
        stopping_criterias: Per-request stop conditions.
    """

    batch_id: int
    request_ids: List[int]
    input_ids: Dict[str, torch.Tensor]
    all_input_ids: List[torch.Tensor]
    next_token_choosers: List[NextTokenChooser]
    stopping_criterias: List[StoppingCriteria]

    @classmethod
    def from_batch_pb(
        cls, pb: generate_pb2.Batch, tokenizer: AutoTokenizer, device: torch.device
    ) -> "Batch":
        """Build a fresh batch (no past key values) from a protobuf `Batch`."""
        request_ids = []
        inputs = []
        next_token_choosers = []
        stopping_criterias = []

        # Parse batch
        for r in pb.requests:
            request_ids.append(r.id)
            inputs.append(r.inputs)
            next_token_choosers.append(
                NextTokenChooser(
                    temperature=r.parameters.temperature,
                    top_k=r.parameters.top_k,
                    top_p=r.parameters.top_p,
                    do_sample=r.parameters.do_sample,
                )
            )
            stopping_criterias.append(StoppingCriteria(max_new_tokens=r.max_new_tokens))

        # Tokenize all prompts together; left padding (set on the tokenizer)
        # aligns the latest tokens at the end of each row.
        input_ids = tokenizer(inputs, return_tensors="pt", padding=True).to(device)
        all_input_ids = input_ids["input_ids"].unsqueeze(-1)

        return cls(
            pb.id,
            request_ids,
            input_ids,
            all_input_ids,
            next_token_choosers,
            stopping_criterias,
        )

    @classmethod
    def from_cache_entry(cls, cache_entry: CacheEntry) -> "Batch":
        """Rebuild a batch directly from a single cache entry (no merging)."""
        return cls(
            cache_entry.batch_id,
            cache_entry.request_ids,
            cache_entry.input_ids,
            cache_entry.all_input_ids,
            cache_entry.next_token_choosers,
            cache_entry.stopping_criterias,
        )

    @classmethod
    def from_batch_cached_pb(cls, pb: generate_pb2.BatchCached, cache) -> "Batch":
        """Concatenate one or more cached batches into a single batch.

        Pops every referenced entry from `cache`, stacks their input ids,
        attention masks and past key values into tensors sized for
        `pb.total_batch_size` / `pb.max_sequence_length`, right-aligning each
        entry's tokens (left padding).

        Raises:
            ValueError: if any referenced batch id is missing from the cache.
        """
        if len(pb.batch_cached_ids) == 1:
            batch_id = pb.batch_cached_ids[0]
            cache_entry = cache.pop(batch_id)
            if cache_entry is None:
                # Fixed: the original raised `f"... {pb.batch_id} ..."`, but
                # `BatchCached` has no `batch_id` field, so the error path
                # itself crashed with AttributeError instead of ValueError.
                raise ValueError(f"Batch ID {batch_id} not found in cache")
            return cls.from_cache_entry(cache_entry)

        total_batch_size = pb.total_batch_size
        max_sequence_length = pb.max_sequence_length
        input_ids = {"input_ids": None, "attention_mask": None, "past_key_values": []}
        request_ids = []
        all_input_ids = []
        next_token_choosers = []
        stopping_criterias = []
        start_index = 0
        for i, batch_id in enumerate(pb.batch_cached_ids):
            cache_entry = cache.pop(batch_id)
            if cache_entry is None:
                raise ValueError(f"Batch ID {batch_id} not found in cache")
            request_ids.extend(cache_entry.request_ids)
            all_input_ids.extend(cache_entry.all_input_ids)
            next_token_choosers.extend(cache_entry.next_token_choosers)
            stopping_criterias.extend(cache_entry.stopping_criterias)

            batch_size = len(cache_entry.request_ids)
            end_index = start_index + batch_size
            sequence_length = max(len(entry) for entry in cache_entry.all_input_ids)

            # Allocate the merged input_ids lazily, using the first entry's
            # dtype/device.
            if input_ids["input_ids"] is None:
                input_ids["input_ids"] = torch.empty(
                    (total_batch_size, 1),
                    dtype=cache_entry.input_ids["input_ids"].dtype,
                    device=cache_entry.input_ids["input_ids"].device,
                )

            input_ids["input_ids"][start_index:end_index] = cache_entry.input_ids[
                "input_ids"
            ]

            if input_ids["attention_mask"] is None:
                input_ids["attention_mask"] = torch.zeros(
                    (total_batch_size, max_sequence_length),
                    dtype=cache_entry.input_ids["attention_mask"].dtype,
                    device=cache_entry.input_ids["attention_mask"].device,
                )

            # Right-align this entry's mask inside the (zero) padded row.
            input_ids["attention_mask"][
                start_index:end_index, -sequence_length:
            ] = cache_entry.input_ids["attention_mask"][:, -sequence_length:]

            for j, past in enumerate(cache_entry.input_ids["past_key_values"]):
                # TODO: this could be done without the views by using indices
                past_keys = past[0]
                past_values = past[1]

                # Cached tensors are flattened to (batch * heads, ...); recover
                # the batch dimension so rows can be copied per request.
                _, head_dim, padded_sequence_length = past_keys.shape

                past_keys = past_keys.view(
                    batch_size, -1, head_dim, padded_sequence_length
                )
                past_values = past_values.view(
                    batch_size, -1, padded_sequence_length, head_dim
                )
                num_heads = past_keys.shape[1]

                # First time we see layer j: allocate the padded target buffers
                # (past length is max_sequence_length - 1: everything but the
                # token about to be fed).
                if j == len(input_ids["past_key_values"]):
                    padded_past_keys = torch.zeros(
                        (
                            total_batch_size,
                            num_heads,
                            head_dim,
                            max_sequence_length - 1,
                        ),
                        dtype=past_keys.dtype,
                        device=past_keys.device,
                    )
                    padded_past_values = torch.zeros(
                        (
                            total_batch_size,
                            num_heads,
                            max_sequence_length - 1,
                            head_dim,
                        ),
                        dtype=past_values.dtype,
                        device=past_values.device,
                    )
                    input_ids["past_key_values"].append(
                        [padded_past_keys, padded_past_values]
                    )

                input_ids["past_key_values"][j][0][
                    start_index:end_index, :, :, -(sequence_length - 1):
                ] = past_keys[:, :, :, -(sequence_length - 1):]

                input_ids["past_key_values"][j][1][
                    start_index:end_index, :, -(sequence_length - 1):, :
                ] = past_values[:, :, -(sequence_length - 1):, :]

                # After the last entry is merged, flatten back to the
                # (batch * heads, ...) layout the model expects.
                if (i + 1) == len(pb.batch_cached_ids):
                    input_ids["past_key_values"][j][0] = input_ids["past_key_values"][
                        j
                    ][0].view(total_batch_size * num_heads, head_dim, -1)
                    input_ids["past_key_values"][j][1] = input_ids["past_key_values"][
                        j
                    ][1].view(total_batch_size * num_heads, -1, head_dim)

            start_index += batch_size

        # Sanity check: merged order must match the router's expectation.
        assert pb.request_ids == request_ids

        return cls(
            pb.id,
            request_ids,
            input_ids,
            all_input_ids,
            next_token_choosers,
            stopping_criterias,
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FinishedGeneration:
    """A completed generation: the originating request id and its decoded text."""

    # Id of the request this generation belongs to.
    request_id: str
    # Fully decoded generated text.
    output: str

    def to_pb(self) -> generate_pb2.FinishedGeneration:
        """Serialize this finished generation to its protobuf message."""
        message = generate_pb2.FinishedGeneration(
            id=self.request_id,
            output=self.output,
        )
        return message
|
||||||
|
|
||||||
|
|
||||||
|
class BLOOM:
    """Single-process wrapper around a BLOOM-style causal language model.

    Owns the tokenizer and model and implements the incremental decoding
    step (`generate_token`), which advances every request in a `Batch` by
    exactly one token.
    """

    def __init__(self, model_name: str):
        # Prefer GPU when available; model and batch tensors live on this device.
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # Left padding keeps the newest tokens aligned at the end of each row,
        # which is what the incremental-decoding slices below rely on.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
        self.model = (
            AutoModelForCausalLM.from_pretrained(model_name).eval().to(self.device)
        )
        # Used in `generate_token` to expand batch indices into
        # past-key-value row indices (past tensors are (batch * heads, ...)).
        self.num_heads = self.model.base_model.num_heads

    def forward(self, input_ids, attention_mask, past_key_values: Optional = None):
        # Model Forward; use_cache=True so past_key_values come back for the
        # next incremental step.
        return self.model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=True,
        )

    def generate_token(
        self, batch: Batch
    ) -> Tuple[List[FinishedGeneration], Optional[CacheEntry]]:
        """Run one forward pass and sample one token per request.

        Returns the requests that finished on this step and a `CacheEntry`
        holding the state of the still-running requests (None when every
        request finished).
        """
        with torch.no_grad():
            outputs = self.forward(**batch.input_ids)

        # List of indices to cache
        cache_indices = []
        # Matching row indices into the flattened (batch * heads, ...) past
        # key/value tensors.
        cache_past_indices = []

        # New input_ids for next forward; keep in cache
        cache_next_input_ids = []
        cache_all_input_ids = []

        # Finished requests
        finished_generations: List[FinishedGeneration] = []

        # Zipped iterator
        iterator = zip(
            batch.request_ids,
            outputs.logits,
            batch.next_token_choosers,
            batch.stopping_criterias,
            batch.all_input_ids,
        )

        # For each member of the batch
        for i, (
            request_id,
            logits,
            next_token_chooser,
            stopping_criteria,
            all_tokens,
        ) in enumerate(iterator):
            # Select next token
            next_token = next_token_chooser(all_tokens, logits.unsqueeze(0)[:, -1])

            # Append next token to all tokens
            all_tokens = torch.cat([all_tokens, next_token])

            # Evaluate stopping criteria
            if stopping_criteria(all_tokens):
                # Decode all tokens
                output = self.tokenizer.decode(
                    all_tokens.squeeze(-1), skip_special_tokens=True
                )
                # Add to the list of finished generations with the original request id
                finished_generations.append(FinishedGeneration(request_id, output))
            # must be added to the cache
            else:
                cache_indices.append(i)
                cache_past_indices.extend([j for j in range(i * self.num_heads, (i + 1) * self.num_heads)])
                cache_next_input_ids.append(next_token)
                cache_all_input_ids.append(all_tokens)

        # No cache is needed, we finished all generations in the batch
        if not cache_indices:
            return finished_generations, None

        # If we finished at least one generation
        cache_input_ids = {"input_ids": torch.cat(cache_next_input_ids, dim=0)}
        if finished_generations:
            # Apply indices to attention mask, past key values and other items that need to be cached
            cache_input_ids["attention_mask"] = batch.input_ids["attention_mask"][
                cache_indices
            ]
            cache_input_ids["past_key_values"] = [
                (keys[cache_past_indices], values[cache_past_indices])
                for keys, values in outputs["past_key_values"]
            ]
            cache_request_ids = [batch.request_ids[i] for i in cache_indices]
            cache_next_token_choosers = [
                batch.next_token_choosers[i] for i in cache_indices
            ]
            cache_stopping_criterias = [
                batch.stopping_criterias[i] for i in cache_indices
            ]
        else:
            # Nothing finished: carry the whole batch state over unchanged.
            cache_input_ids["attention_mask"] = batch.input_ids["attention_mask"]
            cache_input_ids["past_key_values"] = outputs["past_key_values"]
            cache_request_ids = batch.request_ids
            cache_next_token_choosers = batch.next_token_choosers
            cache_stopping_criterias = batch.stopping_criterias

        # Update attention_mask with padding as we added a new token to input_ids
        # NOTE(review): torch.ones defaults to float32; presumably torch.cat
        # promotes/matches the mask's dtype here — confirm dtypes line up.
        cache_input_ids["attention_mask"] = torch.cat(
            [
                cache_input_ids["attention_mask"],
                torch.ones((cache_input_ids["attention_mask"].shape[0], 1)).to(
                    cache_input_ids["attention_mask"].device
                ),
            ],
            dim=1,
        )

        cache_entry = CacheEntry(
            batch.batch_id,
            cache_request_ids,
            cache_input_ids,
            cache_all_input_ids,
            cache_next_token_choosers,
            cache_stopping_criterias,
        )
        return finished_generations, cache_entry
|
||||||
|
|
||||||
|
|
||||||
|
class BLOOMSharded(BLOOM):
    """Tensor-parallel BLOOM: one process per GPU, each holding one shard.

    Reuses `BLOOM.generate_token`; overrides `__init__` (distributed setup
    and shard loading) and `forward` (all-gather of the vocab-sharded logits).
    """

    def __init__(self, model_name: str, shard_directory: Path):
        # Deliberately skip BLOOM.__init__ (it would load the full, unsharded
        # model); this only runs object.__init__.
        super(BLOOM, self).__init__()
        self.process_group, self.rank, self.world_size = initialize_torch_distributed()
        self.master = self.rank == 0
        if torch.cuda.is_available():
            # One GPU per rank.
            self.device = torch.device(f"cuda:{self.rank}")
            dtype = torch.bfloat16
        else:
            self.device = torch.device("cpu")
            dtype = torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

        # shard state_dict
        if self.master:
            # TODO @thomasw21 do some caching
            shard_state_dict_paths = shard_model(
                model_name, shard_directory, tp_world_size=self.world_size, dtype=dtype
            )
            shard_state_dict_paths = [
                str(path.absolute()) for path in shard_state_dict_paths
            ]
        else:
            shard_state_dict_paths = [None] * self.world_size

        # Only rank 0 sharded the model; broadcast the shard paths so every
        # rank can load its own file.
        torch.distributed.broadcast_object_list(
            shard_state_dict_paths, src=0, group=self.process_group
        )
        shard_state_dict_path = shard_state_dict_paths[self.rank]

        config = AutoConfig.from_pretrained(
            model_name, slow_but_exact=False, tp_parallel=True
        )
        # NOTE(review): hard-coded pad token id (3) — presumably BLOOM's <pad>;
        # confirm against the tokenizer's pad_token_id.
        config.pad_token_id = 3

        # The flag below controls whether to allow TF32 on matmul. This flag defaults to False
        # in PyTorch 1.12 and later.
        torch.backends.cuda.matmul.allow_tf32 = True

        # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
        torch.backends.cudnn.allow_tf32 = True

        # Build the (tensor-parallel) model skeleton without materializing
        # pretrained weights; real weights come from the shard file below.
        with set_default_dtype(dtype):
            with no_init_weights():
                # we can probably set the device to `meta` here?
                model = AutoModelForCausalLM.from_config(config).to(dtype)

        torch.distributed.barrier(group=self.process_group)
        # print_rank_0(f"Initialized model")
        state_dict = torch.load(shard_state_dict_path)
        # TODO @thomasw21: HACK in order to transpose all weight prior
        for key in state_dict.keys():
            do_transpose = False
            if not match_suffix(key, "weight"):
                continue

            for potential_suffix in [
                "self_attention.query_key_value.weight",
                "self_attention.dense.weight",
                "dense_h_to_4h.weight",
                "dense_4h_to_h.weight",
            ]:
                if match_suffix(key, potential_suffix):
                    do_transpose = True

            if do_transpose:
                state_dict[key] = state_dict[key].transpose(1, 0).contiguous()

        model.load_state_dict(state_dict)
        self.model = model.to(self.device).eval()
        # Heads held by this shard: total heads split across the TP group.
        self.num_heads = config.n_head // self.process_group.size()
        torch.distributed.barrier(group=self.process_group)

    def forward(self, input_ids, attention_mask, past_key_values: Optional = None):
        """Forward pass that all-gathers the vocabulary-sharded logits.

        Each rank computes logits for its own vocab shard; gather and
        concatenate along the vocab dimension so sampling sees the full
        distribution. Only the last position's logits are kept.
        """
        outputs = self.model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=True,
        )

        # Keep only the final-position logits for this shard.
        logits_shard = outputs.logits[:, -1, :].contiguous()

        batch_size, vocab_shard_size = logits_shard.shape
        vocab_size = self.world_size * vocab_shard_size
        logits = [torch.empty_like(logits_shard) for _ in range(self.world_size)]
        torch.distributed.all_gather(logits, logits_shard, group=self.process_group)
        # Restore a (batch, 1, vocab) shape so callers can index [:, -1].
        logits = torch.cat(logits, dim=1).view(batch_size, 1, vocab_size)

        outputs.logits = logits
        return outputs
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,43 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||||
|
# source: generate.proto
|
||||||
|
"""Generated protocol buffer code."""
|
||||||
|
from google.protobuf.internal import builder as _builder
|
||||||
|
from google.protobuf import descriptor as _descriptor
|
||||||
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||||
|
from google.protobuf import symbol_database as _symbol_database
|
||||||
|
# @@protoc_insertion_point(imports)
|
||||||
|
|
||||||
|
_sym_db = _symbol_database.Default()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0egenerate.proto\x12\x0bgenerate.v1\"(\n\x18ServiceDiscoveryResponse\x12\x0c\n\x04urls\x18\x01 \x03(\t\"^\n\x16LogitsWarperParameters\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_k\x18\x02 \x01(\r\x12\r\n\x05top_p\x18\x03 \x01(\x02\x12\x11\n\tdo_sample\x18\x04 \x01(\x08\"v\n\x07Request\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06inputs\x18\x02 \x01(\t\x12\x37\n\nparameters\x18\x03 \x01(\x0b\x32#.generate.v1.LogitsWarperParameters\x12\x16\n\x0emax_new_tokens\x18\x04 \x01(\r\";\n\x05\x42\x61tch\x12\n\n\x02id\x18\x01 \x01(\x04\x12&\n\x08requests\x18\x02 \x03(\x0b\x32\x14.generate.v1.Request\"\x7f\n\x0b\x42\x61tchCached\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x13\n\x0brequest_ids\x18\x02 \x03(\x04\x12\x18\n\x10\x62\x61tch_cached_ids\x18\x03 \x03(\x04\x12\x18\n\x10total_batch_size\x18\x04 \x01(\r\x12\x1b\n\x13max_sequence_length\x18\x05 \x01(\r\"0\n\x12\x46inishedGeneration\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06output\x18\x02 \x01(\t\"F\n\nCacheEntry\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x13\n\x0brequest_ids\x18\x02 \x03(\x04\x12\x17\n\x0fsequence_length\x18\x03 \x01(\r\"\x80\x01\n\x08Response\x12\x31\n\x08\x66inished\x18\x01 \x03(\x0b\x32\x1f.generate.v1.FinishedGeneration\x12\x31\n\x0b\x63\x61\x63he_entry\x18\x02 \x01(\x0b\x32\x17.generate.v1.CacheEntryH\x00\x88\x01\x01\x42\x0e\n\x0c_cache_entry\"\x07\n\x05\x45mpty2\x94\x02\n\x0eTextGeneration\x12O\n\x10ServiceDiscovery\x12\x12.generate.v1.Empty\x1a%.generate.v1.ServiceDiscoveryResponse\"\x00\x12\x34\n\nClearCache\x12\x12.generate.v1.Empty\x1a\x12.generate.v1.Empty\x12\x35\n\x08Generate\x12\x12.generate.v1.Batch\x1a\x15.generate.v1.Response\x12\x44\n\x11GenerateWithCache\x12\x18.generate.v1.BatchCached\x1a\x15.generate.v1.Responseb\x06proto3')
|
||||||
|
|
||||||
|
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
|
||||||
|
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'generate_pb2', globals())
|
||||||
|
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||||
|
|
||||||
|
DESCRIPTOR._options = None
|
||||||
|
_SERVICEDISCOVERYRESPONSE._serialized_start=31
|
||||||
|
_SERVICEDISCOVERYRESPONSE._serialized_end=71
|
||||||
|
_LOGITSWARPERPARAMETERS._serialized_start=73
|
||||||
|
_LOGITSWARPERPARAMETERS._serialized_end=167
|
||||||
|
_REQUEST._serialized_start=169
|
||||||
|
_REQUEST._serialized_end=287
|
||||||
|
_BATCH._serialized_start=289
|
||||||
|
_BATCH._serialized_end=348
|
||||||
|
_BATCHCACHED._serialized_start=350
|
||||||
|
_BATCHCACHED._serialized_end=477
|
||||||
|
_FINISHEDGENERATION._serialized_start=479
|
||||||
|
_FINISHEDGENERATION._serialized_end=527
|
||||||
|
_CACHEENTRY._serialized_start=529
|
||||||
|
_CACHEENTRY._serialized_end=599
|
||||||
|
_RESPONSE._serialized_start=602
|
||||||
|
_RESPONSE._serialized_end=730
|
||||||
|
_EMPTY._serialized_start=732
|
||||||
|
_EMPTY._serialized_end=739
|
||||||
|
_TEXTGENERATION._serialized_start=742
|
||||||
|
_TEXTGENERATION._serialized_end=1018
|
||||||
|
# @@protoc_insertion_point(module_scope)
|
|
@ -0,0 +1,43 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||||
|
# source: generate.proto
|
||||||
|
"""Generated protocol buffer code."""
|
||||||
|
from google.protobuf.internal import builder as _builder
|
||||||
|
from google.protobuf import descriptor as _descriptor
|
||||||
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||||
|
from google.protobuf import symbol_database as _symbol_database
|
||||||
|
# @@protoc_insertion_point(imports)
|
||||||
|
|
||||||
|
_sym_db = _symbol_database.Default()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0egenerate.proto\x12\x0bgenerate.v1\"(\n\x18ServiceDiscoveryResponse\x12\x0c\n\x04urls\x18\x01 \x03(\t\"^\n\x16LogitsWarperParameters\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_k\x18\x02 \x01(\r\x12\r\n\x05top_p\x18\x03 \x01(\x02\x12\x11\n\tdo_sample\x18\x04 \x01(\x08\"v\n\x07Request\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06inputs\x18\x02 \x01(\t\x12\x37\n\nparameters\x18\x03 \x01(\x0b\x32#.generate.v1.LogitsWarperParameters\x12\x16\n\x0emax_new_tokens\x18\x04 \x01(\r\";\n\x05\x42\x61tch\x12\n\n\x02id\x18\x01 \x01(\x04\x12&\n\x08requests\x18\x02 \x03(\x0b\x32\x14.generate.v1.Request\"\x7f\n\x0b\x42\x61tchCached\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x13\n\x0brequest_ids\x18\x02 \x03(\x04\x12\x18\n\x10\x62\x61tch_cached_ids\x18\x03 \x03(\x04\x12\x18\n\x10total_batch_size\x18\x04 \x01(\r\x12\x1b\n\x13max_sequence_length\x18\x05 \x01(\r\"0\n\x12\x46inishedGeneration\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06output\x18\x02 \x01(\t\"F\n\nCacheEntry\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x13\n\x0brequest_ids\x18\x02 \x03(\x04\x12\x17\n\x0fsequence_length\x18\x03 \x01(\r\"\x80\x01\n\x08Response\x12\x31\n\x08\x66inished\x18\x01 \x03(\x0b\x32\x1f.generate.v1.FinishedGeneration\x12\x31\n\x0b\x63\x61\x63he_entry\x18\x02 \x01(\x0b\x32\x17.generate.v1.CacheEntryH\x00\x88\x01\x01\x42\x0e\n\x0c_cache_entry\"\x07\n\x05\x45mpty2\x94\x02\n\x0eTextGeneration\x12O\n\x10ServiceDiscovery\x12\x12.generate.v1.Empty\x1a%.generate.v1.ServiceDiscoveryResponse\"\x00\x12\x34\n\nClearCache\x12\x12.generate.v1.Empty\x1a\x12.generate.v1.Empty\x12\x35\n\x08Generate\x12\x12.generate.v1.Batch\x1a\x15.generate.v1.Response\x12\x44\n\x11GenerateWithCache\x12\x18.generate.v1.BatchCached\x1a\x15.generate.v1.Responseb\x06proto3')
|
||||||
|
|
||||||
|
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
|
||||||
|
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'generate_pb2', globals())
|
||||||
|
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||||
|
|
||||||
|
DESCRIPTOR._options = None
|
||||||
|
_SERVICEDISCOVERYRESPONSE._serialized_start=31
|
||||||
|
_SERVICEDISCOVERYRESPONSE._serialized_end=71
|
||||||
|
_LOGITSWARPERPARAMETERS._serialized_start=73
|
||||||
|
_LOGITSWARPERPARAMETERS._serialized_end=167
|
||||||
|
_REQUEST._serialized_start=169
|
||||||
|
_REQUEST._serialized_end=287
|
||||||
|
_BATCH._serialized_start=289
|
||||||
|
_BATCH._serialized_end=348
|
||||||
|
_BATCHCACHED._serialized_start=350
|
||||||
|
_BATCHCACHED._serialized_end=477
|
||||||
|
_FINISHEDGENERATION._serialized_start=479
|
||||||
|
_FINISHEDGENERATION._serialized_end=527
|
||||||
|
_CACHEENTRY._serialized_start=529
|
||||||
|
_CACHEENTRY._serialized_end=599
|
||||||
|
_RESPONSE._serialized_start=602
|
||||||
|
_RESPONSE._serialized_end=730
|
||||||
|
_EMPTY._serialized_start=732
|
||||||
|
_EMPTY._serialized_end=739
|
||||||
|
_TEXTGENERATION._serialized_start=742
|
||||||
|
_TEXTGENERATION._serialized_end=1018
|
||||||
|
# @@protoc_insertion_point(module_scope)
|
|
@ -0,0 +1,169 @@
|
||||||
|
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
||||||
|
"""Client and server classes corresponding to protobuf-defined services."""
|
||||||
|
import grpc
|
||||||
|
|
||||||
|
from . import generate_pb2 as generate__pb2
|
||||||
|
|
||||||
|
|
||||||
|
class TextGenerationStub(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
def __init__(self, channel):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channel: A grpc.Channel.
|
||||||
|
"""
|
||||||
|
self.ServiceDiscovery = channel.unary_unary(
|
||||||
|
'/generate.v1.TextGeneration/ServiceDiscovery',
|
||||||
|
request_serializer=generate__pb2.Empty.SerializeToString,
|
||||||
|
response_deserializer=generate__pb2.ServiceDiscoveryResponse.FromString,
|
||||||
|
)
|
||||||
|
self.ClearCache = channel.unary_unary(
|
||||||
|
'/generate.v1.TextGeneration/ClearCache',
|
||||||
|
request_serializer=generate__pb2.Empty.SerializeToString,
|
||||||
|
response_deserializer=generate__pb2.Empty.FromString,
|
||||||
|
)
|
||||||
|
self.Generate = channel.unary_unary(
|
||||||
|
'/generate.v1.TextGeneration/Generate',
|
||||||
|
request_serializer=generate__pb2.Batch.SerializeToString,
|
||||||
|
response_deserializer=generate__pb2.Response.FromString,
|
||||||
|
)
|
||||||
|
self.GenerateWithCache = channel.unary_unary(
|
||||||
|
'/generate.v1.TextGeneration/GenerateWithCache',
|
||||||
|
request_serializer=generate__pb2.BatchCached.SerializeToString,
|
||||||
|
response_deserializer=generate__pb2.Response.FromString,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TextGenerationServicer(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
def ServiceDiscovery(self, request, context):
|
||||||
|
"""/ Service discovery
|
||||||
|
"""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def ClearCache(self, request, context):
|
||||||
|
"""/ Empties batch cache
|
||||||
|
"""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def Generate(self, request, context):
|
||||||
|
"""/ Generate tokens for a batch without cache
|
||||||
|
"""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def GenerateWithCache(self, request, context):
|
||||||
|
"""/ Generate tokens for a batch with cache
|
||||||
|
"""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
|
||||||
|
def add_TextGenerationServicer_to_server(servicer, server):
|
||||||
|
rpc_method_handlers = {
|
||||||
|
'ServiceDiscovery': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.ServiceDiscovery,
|
||||||
|
request_deserializer=generate__pb2.Empty.FromString,
|
||||||
|
response_serializer=generate__pb2.ServiceDiscoveryResponse.SerializeToString,
|
||||||
|
),
|
||||||
|
'ClearCache': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.ClearCache,
|
||||||
|
request_deserializer=generate__pb2.Empty.FromString,
|
||||||
|
response_serializer=generate__pb2.Empty.SerializeToString,
|
||||||
|
),
|
||||||
|
'Generate': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.Generate,
|
||||||
|
request_deserializer=generate__pb2.Batch.FromString,
|
||||||
|
response_serializer=generate__pb2.Response.SerializeToString,
|
||||||
|
),
|
||||||
|
'GenerateWithCache': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.GenerateWithCache,
|
||||||
|
request_deserializer=generate__pb2.BatchCached.FromString,
|
||||||
|
response_serializer=generate__pb2.Response.SerializeToString,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
generic_handler = grpc.method_handlers_generic_handler(
|
||||||
|
'generate.v1.TextGeneration', rpc_method_handlers)
|
||||||
|
server.add_generic_rpc_handlers((generic_handler,))
|
||||||
|
|
||||||
|
|
||||||
|
# This class is part of an EXPERIMENTAL API.
|
||||||
|
class TextGeneration(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def ServiceDiscovery(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/ServiceDiscovery',
|
||||||
|
generate__pb2.Empty.SerializeToString,
|
||||||
|
generate__pb2.ServiceDiscoveryResponse.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def ClearCache(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/ClearCache',
|
||||||
|
generate__pb2.Empty.SerializeToString,
|
||||||
|
generate__pb2.Empty.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def Generate(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/Generate',
|
||||||
|
generate__pb2.Batch.SerializeToString,
|
||||||
|
generate__pb2.Response.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def GenerateWithCache(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/GenerateWithCache',
|
||||||
|
generate__pb2.BatchCached.SerializeToString,
|
||||||
|
generate__pb2.Response.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
|
@ -0,0 +1,169 @@
|
||||||
|
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
||||||
|
"""Client and server classes corresponding to protobuf-defined services."""
|
||||||
|
import grpc
|
||||||
|
|
||||||
|
import generate_pb2 as generate__pb2
|
||||||
|
|
||||||
|
|
||||||
|
class TextGenerationStub(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
def __init__(self, channel):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channel: A grpc.Channel.
|
||||||
|
"""
|
||||||
|
self.ServiceDiscovery = channel.unary_unary(
|
||||||
|
'/generate.v1.TextGeneration/ServiceDiscovery',
|
||||||
|
request_serializer=generate__pb2.Empty.SerializeToString,
|
||||||
|
response_deserializer=generate__pb2.ServiceDiscoveryResponse.FromString,
|
||||||
|
)
|
||||||
|
self.ClearCache = channel.unary_unary(
|
||||||
|
'/generate.v1.TextGeneration/ClearCache',
|
||||||
|
request_serializer=generate__pb2.Empty.SerializeToString,
|
||||||
|
response_deserializer=generate__pb2.Empty.FromString,
|
||||||
|
)
|
||||||
|
self.Generate = channel.unary_unary(
|
||||||
|
'/generate.v1.TextGeneration/Generate',
|
||||||
|
request_serializer=generate__pb2.Batch.SerializeToString,
|
||||||
|
response_deserializer=generate__pb2.Response.FromString,
|
||||||
|
)
|
||||||
|
self.GenerateWithCache = channel.unary_unary(
|
||||||
|
'/generate.v1.TextGeneration/GenerateWithCache',
|
||||||
|
request_serializer=generate__pb2.BatchCached.SerializeToString,
|
||||||
|
response_deserializer=generate__pb2.Response.FromString,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TextGenerationServicer(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
def ServiceDiscovery(self, request, context):
|
||||||
|
"""/ Service discovery
|
||||||
|
"""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def ClearCache(self, request, context):
|
||||||
|
"""/ Empties batch cache
|
||||||
|
"""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def Generate(self, request, context):
|
||||||
|
"""/ Generate tokens for a batch without cache
|
||||||
|
"""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def GenerateWithCache(self, request, context):
|
||||||
|
"""/ Generate tokens for a batch with cache
|
||||||
|
"""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
|
||||||
|
def add_TextGenerationServicer_to_server(servicer, server):
|
||||||
|
rpc_method_handlers = {
|
||||||
|
'ServiceDiscovery': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.ServiceDiscovery,
|
||||||
|
request_deserializer=generate__pb2.Empty.FromString,
|
||||||
|
response_serializer=generate__pb2.ServiceDiscoveryResponse.SerializeToString,
|
||||||
|
),
|
||||||
|
'ClearCache': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.ClearCache,
|
||||||
|
request_deserializer=generate__pb2.Empty.FromString,
|
||||||
|
response_serializer=generate__pb2.Empty.SerializeToString,
|
||||||
|
),
|
||||||
|
'Generate': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.Generate,
|
||||||
|
request_deserializer=generate__pb2.Batch.FromString,
|
||||||
|
response_serializer=generate__pb2.Response.SerializeToString,
|
||||||
|
),
|
||||||
|
'GenerateWithCache': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.GenerateWithCache,
|
||||||
|
request_deserializer=generate__pb2.BatchCached.FromString,
|
||||||
|
response_serializer=generate__pb2.Response.SerializeToString,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
generic_handler = grpc.method_handlers_generic_handler(
|
||||||
|
'generate.v1.TextGeneration', rpc_method_handlers)
|
||||||
|
server.add_generic_rpc_handlers((generic_handler,))
|
||||||
|
|
||||||
|
|
||||||
|
# This class is part of an EXPERIMENTAL API.
|
||||||
|
class TextGeneration(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def ServiceDiscovery(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/ServiceDiscovery',
|
||||||
|
generate__pb2.Empty.SerializeToString,
|
||||||
|
generate__pb2.ServiceDiscoveryResponse.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def ClearCache(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/ClearCache',
|
||||||
|
generate__pb2.Empty.SerializeToString,
|
||||||
|
generate__pb2.Empty.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def Generate(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/Generate',
|
||||||
|
generate__pb2.Batch.SerializeToString,
|
||||||
|
generate__pb2.Response.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def GenerateWithCache(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/GenerateWithCache',
|
||||||
|
generate__pb2.BatchCached.SerializeToString,
|
||||||
|
generate__pb2.Response.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
|
@ -0,0 +1,124 @@
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
MODEL_NAME = "bigscience/bloom"
|
||||||
|
|
||||||
|
|
||||||
|
def match_suffix(text, suffix):
    """Return True if `text` ends with `suffix`.

    BUGFIX: the previous slicing implementation (`text[-len(suffix):] == suffix`)
    returned False for an empty suffix on non-empty text, because `text[-0:]`
    is the whole string. `str.endswith` handles that edge case correctly.
    """
    return text.endswith(suffix)
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_weights(hub_path: Path, save_path: Path, tp_world_size: int):
    """Shard a downloaded BLOOM checkpoint into one file per tensor-parallel rank.

    Reads every `*.bin` state-dict file under `hub_path` and splits each tensor
    according to BLOOM's tensor-parallel layout:
      - column-parallel params (QKV, dense_h_to_4h, embeddings, lm_head): split on dim 0
      - row-parallel weights (attention dense, dense_4h_to_h): split on dim 1
      - row-parallel biases: kept on rank 0 only, zeroed on the other ranks
      - everything else: duplicated on every rank

    Args:
        hub_path: directory containing the HuggingFace `*.bin` checkpoint files.
        save_path: directory where the per-rank `.pty` files are written.
        tp_world_size: number of tensor-parallel ranks.

    Returns:
        The list of per-rank save paths (also when everything already existed).
    """
    save_paths = [
        save_path / f"{MODEL_NAME}_tp-rank-{tp_rank}-of-{tp_world_size}.pty"
        for tp_rank in range(tp_world_size)
    ]

    # Nothing to do if every shard file is already on disk.
    if all(p.exists() for p in save_paths):
        print("Weights are already prepared")
        # BUGFIX: this path used to return None while the normal path returned
        # the list of paths; callers now always get the same type back.
        return save_paths

    shards_state_dicts = [{} for _ in range(tp_world_size)]

    for weight_path in tqdm(hub_path.glob("*.bin")):
        state_dict = torch.load(weight_path, map_location="cpu")

        keys = list(state_dict.keys())
        for state_name in keys:
            state = state_dict[state_name]
            if any(
                match_suffix(state_name, candidate)
                for candidate in [
                    "self_attention.query_key_value.weight",
                    "self_attention.query_key_value.bias",
                    "mlp.dense_h_to_4h.weight",
                    "mlp.dense_h_to_4h.bias",
                    "word_embeddings.weight",
                    "lm_head.weight",
                ]
            ):
                # Column-parallel: split the output dimension across ranks.
                output_size = state.shape[0]
                assert output_size % tp_world_size == 0
                block_size = output_size // tp_world_size
                sharded_weights = torch.split(state, block_size, dim=0)
                assert len(sharded_weights) == tp_world_size
                for tp_rank, shard in enumerate(sharded_weights):
                    assert shard.shape[0] == block_size
                    # lm_head lives outside the `transformer` submodule, so it
                    # keeps its original key; everything else gets prefixed.
                    if match_suffix(state_name, "lm_head.weight"):
                        shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
                    else:
                        shards_state_dicts[tp_rank][
                            "transformer." + state_name
                        ] = shard.detach().clone()
            elif any(
                match_suffix(state_name, candidate)
                for candidate in [
                    "self_attention.dense.weight",
                    "mlp.dense_4h_to_h.weight",
                    "lm_head.weight",
                ]
            ):
                # Row-parallel: split the input dimension across ranks.
                input_size = state.shape[1]
                assert input_size % tp_world_size == 0
                block_size = input_size // tp_world_size
                sharded_weights = torch.split(state, block_size, dim=1)
                assert len(sharded_weights) == tp_world_size
                for tp_rank, shard in enumerate(sharded_weights):
                    assert shard.shape[1] == block_size
                    if match_suffix(state_name, "lm_head.weight"):
                        shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
                    else:
                        shards_state_dicts[tp_rank][
                            "transformer." + state_name
                        ] = shard.detach().clone()
            elif any(
                match_suffix(state_name, candidate)
                for candidate in [
                    "self_attention.dense.bias",
                    "mlp.dense_4h_to_h.bias",
                ]
            ):
                # Row-parallel bias: only rank 0 keeps the real values so the
                # bias is applied exactly once after reduction; other ranks
                # hold zeros of the same shape.
                shards_state_dicts[0][
                    "transformer." + state_name
                ] = state.detach().clone()
                for tp_rank in range(1, tp_world_size):
                    shards_state_dicts[tp_rank][
                        "transformer." + state_name
                    ] = torch.zeros_like(state)
            else:
                # We duplicate parameters across tp ranks.
                for tp_rank in range(tp_world_size):
                    shards_state_dicts[tp_rank][
                        "transformer." + state_name
                    ] = state.detach().clone()

            del state_dict[state_name]  # free memory as we go
            del state  # delete tensor

    # Persist one state dict per rank.
    # BUGFIX: the old loop appended each `save_path` back onto `save_paths`
    # while zipping over it, which doubled every entry in the returned list.
    for shard_save_path, shard_state_dict in zip(save_paths, shards_state_dicts):
        shard_save_path.parent.mkdir(parents=True, exist_ok=True)
        if shard_save_path.exists():
            print(f"Skipping {shard_save_path} as it already exists")
        else:
            torch.save(shard_state_dict, shard_save_path)

    return save_paths
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    from argparse import ArgumentParser

    # CLI entry point: shard a downloaded checkpoint into per-rank files.
    parser = ArgumentParser()

    # Directory holding the HuggingFace `*.bin` checkpoint files.
    parser.add_argument("--hub-path", required=True, type=str)
    # Output directory for the per-rank shard files.
    parser.add_argument("--save-path", required=True, type=str)
    # Number of tensor-parallel ranks to shard for.
    parser.add_argument("--world-size", required=True, type=int)
    args = parser.parse_args()

    prepare_weights(Path(args.hub_path), Path(args.save_path), args.world_size)
|
|
@ -0,0 +1,91 @@
|
||||||
|
import asyncio
|
||||||
|
from grpc import aio
|
||||||
|
|
||||||
|
from grpc_reflection.v1alpha import reflection
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, List
|
||||||
|
|
||||||
|
from bloom_inference.cache import Cache
|
||||||
|
from bloom_inference.model import BLOOM, Batch, BLOOMSharded
|
||||||
|
from bloom_inference.pb import generate_pb2_grpc, generate_pb2
|
||||||
|
|
||||||
|
|
||||||
|
class TextGeneration(generate_pb2_grpc.TextGenerationServicer):
    """Async gRPC servicer for the generate.v1.TextGeneration service.

    Wraps a BLOOM model and a Cache so that generation state (past key/values)
    can be carried between an initial Generate call and follow-up
    GenerateWithCache calls.
    """

    def __init__(self, model: BLOOM, cache: Cache, server_urls: List[str]):
        # Cache holding batch state between successive generation steps.
        self.cache = cache
        self.model = model
        # URLs of every shard server; advertised via ServiceDiscovery.
        self.server_urls = server_urls

    async def ServiceDiscovery(self, request, context):
        # Returns the full list of shard URLs so a router can reach every rank.
        return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)

    async def ClearCache(self, request, context):
        # Drops all cached batch state.
        self.cache.clear()
        return generate_pb2.Empty()

    async def Generate(self, request, context):
        # First generation step for a fresh batch (no cached past key/values).
        batch = Batch.from_batch_pb(request, self.model.tokenizer, self.model.device)
        finished_generations, cache_entry = self.model.generate_token(batch)
        # Store the (possibly None) cache entry for follow-up GenerateWithCache calls.
        self.cache.set(cache_entry)

        return generate_pb2.Response(
            finished=[
                finished_generation.to_pb()
                for finished_generation in finished_generations
            ],
            cache_entry=cache_entry.to_pb() if cache_entry else None,
        )

    async def GenerateWithCache(self, request, context):
        # Follow-up generation step reusing state stored in the cache.
        batch = Batch.from_batch_cached_pb(request, self.cache)
        finished_generations, cache_entry = self.model.generate_token(batch)
        self.cache.set(cache_entry)

        return generate_pb2.Response(
            finished=[
                finished_generation.to_pb()
                for finished_generation in finished_generations
            ],
            cache_entry=cache_entry.to_pb() if cache_entry else None,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def serve(model_name, sharded, shard_directory):
    """Start a (blocking) gRPC text-generation server for `model_name`.

    When `sharded` is True, this process loads its tensor-parallel shard
    (BLOOMSharded) and advertises one unix socket per rank; otherwise the full
    model is loaded and a single socket is served.
    """
    async def serve_inner(
        model_name: str,
        sharded: bool = False,
        shard_directory: Optional[Path] = None,
    ):
        unix_socket_template = "unix:///tmp/bloom-inference-{}"
        if sharded:
            if shard_directory is None:
                raise ValueError("shard_directory must be set when sharded is True")
            model = BLOOMSharded(model_name, shard_directory)
            # One unix socket per tensor-parallel rank; this process binds only
            # its own rank's socket but advertises all of them.
            server_urls = [
                unix_socket_template.format(rank) for rank in range(model.world_size)
            ]
            local_url = unix_socket_template.format(model.rank)
        else:
            model = BLOOM(model_name)
            local_url = unix_socket_template.format(0)
            server_urls = [local_url]

        server = aio.server()
        generate_pb2_grpc.add_TextGenerationServicer_to_server(
            TextGeneration(model, Cache(), server_urls), server
        )
        # Enable server reflection so generic clients (e.g. grpcurl) can
        # introspect the service.
        SERVICE_NAMES = (
            generate_pb2.DESCRIPTOR.services_by_name["TextGeneration"].full_name,
            reflection.SERVICE_NAME,
        )
        reflection.enable_server_reflection(SERVICE_NAMES, server)
        server.add_insecure_port(local_url)
        await server.start()
        print("Server started at {}".format(local_url))
        # Block forever (until the server is shut down externally).
        await server.wait_for_termination()

    asyncio.run(serve_inner(model_name, sharded, shard_directory))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # NOTE(review): hard-coded debug invocation (560m model, sharded, fixed
    # shard directory) — consider wiring this to a proper CLI.
    serve("bigscience/bloom-560m", True, Path("/tmp/models"))
|
|
@ -0,0 +1,102 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
from transformers import AutoModelForCausalLM
|
||||||
|
|
||||||
|
|
||||||
|
def match_suffix(text, suffix):
    """Return True if `text` ends with `suffix`.

    BUGFIX: the previous slicing implementation (`text[-len(suffix):] == suffix`)
    returned False for an empty suffix on non-empty text, because `text[-0:]`
    is the whole string. `str.endswith` handles that edge case correctly.
    """
    return text.endswith(suffix)
|
||||||
|
|
||||||
|
|
||||||
|
def shard_model(model_name: str, path: Path, tp_world_size: int, dtype: torch.dtype):
    """BLOOM specific sharding mechanism.

    Loads the full model with `AutoModelForCausalLM`, splits every tensor
    according to BLOOM's tensor-parallel layout and writes one `.pty` state
    dict per rank under `path`:
      - column-parallel params (QKV, dense_h_to_4h, embeddings, lm_head): split on dim 0
      - row-parallel weights (attention dense, dense_4h_to_h): split on dim 1
      - row-parallel biases: kept on rank 0 only, zeroed on the other ranks
      - everything else: duplicated on every rank

    Returns:
        The list of per-rank save paths (also when they already existed).
    """
    save_paths = [
        path / f"{model_name}_tp-rank-{tp_rank}-of-{tp_world_size}.pty"
        for tp_rank in range(tp_world_size)
    ]
    # Fast path: all shard files already written by a previous run.
    if all(p.exists() for p in save_paths):
        print("Loading already cached values")
        return save_paths

    model: nn.Module = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=dtype, local_files_only=True
    )

    shards_state_dicts = [{} for _ in range(tp_world_size)]
    state_dict = model.state_dict()
    keys = list(state_dict.keys())
    for state_name in keys:
        state = state_dict[state_name]
        if any(
            match_suffix(state_name, candidate)
            for candidate in [
                "self_attention.query_key_value.weight",
                "self_attention.query_key_value.bias",
                "mlp.dense_h_to_4h.weight",
                "mlp.dense_h_to_4h.bias",
                "transformer.word_embeddings.weight",
                "lm_head.weight",
            ]
        ):
            # Column-parallel: split the output dimension across ranks.
            output_size = state.shape[0]
            assert output_size % tp_world_size == 0
            block_size = output_size // tp_world_size
            sharded_weights = torch.split(state, block_size, dim=0)
            assert len(sharded_weights) == tp_world_size
            for tp_rank, shard in enumerate(sharded_weights):
                assert shard.shape[0] == block_size
                shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
        elif any(
            match_suffix(state_name, candidate)
            for candidate in [
                "self_attention.dense.weight",
                "mlp.dense_4h_to_h.weight",
                "lm_head.weight",
            ]
        ):
            # Row-parallel: split the input dimension across ranks.
            input_size = state.shape[1]
            assert input_size % tp_world_size == 0
            block_size = input_size // tp_world_size
            sharded_weights = torch.split(state, block_size, dim=1)
            assert len(sharded_weights) == tp_world_size
            for tp_rank, shard in enumerate(sharded_weights):
                assert shard.shape[1] == block_size
                shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
        elif any(
            match_suffix(state_name, candidate)
            for candidate in [
                "self_attention.dense.bias",
                "mlp.dense_4h_to_h.bias",
            ]
        ):
            # Row-parallel bias: only rank 0 keeps the real values so the bias
            # is applied exactly once after reduction; other ranks hold zeros.
            shards_state_dicts[0][state_name] = state.detach().clone()
            for tp_rank in range(1, tp_world_size):
                shards_state_dicts[tp_rank][state_name] = torch.zeros_like(state)
        else:
            # We duplicate parameters across tp ranks.
            for tp_rank in range(tp_world_size):
                shards_state_dicts[tp_rank][state_name] = state.detach().clone()

        del state_dict[state_name]  # free memory as we go
        del state  # delete tensor

    # Persist one state dict per rank.
    # BUGFIX: the old loop appended each `save_path` back onto `save_paths`
    # while zipping over it, which doubled every entry in the returned list.
    # Also removed a stray per-parameter debug print in the loop above.
    for shard_save_path, shard_state_dict in zip(save_paths, shards_state_dicts):
        shard_save_path.parent.mkdir(parents=True, exist_ok=True)
        torch.save(shard_state_dict, shard_save_path)

    return save_paths
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
model_name = "bigscience/bloom"
|
||||||
|
save_path = Path("/data/shards")
|
||||||
|
tp_world_size = 8
|
||||||
|
dtype = torch.bfloat16
|
||||||
|
|
||||||
|
shard_model(model_name, save_path, tp_world_size=tp_world_size, dtype=dtype)
|
|
@ -0,0 +1,95 @@
|
||||||
|
import os
|
||||||
|
import contextlib
|
||||||
|
import torch
|
||||||
|
import torch.distributed
|
||||||
|
from transformers.generation_logits_process import (
|
||||||
|
LogitsProcessorList,
|
||||||
|
TemperatureLogitsWarper,
|
||||||
|
TopPLogitsWarper,
|
||||||
|
TopKLogitsWarper,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Sampling:
    """Stochastic next-token selection: draws one id per row from softmax(logits)."""

    def __call__(self, logits):
        # Turn the raw scores into a categorical distribution, then draw a
        # single sample for every row of the batch.
        distribution = torch.nn.functional.softmax(logits, dim=-1)
        drawn = torch.multinomial(distribution, num_samples=1)
        return drawn.squeeze(1)
|
||||||
|
|
||||||
|
|
||||||
|
class Greedy:
    """Deterministic next-token selection: picks the highest-scoring id per row."""

    def __call__(self, logits):
        return torch.argmax(logits, dim=-1)
|
||||||
|
|
||||||
|
|
||||||
|
class NextTokenChooser:
    """Warps logits (temperature / top-k / top-p) and picks the next token ids.

    The warper-pipeline idea is largely copied from
    https://github.com/huggingface/transformers/pull/5420/files; all samplers
    can be found in `generation_utils_samplers.py`. Sampling is used when
    `do_sample` is set or any warper is active; otherwise decoding is greedy.
    """

    def __init__(self, temperature=1.0, top_k=None, top_p=None, do_sample=False):
        warpers = LogitsProcessorList()
        use_sampling = do_sample

        # Each active warper implies stochastic decoding.
        if temperature is not None and temperature != 1.0:
            warpers.append(TemperatureLogitsWarper(float(temperature)))
            use_sampling = True
        if top_k is not None and top_k != 0:
            warpers.append(TopKLogitsWarper(top_k=top_k))
            use_sampling = True
        if top_p is not None and top_p < 1.0:
            warpers.append(TopPLogitsWarper(top_p=top_p))
            use_sampling = True

        self.warpers = warpers
        self.choice = Sampling() if use_sampling else Greedy()

    def __call__(self, input_ids, scores):
        warped_scores = self.warpers(input_ids, scores)
        chosen = self.choice(warped_scores)
        return chosen.unsqueeze(-1)
|
||||||
|
|
||||||
|
|
||||||
|
class StoppingCriteria:
    """Stateful stop condition: signals True once `max_new_tokens` calls occurred."""

    def __init__(self, max_new_tokens=20):
        self.max_new_tokens = max_new_tokens
        # Number of tokens generated so far for this sequence.
        self.current_tokens = 0

    def __call__(self, all_ids):
        # One call per newly generated token; `all_ids` is currently unused.
        self.current_tokens += 1
        return self.current_tokens >= self.max_new_tokens
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_torch_distributed():
    """Initialize torch.distributed for this process.

    Rank and world size are read from the RANK / WORLD_SIZE environment
    variables, defaulting to a single-process setup. Uses the NCCL backend
    when CUDA is available, gloo otherwise.

    Returns:
        A tuple (process_group, rank, world_size).
    """
    rank = int(os.getenv("RANK", "0"))
    world_size = int(os.getenv("WORLD_SIZE", "1"))

    if torch.cuda.is_available():
        # Pin this process to a single GPU before initializing NCCL.
        assert world_size <= torch.cuda.device_count(), "Each process is one gpu"
        device = rank % torch.cuda.device_count()
        torch.cuda.set_device(device)
        backend = "nccl"
    else:
        backend = "gloo"

    # NOTE(review): the rendezvous address/port is hard-coded — two instances
    # on the same host would collide; consider making it configurable.
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method="tcp://localhost:6000",
    )

    # NOTE(review): `_get_default_group` is a private torch API and may break
    # across torch versions; `torch.distributed.group.WORLD` is the public way.
    return torch.distributed.distributed_c10d._get_default_group(), rank, world_size
|
||||||
|
|
||||||
|
|
||||||
|
@contextlib.contextmanager
def set_default_dtype(dtype):
    """Context manager that temporarily sets torch's default dtype.

    The previous default dtype is restored on exit, even if the body raises.
    """
    previous = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        torch.set_default_dtype(previous)
|
|
@ -0,0 +1,480 @@
|
||||||
|
[[package]]
|
||||||
|
name = "accelerate"
|
||||||
|
version = "0.12.0"
|
||||||
|
description = "Accelerate"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7.0"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
numpy = ">=1.17"
|
||||||
|
packaging = ">=20.0"
|
||||||
|
psutil = "*"
|
||||||
|
pyyaml = "*"
|
||||||
|
torch = ">=1.4.0"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
dev = ["black (>=22.0,<23.0)", "datasets", "deepspeed (<0.7.0)", "evaluate", "flake8 (>=3.8.3)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "scipy", "sklearn", "tqdm", "transformers"]
|
||||||
|
quality = ["black (>=22.0,<23.0)", "flake8 (>=3.8.3)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)"]
|
||||||
|
sagemaker = ["sagemaker"]
|
||||||
|
test_dev = ["datasets", "deepspeed (<0.7.0)", "evaluate", "scipy", "sklearn", "tqdm", "transformers"]
|
||||||
|
test_prod = ["parameterized", "pytest", "pytest-subtests", "pytest-xdist"]
|
||||||
|
test_trackers = ["comet-ml", "tensorboard", "wandb"]
|
||||||
|
testing = ["datasets", "deepspeed (<0.7.0)", "evaluate", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "scipy", "sklearn", "tqdm", "transformers"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "click"
|
||||||
|
version = "8.1.3"
|
||||||
|
description = "Composable command line interface toolkit"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colorama"
|
||||||
|
version = "0.4.5"
|
||||||
|
description = "Cross-platform colored terminal text."
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grpcio"
|
||||||
|
version = "1.49.1"
|
||||||
|
description = "HTTP/2-based RPC framework"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
six = ">=1.5.2"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
protobuf = ["grpcio-tools (>=1.49.1)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grpcio-reflection"
|
||||||
|
version = "1.49.1"
|
||||||
|
description = "Standard Protobuf Reflection Service for gRPC"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
grpcio = ">=1.49.1"
|
||||||
|
protobuf = ">=4.21.3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grpcio-tools"
|
||||||
|
version = "1.49.1"
|
||||||
|
description = "Protobuf code generator for gRPC"
|
||||||
|
category = "dev"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
grpcio = ">=1.49.1"
|
||||||
|
protobuf = ">=4.21.3,<5.0dev"
|
||||||
|
setuptools = "*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "numpy"
|
||||||
|
version = "1.23.3"
|
||||||
|
description = "NumPy is the fundamental package for array computing with Python."
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "packaging"
|
||||||
|
version = "21.3"
|
||||||
|
description = "Core utilities for Python packages"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
pyparsing = ">=2.0.2,<3.0.5 || >3.0.5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "protobuf"
|
||||||
|
version = "4.21.7"
|
||||||
|
description = ""
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "psutil"
|
||||||
|
version = "5.9.2"
|
||||||
|
description = "Cross-platform lib for process and system monitoring in Python."
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyparsing"
|
||||||
|
version = "3.0.9"
|
||||||
|
description = "pyparsing module - Classes and methods to define and execute parsing grammars"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6.8"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
diagrams = ["jinja2", "railroad-diagrams"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "PyYAML"
|
||||||
|
version = "6.0"
|
||||||
|
description = "YAML parser and emitter for Python"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "setuptools"
|
||||||
|
version = "65.4.1"
|
||||||
|
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||||
|
category = "dev"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
|
||||||
|
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||||
|
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "six"
|
||||||
|
version = "1.16.0"
|
||||||
|
description = "Python 2 and 3 compatibility utilities"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "torch"
|
||||||
|
version = "1.12.1"
|
||||||
|
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7.0"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
typing-extensions = "*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typer"
|
||||||
|
version = "0.6.1"
|
||||||
|
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
click = ">=7.1.1,<9.0.0"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
||||||
|
dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
|
||||||
|
doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"]
|
||||||
|
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typing-extensions"
|
||||||
|
version = "4.3.0"
|
||||||
|
description = "Backported and Experimental Type Hints for Python 3.7+"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
|
||||||
|
[metadata]
|
||||||
|
lock-version = "1.1"
|
||||||
|
python-versions = "^3.9"
|
||||||
|
content-hash = "cedd0aebeb3731e2bbddf017a2ee6074c285866354272f8dfe930e9606437a25"
|
||||||
|
|
||||||
|
[metadata.files]
|
||||||
|
accelerate = [
|
||||||
|
{file = "accelerate-0.12.0-py3-none-any.whl", hash = "sha256:7742ca5c9f15dd1e0a283305599c196e260af4717a561d1f544aeab27d828af6"},
|
||||||
|
{file = "accelerate-0.12.0.tar.gz", hash = "sha256:e8b119c94fac31877620d5f9de311164ec81fa9dc9e175f0d0d4f50fc8d79473"},
|
||||||
|
]
|
||||||
|
click = [
|
||||||
|
{file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
|
||||||
|
{file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
|
||||||
|
]
|
||||||
|
colorama = [
|
||||||
|
{file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"},
|
||||||
|
{file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"},
|
||||||
|
]
|
||||||
|
grpcio = [
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:fd86040232e805b8e6378b2348c928490ee595b058ce9aaa27ed8e4b0f172b20"},
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6fd0c9cede9552bf00f8c5791d257d5bf3790d7057b26c59df08be5e7a1e021d"},
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:d0d402e158d4e84e49c158cb5204119d55e1baf363ee98d6cb5dce321c3a065d"},
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:822ceec743d42a627e64ea266059a62d214c5a3cdfcd0d7fe2b7a8e4e82527c7"},
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2106d9c16527f0a85e2eea6e6b91a74fc99579c60dd810d8690843ea02bc0f5f"},
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:52dd02b7e7868233c571b49bc38ebd347c3bb1ff8907bb0cb74cb5f00c790afc"},
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:120fecba2ec5d14b5a15d11063b39783fda8dc8d24addd83196acb6582cabd9b"},
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-win32.whl", hash = "sha256:f1a3b88e3c53c1a6e6bed635ec1bbb92201bb6a1f2db186179f7f3f244829788"},
|
||||||
|
{file = "grpcio-1.49.1-cp310-cp310-win_amd64.whl", hash = "sha256:a7d0017b92d3850abea87c1bdec6ea41104e71c77bca44c3e17f175c6700af62"},
|
||||||
|
{file = "grpcio-1.49.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:9fb17ff8c0d56099ac6ebfa84f670c5a62228d6b5c695cf21c02160c2ac1446b"},
|
||||||
|
{file = "grpcio-1.49.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:075f2d06e3db6b48a2157a1bcd52d6cbdca980dd18988fe6afdb41795d51625f"},
|
||||||
|
{file = "grpcio-1.49.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46d93a1b4572b461a227f1db6b8d35a88952db1c47e5fadcf8b8a2f0e1dd9201"},
|
||||||
|
{file = "grpcio-1.49.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc79b2b37d779ac42341ddef40ad5bf0966a64af412c89fc2b062e3ddabb093f"},
|
||||||
|
{file = "grpcio-1.49.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5f8b3a971c7820ea9878f3fd70086240a36aeee15d1b7e9ecbc2743b0e785568"},
|
||||||
|
{file = "grpcio-1.49.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49b301740cf5bc8fed4fee4c877570189ae3951432d79fa8e524b09353659811"},
|
||||||
|
{file = "grpcio-1.49.1-cp311-cp311-win32.whl", hash = "sha256:1c66a25afc6c71d357867b341da594a5587db5849b48f4b7d5908d236bb62ede"},
|
||||||
|
{file = "grpcio-1.49.1-cp311-cp311-win_amd64.whl", hash = "sha256:6b6c3a95d27846f4145d6967899b3ab25fffc6ae99544415e1adcacef84842d2"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-linux_armv7l.whl", hash = "sha256:1cc400c8a2173d1c042997d98a9563e12d9bb3fb6ad36b7f355bc77c7663b8af"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:34f736bd4d0deae90015c0e383885b431444fe6b6c591dea288173df20603146"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:196082b9c89ebf0961dcd77cb114bed8171964c8e3063b9da2fb33536a6938ed"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c9f89c42749890618cd3c2464e1fbf88446e3d2f67f1e334c8e5db2f3272bbd"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64419cb8a5b612cdb1550c2fd4acbb7d4fb263556cf4625f25522337e461509e"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:8a5272061826e6164f96e3255405ef6f73b88fd3e8bef464c7d061af8585ac62"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ea9d0172445241ad7cb49577314e39d0af2c5267395b3561d7ced5d70458a9f3"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-win32.whl", hash = "sha256:2070e87d95991473244c72d96d13596c751cb35558e11f5df5414981e7ed2492"},
|
||||||
|
{file = "grpcio-1.49.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fcedcab49baaa9db4a2d240ac81f2d57eb0052b1c6a9501b46b8ae912720fbf"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:afbb3475cf7f4f7d380c2ca37ee826e51974f3e2665613996a91d6a58583a534"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:a4f9ba141380abde6c3adc1727f21529137a2552002243fa87c41a07e528245c"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:cf0a1fb18a7204b9c44623dfbd1465b363236ce70c7a4ed30402f9f60d8b743b"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:17bb6fe72784b630728c6cff9c9d10ccc3b6d04e85da6e0a7b27fb1d135fac62"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18305d5a082d1593b005a895c10041f833b16788e88b02bb81061f5ebcc465df"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b6a1b39e59ac5a3067794a0e498911cf2e37e4b19ee9e9977dc5e7051714f13f"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e20d59aafc086b1cc68400463bddda6e41d3e5ed30851d1e2e0f6a2e7e342d3"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-win32.whl", hash = "sha256:e1e83233d4680863a421f3ee4a7a9b80d33cd27ee9ed7593bc93f6128302d3f2"},
|
||||||
|
{file = "grpcio-1.49.1-cp38-cp38-win_amd64.whl", hash = "sha256:221d42c654d2a41fa31323216279c73ed17d92f533bc140a3390cc1bd78bf63c"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:fa9e6e61391e99708ac87fc3436f6b7b9c6b845dc4639b406e5e61901e1aacde"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:9b449e966ef518ce9c860d21f8afe0b0f055220d95bc710301752ac1db96dd6a"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:aa34d2ad9f24e47fa9a3172801c676e4037d862247e39030165fe83821a7aafd"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5207f4eed1b775d264fcfe379d8541e1c43b878f2b63c0698f8f5c56c40f3d68"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b24a74651438d45619ac67004638856f76cc13d78b7478f2457754cbcb1c8ad"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:fe763781669790dc8b9618e7e677c839c87eae6cf28b655ee1fa69ae04eea03f"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2f2ff7ba0f8f431f32d4b4bc3a3713426949d3533b08466c4ff1b2b475932ca8"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-win32.whl", hash = "sha256:08ff74aec8ff457a89b97152d36cb811dcc1d17cd5a92a65933524e363327394"},
|
||||||
|
{file = "grpcio-1.49.1-cp39-cp39-win_amd64.whl", hash = "sha256:274ffbb39717918c514b35176510ae9be06e1d93121e84d50b350861dcb9a705"},
|
||||||
|
{file = "grpcio-1.49.1.tar.gz", hash = "sha256:d4725fc9ec8e8822906ae26bb26f5546891aa7fbc3443de970cc556d43a5c99f"},
|
||||||
|
]
|
||||||
|
grpcio-reflection = [
|
||||||
|
{file = "grpcio-reflection-1.49.1.tar.gz", hash = "sha256:b755dfe61d5255a02fb8d0d845bd0027847dee68bf0763a2b286d664ed07ec4d"},
|
||||||
|
{file = "grpcio_reflection-1.49.1-py3-none-any.whl", hash = "sha256:70a325a83c1c1ab583d368711e5733cbef5e068ad2c17cbe77df6e47e0311d1f"},
|
||||||
|
]
|
||||||
|
grpcio-tools = [
|
||||||
|
{file = "grpcio-tools-1.49.1.tar.gz", hash = "sha256:84cc64e5b46bad43d5d7bd2fd772b656eba0366961187a847e908e2cb735db91"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:2dfb6c7ece84d46bd690b23d3e060d18115c8bc5047d2e8a33e6747ed323a348"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:8f452a107c054a04db2570f7851a07f060313c6e841b0d394ce6030d598290e6"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:6a198871b582287213c4d70792bf275e1d7cf34eed1d019f534ddf4cd15ab039"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a0cca67a7d0287bdc855d81fdd38dc949c4273273a74f832f9e520abe4f20bc6"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdaff4c89eecb37c247b93025410db68114d97fa093cbb028e9bd7cda5912473"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bb8773118ad315db317d7b22b5ff75d649ca20931733281209e7cbd8c0fad53e"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7cc5534023735b8a8f56760b7c533918f874ce5a9064d7c5456d2709ae2b31f9"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-win32.whl", hash = "sha256:d277642acbe305f5586f9597b78fb9970d6633eb9f89c61e429c92c296c37129"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp310-cp310-win_amd64.whl", hash = "sha256:eed599cf08fc1a06c72492d3c5750c32f58de3750eddd984af1f257c14326701"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:9e5c13809ab2f245398e8446c4c3b399a62d591db651e46806cccf52a700452e"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:ab3d0ee9623720ee585fdf3753b3755d3144a4a8ae35bca8e3655fa2f41056be"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ba87e3512bc91d78bf9febcfb522eadda171d2d4ddaf886066b0f01aa4929ad"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e13b3643e7577a3ec13b79689eb4d7548890b1e104c04b9ed6557a3c3dd452"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:324f67d9cb4b7058b6ce45352fb64c20cc1fa04c34d97ad44772cfe6a4ae0cf5"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a64bab81b220c50033f584f57978ebbea575f09c1ccee765cd5c462177988098"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp311-cp311-win32.whl", hash = "sha256:f632d376f92f23e5931697a3acf1b38df7eb719774213d93c52e02acd2d529ac"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp311-cp311-win_amd64.whl", hash = "sha256:28ff2b978d9509474928b9c096a0cce4eaa9c8f7046136aee1545f6211ed8126"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-linux_armv7l.whl", hash = "sha256:46afd3cb7e555187451a5d283f108cdef397952a662cb48680afc615b158864a"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:9284568b728e41fa8f7e9c2e7399545d605f75d8072ef0e9aa2a05655cb679eb"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:aa34442cf787732cb41f2aa6172007e24f480b8b9d3dc5166de80d63e9072ea4"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3b8c9eb5a4250905414cd53a68caea3eb8f0c515aadb689e6e81b71ebe9ab5c6"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab15db024051bf21feb21c29cb2c3ea0a2e4f5cf341d46ef76e17fcf6aaef164"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:502084b622f758bef620a9107c2db9fcdf66d26c7e0e481d6bb87db4dc917d70"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4085890b77c640085f82bf1e90a0ea166ce48000bc2f5180914b974783c9c0a8"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-win32.whl", hash = "sha256:da0edb984699769ce02e18e3392d54b59a7a3f93acd285a68043f5bde4fc028e"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9887cd622770271101a7dd1832845d64744c3f88fd11ccb2620394079197a42e"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:8440fe7dae6a40c279e3a24b82793735babd38ecbb0d07bb712ff9c8963185d9"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:b5de2bb7dd6b6231da9b1556ade981513330b740e767f1d902c71ceee0a7d196"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:1e6f06a763aea7836b63d9c117347f2bf7038008ceef72758815c9e09c5fb1fc"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e31562f90120318c5395aabec0f2f69ad8c14b6676996b7730d9d2eaf9415d57"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49ef9a4e389a618157a9daa9fafdfeeaef1ece9adda7f50f85db928f24d4b3e8"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b384cb8e8d9bcb55ee8f9b064374561c7a1a05d848249581403d36fc7060032f"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:73732f77943ac3e898879cbb29c27253aa3c47566b8a59780fd24c6a54de1b66"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-win32.whl", hash = "sha256:b594b2745a5ba9e7a76ce561bc5ab40bc65bb44743c505529b1e4f12af29104d"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp38-cp38-win_amd64.whl", hash = "sha256:680fbc88f8709ddcabb88f86749f2d8e429160890cff2c70680880a6970d4eef"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:e8c3869121860f6767eedb7d24fc54dfd71e737fdfbb26e1334684606f3274fd"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:73e9d7c886ba10e20c97d1dab0ff961ba5800757ae5e31be21b1cda8130c52f8"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:1760de2dd2c4f08de87b039043a4797f3c17193656e7e3eb84e92f0517083c0c"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd4b1e216dd04d9245ee8f4e601a1f98c25e6e417ea5cf8d825c50589a8b447e"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c28751ab5955cae563d07677e799233f0fe1c0fc49d9cbd61ff1957e83617f"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c24239c3ee9ed16314c14b4e24437b5079ebc344f343f33629a582f8699f583b"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:892d3dacf1942820f0b7a868a30e6fbcdf5bec08543b682c7274b0101cee632d"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-win32.whl", hash = "sha256:704d21509ec06efc9d034dbe70e7152715aac004941f4f0f553cf3a0aff15bd5"},
|
||||||
|
{file = "grpcio_tools-1.49.1-cp39-cp39-win_amd64.whl", hash = "sha256:1efa0c221c719433f441ac0e026fc3c4dbc9a1a08a552ecdc707775e2f2fbbae"},
|
||||||
|
]
|
||||||
|
numpy = [
|
||||||
|
{file = "numpy-1.23.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c9f707b5bb73bf277d812ded9896f9512a43edff72712f31667d0a8c2f8e71ee"},
|
||||||
|
{file = "numpy-1.23.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffcf105ecdd9396e05a8e58e81faaaf34d3f9875f137c7372450baa5d77c9a54"},
|
||||||
|
{file = "numpy-1.23.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ea3f98a0ffce3f8f57675eb9119f3f4edb81888b6874bc1953f91e0b1d4f440"},
|
||||||
|
{file = "numpy-1.23.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004f0efcb2fe1c0bd6ae1fcfc69cc8b6bf2407e0f18be308612007a0762b4089"},
|
||||||
|
{file = "numpy-1.23.3-cp310-cp310-win32.whl", hash = "sha256:98dcbc02e39b1658dc4b4508442a560fe3ca5ca0d989f0df062534e5ca3a5c1a"},
|
||||||
|
{file = "numpy-1.23.3-cp310-cp310-win_amd64.whl", hash = "sha256:39a664e3d26ea854211867d20ebcc8023257c1800ae89773cbba9f9e97bae036"},
|
||||||
|
{file = "numpy-1.23.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1f27b5322ac4067e67c8f9378b41c746d8feac8bdd0e0ffede5324667b8a075c"},
|
||||||
|
{file = "numpy-1.23.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ad3ec9a748a8943e6eb4358201f7e1c12ede35f510b1a2221b70af4bb64295c"},
|
||||||
|
{file = "numpy-1.23.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdc9febce3e68b697d931941b263c59e0c74e8f18861f4064c1f712562903411"},
|
||||||
|
{file = "numpy-1.23.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:301c00cf5e60e08e04d842fc47df641d4a181e651c7135c50dc2762ffe293dbd"},
|
||||||
|
{file = "numpy-1.23.3-cp311-cp311-win32.whl", hash = "sha256:7cd1328e5bdf0dee621912f5833648e2daca72e3839ec1d6695e91089625f0b4"},
|
||||||
|
{file = "numpy-1.23.3-cp311-cp311-win_amd64.whl", hash = "sha256:8355fc10fd33a5a70981a5b8a0de51d10af3688d7a9e4a34fcc8fa0d7467bb7f"},
|
||||||
|
{file = "numpy-1.23.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc6e8da415f359b578b00bcfb1d08411c96e9a97f9e6c7adada554a0812a6cc6"},
|
||||||
|
{file = "numpy-1.23.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:22d43376ee0acd547f3149b9ec12eec2f0ca4a6ab2f61753c5b29bb3e795ac4d"},
|
||||||
|
{file = "numpy-1.23.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a64403f634e5ffdcd85e0b12c08f04b3080d3e840aef118721021f9b48fc1460"},
|
||||||
|
{file = "numpy-1.23.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efd9d3abe5774404becdb0748178b48a218f1d8c44e0375475732211ea47c67e"},
|
||||||
|
{file = "numpy-1.23.3-cp38-cp38-win32.whl", hash = "sha256:f8c02ec3c4c4fcb718fdf89a6c6f709b14949408e8cf2a2be5bfa9c49548fd85"},
|
||||||
|
{file = "numpy-1.23.3-cp38-cp38-win_amd64.whl", hash = "sha256:e868b0389c5ccfc092031a861d4e158ea164d8b7fdbb10e3b5689b4fc6498df6"},
|
||||||
|
{file = "numpy-1.23.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:09f6b7bdffe57fc61d869a22f506049825d707b288039d30f26a0d0d8ea05164"},
|
||||||
|
{file = "numpy-1.23.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8c79d7cf86d049d0c5089231a5bcd31edb03555bd93d81a16870aa98c6cfb79d"},
|
||||||
|
{file = "numpy-1.23.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d5420053bbb3dd64c30e58f9363d7a9c27444c3648e61460c1237f9ec3fa14"},
|
||||||
|
{file = "numpy-1.23.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5422d6a1ea9b15577a9432e26608c73a78faf0b9039437b075cf322c92e98e7"},
|
||||||
|
{file = "numpy-1.23.3-cp39-cp39-win32.whl", hash = "sha256:c1ba66c48b19cc9c2975c0d354f24058888cdc674bebadceb3cdc9ec403fb5d1"},
|
||||||
|
{file = "numpy-1.23.3-cp39-cp39-win_amd64.whl", hash = "sha256:78a63d2df1d947bd9d1b11d35564c2f9e4b57898aae4626638056ec1a231c40c"},
|
||||||
|
{file = "numpy-1.23.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:17c0e467ade9bda685d5ac7f5fa729d8d3e76b23195471adae2d6a6941bd2c18"},
|
||||||
|
{file = "numpy-1.23.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91b8d6768a75247026e951dce3b2aac79dc7e78622fc148329135ba189813584"},
|
||||||
|
{file = "numpy-1.23.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:94c15ca4e52671a59219146ff584488907b1f9b3fc232622b47e2cf832e94fb8"},
|
||||||
|
{file = "numpy-1.23.3.tar.gz", hash = "sha256:51bf49c0cd1d52be0a240aa66f3458afc4b95d8993d2d04f0d91fa60c10af6cd"},
|
||||||
|
]
|
||||||
|
packaging = [
|
||||||
|
{file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"},
|
||||||
|
{file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"},
|
||||||
|
]
|
||||||
|
protobuf = [
|
||||||
|
{file = "protobuf-4.21.7-cp310-abi3-win32.whl", hash = "sha256:c7cb105d69a87416bd9023e64324e1c089593e6dae64d2536f06bcbe49cd97d8"},
|
||||||
|
{file = "protobuf-4.21.7-cp310-abi3-win_amd64.whl", hash = "sha256:3ec85328a35a16463c6f419dbce3c0fc42b3e904d966f17f48bae39597c7a543"},
|
||||||
|
{file = "protobuf-4.21.7-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:db9056b6a11cb5131036d734bcbf91ef3ef9235d6b681b2fc431cbfe5a7f2e56"},
|
||||||
|
{file = "protobuf-4.21.7-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:ca200645d6235ce0df3ccfdff1567acbab35c4db222a97357806e015f85b5744"},
|
||||||
|
{file = "protobuf-4.21.7-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:b019c79e23a80735cc8a71b95f76a49a262f579d6b84fd20a0b82279f40e2cc1"},
|
||||||
|
{file = "protobuf-4.21.7-cp37-cp37m-win32.whl", hash = "sha256:d3f89ccf7182293feba2de2739c8bf34fed1ed7c65a5cf987be00311acac57c1"},
|
||||||
|
{file = "protobuf-4.21.7-cp37-cp37m-win_amd64.whl", hash = "sha256:a74d96cd960b87b4b712797c741bb3ea3a913f5c2dc4b6cbe9c0f8360b75297d"},
|
||||||
|
{file = "protobuf-4.21.7-cp38-cp38-win32.whl", hash = "sha256:8e09d1916386eca1ef1353767b6efcebc0a6859ed7f73cb7fb974feba3184830"},
|
||||||
|
{file = "protobuf-4.21.7-cp38-cp38-win_amd64.whl", hash = "sha256:9e355f2a839d9930d83971b9f562395e13493f0e9211520f8913bd11efa53c02"},
|
||||||
|
{file = "protobuf-4.21.7-cp39-cp39-win32.whl", hash = "sha256:f370c0a71712f8965023dd5b13277444d3cdfecc96b2c778b0e19acbfd60df6e"},
|
||||||
|
{file = "protobuf-4.21.7-cp39-cp39-win_amd64.whl", hash = "sha256:9643684232b6b340b5e63bb69c9b4904cdd39e4303d498d1a92abddc7e895b7f"},
|
||||||
|
{file = "protobuf-4.21.7-py2.py3-none-any.whl", hash = "sha256:8066322588d4b499869bf9f665ebe448e793036b552f68c585a9b28f1e393f66"},
|
||||||
|
{file = "protobuf-4.21.7-py3-none-any.whl", hash = "sha256:58b81358ec6c0b5d50df761460ae2db58405c063fd415e1101209221a0a810e1"},
|
||||||
|
{file = "protobuf-4.21.7.tar.gz", hash = "sha256:71d9dba03ed3432c878a801e2ea51e034b0ea01cf3a4344fb60166cb5f6c8757"},
|
||||||
|
]
|
||||||
|
psutil = [
|
||||||
|
{file = "psutil-5.9.2-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:8f024fbb26c8daf5d70287bb3edfafa22283c255287cf523c5d81721e8e5d82c"},
|
||||||
|
{file = "psutil-5.9.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:b2f248ffc346f4f4f0d747ee1947963613216b06688be0be2e393986fe20dbbb"},
|
||||||
|
{file = "psutil-5.9.2-cp27-cp27m-win32.whl", hash = "sha256:b1928b9bf478d31fdffdb57101d18f9b70ed4e9b0e41af751851813547b2a9ab"},
|
||||||
|
{file = "psutil-5.9.2-cp27-cp27m-win_amd64.whl", hash = "sha256:404f4816c16a2fcc4eaa36d7eb49a66df2d083e829d3e39ee8759a411dbc9ecf"},
|
||||||
|
{file = "psutil-5.9.2-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:94e621c6a4ddb2573d4d30cba074f6d1aa0186645917df42c811c473dd22b339"},
|
||||||
|
{file = "psutil-5.9.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:256098b4f6ffea6441eb54ab3eb64db9ecef18f6a80d7ba91549195d55420f84"},
|
||||||
|
{file = "psutil-5.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:614337922702e9be37a39954d67fdb9e855981624d8011a9927b8f2d3c9625d9"},
|
||||||
|
{file = "psutil-5.9.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39ec06dc6c934fb53df10c1672e299145ce609ff0611b569e75a88f313634969"},
|
||||||
|
{file = "psutil-5.9.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3ac2c0375ef498e74b9b4ec56df3c88be43fe56cac465627572dbfb21c4be34"},
|
||||||
|
{file = "psutil-5.9.2-cp310-cp310-win32.whl", hash = "sha256:e4c4a7636ffc47b7141864f1c5e7d649f42c54e49da2dd3cceb1c5f5d29bfc85"},
|
||||||
|
{file = "psutil-5.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:f4cb67215c10d4657e320037109939b1c1d2fd70ca3d76301992f89fe2edb1f1"},
|
||||||
|
{file = "psutil-5.9.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:dc9bda7d5ced744622f157cc8d8bdd51735dafcecff807e928ff26bdb0ff097d"},
|
||||||
|
{file = "psutil-5.9.2-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75291912b945a7351d45df682f9644540d564d62115d4a20d45fa17dc2d48f8"},
|
||||||
|
{file = "psutil-5.9.2-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4018d5f9b6651f9896c7a7c2c9f4652e4eea53f10751c4e7d08a9093ab587ec"},
|
||||||
|
{file = "psutil-5.9.2-cp36-cp36m-win32.whl", hash = "sha256:f40ba362fefc11d6bea4403f070078d60053ed422255bd838cd86a40674364c9"},
|
||||||
|
{file = "psutil-5.9.2-cp36-cp36m-win_amd64.whl", hash = "sha256:9770c1d25aee91417eba7869139d629d6328a9422ce1cdd112bd56377ca98444"},
|
||||||
|
{file = "psutil-5.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:42638876b7f5ef43cef8dcf640d3401b27a51ee3fa137cb2aa2e72e188414c32"},
|
||||||
|
{file = "psutil-5.9.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91aa0dac0c64688667b4285fa29354acfb3e834e1fd98b535b9986c883c2ce1d"},
|
||||||
|
{file = "psutil-5.9.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4fb54941aac044a61db9d8eb56fc5bee207db3bc58645d657249030e15ba3727"},
|
||||||
|
{file = "psutil-5.9.2-cp37-cp37m-win32.whl", hash = "sha256:7cbb795dcd8ed8fd238bc9e9f64ab188f3f4096d2e811b5a82da53d164b84c3f"},
|
||||||
|
{file = "psutil-5.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:5d39e3a2d5c40efa977c9a8dd4f679763c43c6c255b1340a56489955dbca767c"},
|
||||||
|
{file = "psutil-5.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd331866628d18223a4265371fd255774affd86244fc307ef66eaf00de0633d5"},
|
||||||
|
{file = "psutil-5.9.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b315febaebae813326296872fdb4be92ad3ce10d1d742a6b0c49fb619481ed0b"},
|
||||||
|
{file = "psutil-5.9.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7929a516125f62399d6e8e026129c8835f6c5a3aab88c3fff1a05ee8feb840d"},
|
||||||
|
{file = "psutil-5.9.2-cp38-cp38-win32.whl", hash = "sha256:561dec454853846d1dd0247b44c2e66a0a0c490f937086930ec4b8f83bf44f06"},
|
||||||
|
{file = "psutil-5.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:67b33f27fc0427483b61563a16c90d9f3b547eeb7af0ef1b9fe024cdc9b3a6ea"},
|
||||||
|
{file = "psutil-5.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b3591616fa07b15050b2f87e1cdefd06a554382e72866fcc0ab2be9d116486c8"},
|
||||||
|
{file = "psutil-5.9.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:14b29f581b5edab1f133563272a6011925401804d52d603c5c606936b49c8b97"},
|
||||||
|
{file = "psutil-5.9.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4642fd93785a29353d6917a23e2ac6177308ef5e8be5cc17008d885cb9f70f12"},
|
||||||
|
{file = "psutil-5.9.2-cp39-cp39-win32.whl", hash = "sha256:ed29ea0b9a372c5188cdb2ad39f937900a10fb5478dc077283bf86eeac678ef1"},
|
||||||
|
{file = "psutil-5.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:68b35cbff92d1f7103d8f1db77c977e72f49fcefae3d3d2b91c76b0e7aef48b8"},
|
||||||
|
{file = "psutil-5.9.2.tar.gz", hash = "sha256:feb861a10b6c3bb00701063b37e4afc754f8217f0f09c42280586bd6ac712b5c"},
|
||||||
|
]
|
||||||
|
pyparsing = [
|
||||||
|
{file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
|
||||||
|
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
|
||||||
|
]
|
||||||
|
PyYAML = [
|
||||||
|
{file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
|
||||||
|
{file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
|
||||||
|
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"},
|
||||||
|
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"},
|
||||||
|
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
|
||||||
|
{file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
|
||||||
|
{file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
|
||||||
|
{file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
|
||||||
|
{file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
|
||||||
|
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
|
||||||
|
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
|
||||||
|
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
|
||||||
|
{file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
|
||||||
|
{file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
|
||||||
|
{file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
|
||||||
|
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
|
||||||
|
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
|
||||||
|
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"},
|
||||||
|
{file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"},
|
||||||
|
{file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"},
|
||||||
|
{file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"},
|
||||||
|
{file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"},
|
||||||
|
{file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"},
|
||||||
|
{file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"},
|
||||||
|
{file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"},
|
||||||
|
{file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"},
|
||||||
|
{file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"},
|
||||||
|
{file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"},
|
||||||
|
{file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"},
|
||||||
|
{file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"},
|
||||||
|
{file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"},
|
||||||
|
{file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"},
|
||||||
|
{file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"},
|
||||||
|
{file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"},
|
||||||
|
{file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"},
|
||||||
|
{file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"},
|
||||||
|
{file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"},
|
||||||
|
{file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"},
|
||||||
|
{file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},
|
||||||
|
{file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
|
||||||
|
]
|
||||||
|
setuptools = [
|
||||||
|
{file = "setuptools-65.4.1-py3-none-any.whl", hash = "sha256:1b6bdc6161661409c5f21508763dc63ab20a9ac2f8ba20029aaaa7fdb9118012"},
|
||||||
|
{file = "setuptools-65.4.1.tar.gz", hash = "sha256:3050e338e5871e70c72983072fe34f6032ae1cdeeeb67338199c2f74e083a80e"},
|
||||||
|
]
|
||||||
|
six = [
|
||||||
|
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
|
||||||
|
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||||
|
]
|
||||||
|
torch = [
|
||||||
|
{file = "torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:9c038662db894a23e49e385df13d47b2a777ffd56d9bcd5b832593fab0a7e286"},
|
||||||
|
{file = "torch-1.12.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:4e1b9c14cf13fd2ab8d769529050629a0e68a6fc5cb8e84b4a3cc1dd8c4fe541"},
|
||||||
|
{file = "torch-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:e9c8f4a311ac29fc7e8e955cfb7733deb5dbe1bdaabf5d4af2765695824b7e0d"},
|
||||||
|
{file = "torch-1.12.1-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:976c3f997cea38ee91a0dd3c3a42322785414748d1761ef926b789dfa97c6134"},
|
||||||
|
{file = "torch-1.12.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:68104e4715a55c4bb29a85c6a8d57d820e0757da363be1ba680fa8cc5be17b52"},
|
||||||
|
{file = "torch-1.12.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:743784ccea0dc8f2a3fe6a536bec8c4763bd82c1352f314937cb4008d4805de1"},
|
||||||
|
{file = "torch-1.12.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b5dbcca369800ce99ba7ae6dee3466607a66958afca3b740690d88168752abcf"},
|
||||||
|
{file = "torch-1.12.1-cp37-cp37m-win_amd64.whl", hash = "sha256:f3b52a634e62821e747e872084ab32fbcb01b7fa7dbb7471b6218279f02a178a"},
|
||||||
|
{file = "torch-1.12.1-cp37-none-macosx_10_9_x86_64.whl", hash = "sha256:8a34a2fbbaa07c921e1b203f59d3d6e00ed379f2b384445773bd14e328a5b6c8"},
|
||||||
|
{file = "torch-1.12.1-cp37-none-macosx_11_0_arm64.whl", hash = "sha256:42f639501928caabb9d1d55ddd17f07cd694de146686c24489ab8c615c2871f2"},
|
||||||
|
{file = "torch-1.12.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0b44601ec56f7dd44ad8afc00846051162ef9c26a8579dda0a02194327f2d55e"},
|
||||||
|
{file = "torch-1.12.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:cd26d8c5640c3a28c526d41ccdca14cf1cbca0d0f2e14e8263a7ac17194ab1d2"},
|
||||||
|
{file = "torch-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:42e115dab26f60c29e298559dbec88444175528b729ae994ec4c65d56fe267dd"},
|
||||||
|
{file = "torch-1.12.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:a8320ba9ad87e80ca5a6a016e46ada4d1ba0c54626e135d99b2129a4541c509d"},
|
||||||
|
{file = "torch-1.12.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:03e31c37711db2cd201e02de5826de875529e45a55631d317aadce2f1ed45aa8"},
|
||||||
|
{file = "torch-1.12.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9b356aea223772cd754edb4d9ecf2a025909b8615a7668ac7d5130f86e7ec421"},
|
||||||
|
{file = "torch-1.12.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:6cf6f54b43c0c30335428195589bd00e764a6d27f3b9ba637aaa8c11aaf93073"},
|
||||||
|
{file = "torch-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:f00c721f489089dc6364a01fd84906348fe02243d0af737f944fddb36003400d"},
|
||||||
|
{file = "torch-1.12.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:bfec2843daa654f04fda23ba823af03e7b6f7650a873cdb726752d0e3718dada"},
|
||||||
|
{file = "torch-1.12.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:69fe2cae7c39ccadd65a123793d30e0db881f1c1927945519c5c17323131437e"},
|
||||||
|
]
|
||||||
|
typer = [
|
||||||
|
{file = "typer-0.6.1-py3-none-any.whl", hash = "sha256:54b19e5df18654070a82f8c2aa1da456a4ac16a2a83e6dcd9f170e291c56338e"},
|
||||||
|
{file = "typer-0.6.1.tar.gz", hash = "sha256:2d5720a5e63f73eaf31edaa15f6ab87f35f0690f8ca233017d7d23d743a91d73"},
|
||||||
|
]
|
||||||
|
typing-extensions = [
|
||||||
|
{file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"},
|
||||||
|
{file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"},
|
||||||
|
]
|
|
@ -0,0 +1,21 @@
|
||||||
|
[tool.poetry]
|
||||||
|
name = "bloom-inference"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "BLOOM Inference Python gRPC Server"
|
||||||
|
authors = ["Olivier Dehaene <olivier@huggingface.co>"]
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.9"
|
||||||
|
protobuf = "^4.21.7"
|
||||||
|
grpcio = "^1.49.1"
|
||||||
|
torch = "^1.12.1"
|
||||||
|
typer = "^0.6.1"
|
||||||
|
grpcio-reflection = "^1.49.1"
|
||||||
|
accelerate = "^0.12.0"
|
||||||
|
|
||||||
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
grpcio-tools = "^1.49.1"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core>=1.0.0"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
Loading…
Reference in New Issue