Olivier Dehaene 2022-10-08 12:30:12 +02:00
commit 295831a481
43 changed files with 5060 additions and 0 deletions

37
README.md Normal file

@ -0,0 +1,37 @@
# BLOOM Inference
A Rust and gRPC server for BLOOM Inference.
## Install
```shell
cd server
pip install .
```
```shell
cd router
cargo build --release
```
## Run
```shell
python server/bloom_inference/main.py bigscience/bloom --num-gpus 8 --shard-directory /dev/shm/models
```
```shell
./router/target/release/router
```
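Once both processes are up, you can smoke-test the stack by posting a request to the router. A minimal sketch, assuming the defaults hard-coded in `router/src/main.rs` (`127.0.0.1:3000`, `POST /generate`, and the default sampling parameters):
```python
# Minimal smoke test against the router; assumes the defaults in
# router/src/main.rs (127.0.0.1:3000, POST /generate).
import json
import urllib.request

payload = {
    "inputs": "Hello, I am",
    "parameters": {"temperature": 1.0, "top_k": 0, "top_p": 1.0, "do_sample": False, "max_new_tokens": 20},
}
req = urllib.request.Request(
    "http://127.0.0.1:3000/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["generated_text"])
```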
## TODO:
- [ ] Improve model download
- Store "shardable" layers separately and layer by layer
- [ ] Add batching args to router CLI
- [ ] Add docstrings + comments everywhere as the codebase is fairly complicated
- [ ] Add tests
- [ ] Add shutdown logic in router and server
- [ ] Improve multi-processing logic in server
- [ ] Improve error handling everywhere
- [ ] Improve past key layer indexing?

83
proto/generate.proto Normal file

@ -0,0 +1,83 @@
syntax = "proto3";
package generate.v1;
service TextGeneration {
/// Service discovery
rpc ServiceDiscovery(Empty) returns (ServiceDiscoveryResponse) {}
/// Empties batch cache
rpc ClearCache(Empty) returns (Empty);
/// Generate tokens for a batch without cache
rpc Generate(Batch) returns (Response);
/// Generate tokens for a batch with cache
rpc GenerateWithCache(BatchCached) returns (Response);
}
message ServiceDiscoveryResponse {
repeated string urls = 1;
}
message LogitsWarperParameters {
float temperature = 1;
uint32 top_k = 2;
float top_p = 3;
bool do_sample = 4;
}
message Request {
/// Request ID
uint64 id = 1;
/// The generation context
string inputs = 2;
/// Logits Warper Parameters
LogitsWarperParameters parameters = 3;
/// Stopping criteria
uint32 max_new_tokens = 4;
}
message Batch {
/// Batch ID
uint64 id = 1;
/// Individual requests
repeated Request requests = 2;
}
message BatchCached {
/// Batch ID
uint64 id = 1;
/// Request ids within cache
repeated uint64 request_ids = 2;
/// Cache IDs
repeated uint64 batch_cached_ids = 3;
/// Batch size (sum of all batch sizes)
uint32 total_batch_size = 4;
/// Max sequence length
uint32 max_sequence_length = 5;
}
message FinishedGeneration {
/// ID of the original request
uint64 id = 1;
/// Output
string output = 2;
}
message CacheEntry {
/// Cache ID; same as batch ID
uint64 id = 1;
/// Requests present in cache entry
repeated uint64 request_ids = 2;
/// Sequence length
uint32 sequence_length = 3;
}
message Response {
/// Finished requests (optional)
repeated FinishedGeneration finished = 1;
/// Cache entry (optional)
optional CacheEntry cache_entry = 2;
}
// Represent an empty message.
message Empty {}
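The Python server can also be exercised directly through this protocol using the generated stubs shown later in this commit. A minimal sketch, assuming a server listening on the unix socket used by `server/bloom_inference/server.py` (`/tmp/bloom-inference-0`); the ids are arbitrary, and `finished` stays empty until enough decode steps have run:
```python
# Sketch of a direct gRPC call against a running server, using the
# generated stubs from server/bloom_inference/pb.
import grpc
from bloom_inference.pb import generate_pb2, generate_pb2_grpc

# Dial the unix socket exposed by the server process
channel = grpc.insecure_channel("unix:///tmp/bloom-inference-0")
stub = generate_pb2_grpc.TextGenerationStub(channel)

request = generate_pb2.Request(
    id=0,
    inputs="Hello, I am",
    parameters=generate_pb2.LogitsWarperParameters(
        temperature=1.0, top_k=0, top_p=1.0, do_sample=False
    ),
    max_new_tokens=20,
)
# One prefill step over a batch containing a single request; the router
# normally drives the follow-up GenerateWithCache calls.
response = stub.Generate(generate_pb2.Batch(id=0, requests=[request]))
print(response.finished, response.cache_entry)
```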

1
router/.gitignore vendored Normal file

@ -0,0 +1 @@
/target

2373
router/Cargo.lock generated Normal file

File diff suppressed because it is too large

28
router/Cargo.toml Normal file

@ -0,0 +1,28 @@
[package]
name = "bloom-inference"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
bloom-inference-client = { path = "client" }
futures = "0.3.24"
parking_lot = "0.12.1"
poem = "1.3.45"
serde = "1.0.145"
serde_json = "1.0.85"
tokenizers = "0.13.0"
tokio = { version = "1.21.1", features = ["rt-multi-thread", "parking_lot", "sync"] }
tracing = "0.1.36"
tracing-subscriber = "0.3.15"
[workspace]
members = [
"client",
]
[profile.release]
debug = 1
incremental = true
lto = "off"

19
router/client/Cargo.toml Normal file

@ -0,0 +1,19 @@
[package]
name = "bloom-inference-client"
version = "0.1.0"
edition = "2021"
[dependencies]
futures = "0.3.24"
#grpc-error-details = { path = "../../grpc-error-details" }
#grpc-metadata = { path = "../../grpc-metadata" }
prost = "^0.9"
thiserror = "1.0.37"
tokio = { version = "1.21.2", features = ["sync"] }
tonic = "^0.6"
tower = "^0.4"
tracing = "^0.1"
tracing-error = "^0.2"
[build-dependencies]
tonic-build = "0.6.2"

14
router/client/build.rs Normal file

@ -0,0 +1,14 @@
use std::fs;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Ignore the error if the directory already exists
    fs::create_dir("src/pb").unwrap_or(());
tonic_build::configure()
.build_client(true)
.build_server(false)
.out_dir("src/pb")
.include_file("mod.rs")
.compile(&["../../proto/generate.proto"], &["../../proto"])
.unwrap_or_else(|e| panic!("protobuf compilation failed: {}", e));
Ok(())
}

104
router/client/src/client.rs Normal file

@ -0,0 +1,104 @@
use crate::pb::generate::v1::text_generation_client::TextGenerationClient;
use crate::pb::generate::v1::*;
use crate::Result;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tower::timeout::Timeout;
use tracing::*;
/// BLOOM Inference gRPC client
#[derive(Clone)]
pub struct Client {
stub: TextGenerationClient<Timeout<Channel>>,
}
impl Client {
/// Returns a client connected to the given url. Requests exceeding timeout will fail.
pub async fn connect(uri: Uri, timeout: Duration) -> Self {
let channel = Channel::builder(uri)
.connect()
.await
.expect("Transport error");
let timeout_channel = Timeout::new(channel, timeout);
Self {
stub: TextGenerationClient::new(timeout_channel),
}
}
/// Returns a client connected to the given unix socket. Requests exceeding timeout will fail.
pub async fn connect_uds(path: String, timeout: Duration) -> Self {
        // Dummy URI: the custom connector below ignores it and dials the unix socket
        let channel = Channel::from_shared("http://[::]:50051")
.unwrap()
.connect_with_connector(tower::service_fn(move |_: Uri| {
tokio::net::UnixStream::connect(path.clone())
}))
.await
.expect("Transport error");
let timeout_channel = Timeout::new(channel, timeout);
Self {
stub: TextGenerationClient::new(timeout_channel),
}
}
#[instrument(skip(self))]
pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
let request = tonic::Request::new(Empty {});
let response = self
.stub
.service_discovery(request)
.instrument(info_span!("service_discovery"))
.await?;
let urls = response
.into_inner()
.urls
.into_iter()
.map(|url| match url.strip_prefix("unix://") {
None => url,
Some(stripped_url) => stripped_url.to_string(),
})
.collect();
Ok(urls)
}
#[instrument(skip(self))]
pub async fn clear_cache(&mut self) -> Result<()> {
let request = tonic::Request::new(Empty {});
self.stub
.clear_cache(request)
.instrument(info_span!("clear_cache"))
.await?;
Ok(())
}
#[instrument(skip(self))]
pub async fn generate(
&mut self,
request: Batch,
) -> Result<(Vec<FinishedGeneration>, Option<CacheEntry>)> {
let request = tonic::Request::new(request);
let response = self
.stub
.generate(request)
.instrument(info_span!("generate"))
.await?
.into_inner();
Ok((response.finished, response.cache_entry))
}
#[instrument(skip(self))]
pub async fn generate_with_cache(
&mut self,
request: BatchCached,
) -> Result<(Vec<FinishedGeneration>, Option<CacheEntry>)> {
let request = tonic::Request::new(request);
let response = self
.stub
.generate_with_cache(request)
.instrument(info_span!("generate_with_cache"))
.await?
.into_inner();
Ok((response.finished, response.cache_entry))
}
}

32
router/client/src/lib.rs Normal file

@ -0,0 +1,32 @@
//! BLOOM Inference gRPC client library
mod client;
mod pb;
mod sharded_client;
pub use client::Client;
pub use pb::generate::v1::{
Batch, BatchCached, CacheEntry, FinishedGeneration, LogitsWarperParameters, Request,
};
pub use sharded_client::ShardedClient;
use thiserror::Error;
pub use tonic::transport::Uri;
use tonic::Status;
#[derive(Error, Debug, Clone)]
#[error("Text generation client error: {msg:?}")]
pub struct ClientError {
msg: String,
// source: Status,
}
impl From<Status> for ClientError {
fn from(err: Status) -> Self {
Self {
msg: err.to_string(),
// source: err,
}
}
}
pub type Result<T> = std::result::Result<T, ClientError>;

1
router/client/src/pb/.gitignore vendored Normal file

@ -0,0 +1 @@
*.rs

106
router/client/src/sharded_client.rs Normal file

@ -0,0 +1,106 @@
use crate::Result;
use crate::{Batch, BatchCached, CacheEntry, Client, FinishedGeneration};
use futures::future::join_all;
use std::time::Duration;
use tokio::sync::{broadcast, mpsc};
use tonic::transport::Uri;
#[derive(Clone, Debug)]
enum Command {
Generate(
Batch,
mpsc::Sender<Result<(Vec<FinishedGeneration>, Option<CacheEntry>)>>,
),
GenerateWithCache(
BatchCached,
mpsc::Sender<Result<(Vec<FinishedGeneration>, Option<CacheEntry>)>>,
),
ClearCache(mpsc::Sender<Result<()>>),
}
async fn client_task(mut client: Client, mut request_subscriber: broadcast::Receiver<Command>) {
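    // Each shard runs one of these tasks. Commands are broadcast to every
    // shard so all ranks execute the same batch; the caller consumes a single
    // reply from the bounded mpsc channel and the remaining replies are
    // silently dropped by `try_send(..).unwrap_or(())`.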
while let Ok(message) = request_subscriber.recv().await {
match message {
Command::Generate(batch, response_tx) => {
let result = client.generate(batch).await;
response_tx.try_send(result).unwrap_or(());
}
Command::GenerateWithCache(batch_cached, response_tx) => {
let result = client.generate_with_cache(batch_cached).await;
response_tx.try_send(result).unwrap_or(());
}
Command::ClearCache(response_tx) => {
let result = client.clear_cache().await;
response_tx.try_send(result).unwrap_or(());
}
};
}
}
pub struct ShardedClient {
request_tx: broadcast::Sender<Command>,
}
impl ShardedClient {
fn new(mut clients: Vec<Client>) -> Self {
let (request_tx, _) = broadcast::channel(1);
for client in clients.drain(..) {
let request_subscriber = request_tx.subscribe();
tokio::spawn(client_task(client, request_subscriber));
}
Self { request_tx }
}
async fn from_master_client(mut master_client: Client) -> Self {
let uris = master_client.service_discovery().await.unwrap();
let futures = uris
.into_iter()
.map(|path| Client::connect_uds(path, Duration::from_secs(5)));
let clients = join_all(futures).await;
Self::new(clients)
}
/// Returns a client connected to the given url. Requests exceeding timeout will fail.
pub async fn connect(uri: Uri, timeout: Duration) -> Self {
let master_client = Client::connect(uri, timeout).await;
Self::from_master_client(master_client).await
}
/// Returns a client connected to the given unix socket. Requests exceeding timeout will fail.
pub async fn connect_uds(path: String, timeout: Duration) -> Self {
let master_client = Client::connect_uds(path, timeout).await;
Self::from_master_client(master_client).await
}
pub async fn generate(
&self,
batch: Batch,
) -> Result<(Vec<FinishedGeneration>, Option<CacheEntry>)> {
let (response_tx, mut response_rx) = mpsc::channel(1);
self.request_tx
.send(Command::Generate(batch, response_tx))
.unwrap();
response_rx.recv().await.unwrap()
}
pub async fn generate_with_cache(
&self,
batch_cached: BatchCached,
) -> Result<(Vec<FinishedGeneration>, Option<CacheEntry>)> {
let (response_tx, mut response_rx) = mpsc::channel(1);
self.request_tx
.send(Command::GenerateWithCache(batch_cached, response_tx))
.unwrap();
response_rx.recv().await.unwrap()
}
pub async fn clear_cache(&self) -> Result<()> {
let (response_tx, mut response_rx) = mpsc::channel(1);
self.request_tx
.send(Command::ClearCache(response_tx))
.unwrap();
response_rx.recv().await.unwrap()
}
}

129
router/src/db.rs Normal file

@ -0,0 +1,129 @@
//! This code is massively inspired by Tokio mini-redis
use crate::GenerateRequest;
use bloom_inference_client::{Batch, ClientError, LogitsWarperParameters, Request};
use parking_lot::RwLock;
use std::collections::BTreeMap;
use std::sync::Arc;
use tokio::sync::oneshot::Sender;
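/// Shared request database: requests are stored under monotonically
/// increasing ids, and batches are cut from the contiguous range of ids
/// that has not been scheduled yet (`next_batch_start_id..`).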
#[derive(Debug, Clone)]
pub(crate) struct Db {
pub shared: Arc<Shared>,
}
#[derive(Debug)]
pub struct Shared {
state: RwLock<State>,
}
#[derive(Debug)]
struct State {
entries: BTreeMap<u64, (Request, Sender<Result<String, ClientError>>)>,
    /// Identifier to assign to the next appended request
    next_id: u64,
    /// Identifier to assign to the next batch
    next_batch_id: u64,
    /// Id of the first request that has not yet been put in a batch
    next_batch_start_id: u64,
}
impl Db {
pub(crate) fn new() -> Self {
let shared = Arc::new(Shared {
state: RwLock::new(State {
entries: BTreeMap::new(),
next_id: 0,
next_batch_id: 0,
next_batch_start_id: 0,
}),
});
Self { shared }
}
pub(crate) fn append(&self, request: GenerateRequest, sender: Sender<Result<String, ClientError>>) {
let mut state = self.shared.state.write();
let id = state.next_id;
state.next_id += 1;
let parameters = Some(LogitsWarperParameters {
temperature: request.parameters.temperature,
top_k: request.parameters.top_k,
top_p: request.parameters.top_p,
do_sample: request.parameters.do_sample,
});
let request = Request {
id,
inputs: request.inputs,
parameters,
max_new_tokens: request.parameters.max_new_tokens,
};
state.entries.insert(id, (request, sender));
}
pub(crate) fn remove(&self, id: &u64) -> Option<(Request, Sender<Result<String, ClientError>>)> {
let mut state = self.shared.state.write();
state.entries.remove(id)
}
pub(crate) fn len(&self) -> usize {
let state = self.shared.state.read();
state.entries.len()
}
fn next_requests(&self, max_size: usize) -> Option<(u64, Vec<Request>)> {
let state = self.shared.state.read();
let requests: Vec<Request> = state
.entries
.range(state.next_batch_start_id..)
.take(max_size)
.map(|(_, (request, _))| request.clone())
.collect();
if requests.is_empty() {
None
} else {
let last_id = requests.last().unwrap().id;
Some((last_id, requests))
}
}
pub(crate) fn next_batch(&self, max_size: usize) -> Option<Batch> {
if let Some((last_id, requests)) = self.next_requests(max_size) {
let mut state = self.shared.state.write();
let batch = Batch {
id: state.next_batch_id,
requests,
};
state.next_batch_start_id = last_id + 1;
state.next_batch_id += 1;
return Some(batch);
}
None
}
pub(crate) fn next_batch_minimum_size(
&self,
min_size: usize,
max_size: usize,
) -> Option<Batch> {
if let Some((last_id, requests)) = self.next_requests(max_size) {
if requests.len() >= min_size {
let mut state = self.shared.state.write();
let batch = Batch {
id: state.next_batch_id,
requests,
};
state.next_batch_start_id = last_id + 1;
state.next_batch_id += 1;
return Some(batch);
}
}
None
}
}

130
router/src/infer.rs Normal file

@ -0,0 +1,130 @@
use crate::{Db, GenerateRequest};
use bloom_inference_client::{Batch, BatchCached, CacheEntry, ClientError, FinishedGeneration, ShardedClient};
use std::sync::Arc;
use tokio::sync::{oneshot, Notify};
const MAX_LENGTH: usize = 128;
pub struct InferError {}
#[derive(Clone)]
pub(crate) struct Infer {
db: Db,
shared: Arc<Shared>,
}
struct Shared {
batching_task: Notify,
}
impl Infer {
pub(crate) fn new(client: ShardedClient) -> Self {
let db = Db::new();
let shared = Arc::new(Shared {
batching_task: Notify::new(),
});
tokio::spawn(batching_task(client, db.clone(), shared.clone()));
Self { db, shared }
}
pub(crate) async fn infer(&self, request: GenerateRequest) -> Result<String, InferError> {
if self.db.len() > MAX_LENGTH {
return Err(InferError {});
}
let (request_tx, request_rx) = oneshot::channel();
self.db.append(request, request_tx);
self.shared.batching_task.notify_waiters();
match request_rx.await.unwrap() {
Ok(output) => Ok(output),
Err(_) => Err(InferError {})
}
}
}
async fn batching_task(client: ShardedClient, db: Db, shared: Arc<Shared>) {
loop {
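        // Wait until at least one new request is in the db, then cut an
        // initial batch of up to 32 requests and prefill it. While requests
        // remain in the cache, keep decoding with the cached past, merging in
        // a freshly prefilled batch (16..=48 requests) whenever the running
        // batch has shrunk to 16 or fewer requests.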
shared.batching_task.notified().await;
if let Some(batch) = db.next_batch(32) {
let mut cache_entry = infer_batch(batch, &client, &db).await;
loop {
if let Some(entry) = cache_entry {
let mut batch_cached_ids = vec![entry.id];
let mut total_batch_size = entry.request_ids.len();
let mut max_sequence_length = entry.sequence_length;
let mut request_ids = entry.request_ids;
if total_batch_size <= 16 {
if let Some(batch) = db.next_batch_minimum_size(16, 48) {
let other_cache_entry = infer_batch(batch, &client, &db).await;
if let Some(entry) = other_cache_entry {
batch_cached_ids.push(entry.id);
total_batch_size += entry.request_ids.len();
max_sequence_length =
max_sequence_length.max(entry.sequence_length);
request_ids.extend(entry.request_ids.into_iter());
}
}
}
let batch_cached = BatchCached {
id: entry.id,
batch_cached_ids,
total_batch_size: total_batch_size as u32,
max_sequence_length,
request_ids,
};
cache_entry = infer_batch_cached(batch_cached, &client, &db).await;
} else {
break;
}
}
}
}
}
async fn infer_batch_cached(batch: BatchCached, client: &ShardedClient, db: &Db) -> Option<CacheEntry> {
match client.generate_with_cache(batch.clone()).await {
Ok((finished, cache_entry)) => {
send_finished(finished, db);
cache_entry
}
Err(err) => {
            tracing::error!("{:?}", err);
            send_error(err, batch.request_ids, db);
None
}
}
}
async fn infer_batch(batch: Batch, client: &ShardedClient, db: &Db) -> Option<CacheEntry> {
match client.generate(batch.clone()).await {
Ok((finished, cache_entry)) => {
send_finished(finished, db);
cache_entry
}
Err(err) => {
            tracing::error!("{:?}", err);
            send_error(err, batch.requests.into_iter().map(|req| req.id).collect(), db);
None
}
}
}
fn send_error(error: ClientError, request_ids: Vec<u64>, db: &Db) {
request_ids.into_iter().for_each(|id| {
let (_, response_tx) = db.remove(&id).unwrap();
response_tx.send(Err(error.clone())).unwrap_or(());
});
}
fn send_finished(finished: Vec<FinishedGeneration>, db: &Db) {
finished.into_iter().for_each(|output| {
let (_, response_tx) = db.remove(&output.id).unwrap();
response_tx.send(Ok(output.output)).unwrap_or(());
});
}

125
router/src/main.rs Normal file

@ -0,0 +1,125 @@
use tokio::time::Instant;
use poem::middleware::AddData;
use poem::web::Data;
use poem::{handler, listener::TcpListener, post, web::Json, EndpointExt, Result, Route, Server};
use bloom_inference_client::ShardedClient;
use serde::Deserialize;
use std::time::Duration;
use poem::http::StatusCode;
use tracing::instrument;
mod db;
use db::Db;
mod infer;
use infer::Infer;
#[derive(Clone, Debug, Deserialize)]
struct GenerateParameters {
#[serde(default = "default_temperature")]
temperature: f32,
#[serde(default = "default_top_k")]
top_k: u32,
#[serde(default = "default_top_p")]
top_p: f32,
#[serde(default = "default_do_sample")]
do_sample: bool,
#[serde(default = "default_max_new_tokens")]
max_new_tokens: u32,
}
fn default_temperature() -> f32 {
1.0
}
fn default_top_k() -> u32 {
0
}
fn default_top_p() -> f32 {
1.0
}
fn default_do_sample() -> bool {
false
}
fn default_max_new_tokens() -> u32 {
20
}
#[derive(Clone, Debug, Deserialize)]
struct GenerateRequest {
inputs: String,
#[serde(default = "default_parameters")]
parameters: GenerateParameters,
}
fn default_parameters() -> GenerateParameters {
GenerateParameters {
temperature: default_temperature(),
top_k: default_top_k(),
top_p: default_top_p(),
do_sample: default_do_sample(),
max_new_tokens: default_max_new_tokens(),
}
}
#[handler]
#[instrument(skip(infer), fields(time, time_per_token))]
async fn generate(
infer: Data<&Infer>,
req: Json<GenerateRequest>,
) -> Result<Json<serde_json::Value>> {
let start = Instant::now();
let output = infer
.infer(GenerateRequest {
inputs: req.inputs.clone(),
parameters: req.parameters.clone(),
})
.await;
match output {
Ok(generated_text) => {
tracing::Span::current().record("time", format!("{:?}", start.elapsed()));
tracing::Span::current().record("time_per_token", format!("{:?}", start.elapsed() / req.parameters.max_new_tokens));
tracing::info!("response: {}", generated_text);
Ok(Json(serde_json::json!({
"generated_text": generated_text,
})))
}
Err(_) => {
Err(poem::Error::from_status(StatusCode::INTERNAL_SERVER_ERROR))
}
}
}
#[tokio::main]
async fn main() -> Result<(), std::io::Error> {
tracing_subscriber::fmt::init();
let sharded_client =
ShardedClient::connect_uds("/tmp/bloom-inference-0".to_string(), Duration::from_secs(5))
.await;
sharded_client
.clear_cache()
.await
.expect("Unable to clear cache");
tracing::info!("Connected");
let infer = Infer::new(sharded_client);
let app = Route::new()
.at("/generate", post(generate))
.with(AddData::new(infer));
Server::new(TcpListener::bind("127.0.0.1:3000"))
.run(app)
.await
}

BIN
server/.DS_Store vendored Normal file

Binary file not shown.

20
server/Makefile Normal file

@ -0,0 +1,20 @@
gen-server:
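# protoc emits absolute imports (`import generate_pb2`); the sed below
# rewrites them to relative imports so the generated modules can be used
# as part of the bloom_inference package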
mkdir bloom_inference/pb || true
python -m grpc_tools.protoc -I../proto --python_out=bloom_inference/pb --grpc_python_out=bloom_inference/pb ../proto/generate.proto
find bloom_inference/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
touch bloom_inference/pb/__init__.py
unit-tests:
python -m pytest --cov=bloom_inference tests
unit-tests-reporting:
python -m pytest --junitxml=report.xml --cov=bloom_inference tests
pip-install:
pip install grpcio-tools
make gen-server
pip install .
install:
poetry install
make gen-server

15
server/README.md Normal file

@ -0,0 +1,15 @@
# BLOOM Inference Python gRPC Server
A Python gRPC server for BLOOM Inference
## Local Install (with poetry)
```shell
make install
```
## Local Install (with pip)
```shell
make pip-install
```

BIN
server/bloom_inference/.DS_Store vendored Normal file

Binary file not shown.

48
server/bloom_inference/cache.py Normal file

@ -0,0 +1,48 @@
import torch
from dataclasses import dataclass
from typing import Dict, Optional, List
from bloom_inference.pb import generate_pb2
from bloom_inference.utils import NextTokenChooser, StoppingCriteria
@dataclass
class CacheEntry:
batch_id: int
request_ids: List[int]
input_ids: Dict[str, torch.Tensor]
all_input_ids: List[torch.Tensor]
next_token_choosers: List[NextTokenChooser]
stopping_criterias: List[StoppingCriteria]
def __len__(self):
return len(self.request_ids)
def to_pb(self):
return generate_pb2.CacheEntry(
id=self.batch_id,
request_ids=self.request_ids,
sequence_length=max(len(entry) for entry in self.all_input_ids),
)
class Cache:
def __init__(self):
        self.cache: Dict[int, CacheEntry] = {}
    def pop(self, batch_id: int) -> Optional[CacheEntry]:
        return self.cache.pop(batch_id, None)
def set(self, entry: CacheEntry):
if entry is not None:
self.cache[entry.batch_id] = entry
    def delete(self, batch_id: int):
del self.cache[batch_id]
def clear(self):
self.cache.clear()
def __len__(self):
return len(self.cache.keys())

30
server/bloom_inference/main.py Normal file

@ -0,0 +1,30 @@
import typer
from pathlib import Path
from torch.distributed.launcher import launch_agent, LaunchConfig
from typing import Optional
from bloom_inference.server import serve
def main(
model_name: str,
num_gpus: int = 1,
shard_directory: Optional[Path] = None,
):
if num_gpus == 1:
serve(model_name, False, shard_directory)
else:
config = LaunchConfig(
min_nodes=1,
max_nodes=1,
nproc_per_node=num_gpus,
rdzv_backend="c10d",
max_restarts=0,
)
launch_agent(config, serve, [model_name, True, shard_directory])
if __name__ == "__main__":
typer.run(main)

428
server/bloom_inference/model.py Normal file

@ -0,0 +1,428 @@
import torch
import torch.distributed
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Optional, Dict
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers.modeling_utils import no_init_weights
from bloom_inference.cache import CacheEntry
from bloom_inference.pb import generate_pb2
from bloom_inference.shard_model import shard_model, match_suffix
from bloom_inference.utils import (
StoppingCriteria,
NextTokenChooser,
initialize_torch_distributed,
set_default_dtype,
)
torch.manual_seed(0)
@dataclass
class Batch:
batch_id: int
request_ids: List[int]
input_ids: Dict[str, torch.Tensor]
all_input_ids: List[torch.Tensor]
next_token_choosers: List[NextTokenChooser]
stopping_criterias: List[StoppingCriteria]
@classmethod
def from_batch_pb(
cls, pb: generate_pb2.Batch, tokenizer: AutoTokenizer, device: torch.device
) -> "Batch":
request_ids = []
inputs = []
next_token_choosers = []
stopping_criterias = []
# Parse batch
for r in pb.requests:
request_ids.append(r.id)
inputs.append(r.inputs)
next_token_choosers.append(
NextTokenChooser(
temperature=r.parameters.temperature,
top_k=r.parameters.top_k,
top_p=r.parameters.top_p,
do_sample=r.parameters.do_sample,
)
)
stopping_criterias.append(StoppingCriteria(max_new_tokens=r.max_new_tokens))
input_ids = tokenizer(inputs, return_tensors="pt", padding=True).to(device)
all_input_ids = input_ids["input_ids"].unsqueeze(-1)
return cls(
pb.id,
request_ids,
input_ids,
all_input_ids,
next_token_choosers,
stopping_criterias,
)
@classmethod
def from_cache_entry(cls, cache_entry: CacheEntry) -> "Batch":
return cls(
cache_entry.batch_id,
cache_entry.request_ids,
cache_entry.input_ids,
cache_entry.all_input_ids,
cache_entry.next_token_choosers,
cache_entry.stopping_criterias,
)
@classmethod
def from_batch_cached_pb(cls, pb: generate_pb2.BatchCached, cache) -> "Batch":
if len(pb.batch_cached_ids) == 1:
cache_entry = cache.pop(pb.batch_cached_ids[0])
if cache_entry is None:
raise ValueError(f"Batch ID {pb.batch_id} not found in cache")
return cls.from_cache_entry(cache_entry)
total_batch_size = pb.total_batch_size
max_sequence_length = pb.max_sequence_length
input_ids = {"input_ids": None, "attention_mask": None, "past_key_values": []}
request_ids = []
all_input_ids = []
next_token_choosers = []
stopping_criterias = []
start_index = 0
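        # Merge the cached batches into a single one: input_ids are copied
        # into a fresh buffer, attention masks are left-padded into
        # (total_batch_size, max_sequence_length), and past key/values are
        # copied into zero-padded buffers that are flattened back to
        # (batch * num_heads, ...) once the last batch has been merged.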
for i, batch_id in enumerate(pb.batch_cached_ids):
cache_entry = cache.pop(batch_id)
if cache_entry is None:
raise ValueError(f"Batch ID {batch_id} not found in cache")
request_ids.extend(cache_entry.request_ids)
all_input_ids.extend(cache_entry.all_input_ids)
next_token_choosers.extend(cache_entry.next_token_choosers)
stopping_criterias.extend(cache_entry.stopping_criterias)
batch_size = len(cache_entry.request_ids)
end_index = start_index + batch_size
sequence_length = max(len(entry) for entry in cache_entry.all_input_ids)
if input_ids["input_ids"] is None:
input_ids["input_ids"] = torch.empty(
(total_batch_size, 1),
dtype=cache_entry.input_ids["input_ids"].dtype,
device=cache_entry.input_ids["input_ids"].device,
)
input_ids["input_ids"][start_index:end_index] = cache_entry.input_ids[
"input_ids"
]
if input_ids["attention_mask"] is None:
input_ids["attention_mask"] = torch.zeros(
(total_batch_size, max_sequence_length),
dtype=cache_entry.input_ids["attention_mask"].dtype,
device=cache_entry.input_ids["attention_mask"].device,
)
input_ids["attention_mask"][
start_index:end_index, -sequence_length:
] = cache_entry.input_ids["attention_mask"][:, -sequence_length:]
for j, past in enumerate(cache_entry.input_ids["past_key_values"]):
# TODO: this could be done without the views by using indices
past_keys = past[0]
past_values = past[1]
_, head_dim, padded_sequence_length = past_keys.shape
past_keys = past_keys.view(
batch_size, -1, head_dim, padded_sequence_length
)
past_values = past_values.view(
batch_size, -1, padded_sequence_length, head_dim
)
num_heads = past_keys.shape[1]
if j == len(input_ids["past_key_values"]):
padded_past_keys = torch.zeros(
(
total_batch_size,
num_heads,
head_dim,
max_sequence_length - 1,
),
dtype=past_keys.dtype,
device=past_keys.device,
)
padded_past_values = torch.zeros(
(
total_batch_size,
num_heads,
max_sequence_length - 1,
head_dim,
),
dtype=past_values.dtype,
device=past_values.device,
)
input_ids["past_key_values"].append(
[padded_past_keys, padded_past_values]
)
input_ids["past_key_values"][j][0][
start_index:end_index, :, :, -(sequence_length - 1):
] = past_keys[:, :, :, -(sequence_length - 1):]
input_ids["past_key_values"][j][1][
start_index:end_index, :, -(sequence_length - 1):, :
] = past_values[:, :, -(sequence_length - 1):, :]
if (i + 1) == len(pb.batch_cached_ids):
input_ids["past_key_values"][j][0] = input_ids["past_key_values"][
j
][0].view(total_batch_size * num_heads, head_dim, -1)
input_ids["past_key_values"][j][1] = input_ids["past_key_values"][
j
][1].view(total_batch_size * num_heads, -1, head_dim)
start_index += batch_size
assert pb.request_ids == request_ids
return cls(
pb.id,
request_ids,
input_ids,
all_input_ids,
next_token_choosers,
stopping_criterias,
)
@dataclass
class FinishedGeneration:
    request_id: int
output: str
def to_pb(self) -> generate_pb2.FinishedGeneration:
return generate_pb2.FinishedGeneration(id=self.request_id, output=self.output)
class BLOOM:
def __init__(self, model_name: str):
if torch.cuda.is_available():
self.device = torch.device("cuda")
else:
self.device = torch.device("cpu")
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.model = (
AutoModelForCausalLM.from_pretrained(model_name).eval().to(self.device)
)
self.num_heads = self.model.base_model.num_heads
    def forward(self, input_ids, attention_mask, past_key_values: Optional[List] = None):
# Model Forward
return self.model.forward(
input_ids=input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
use_cache=True,
)
def generate_token(
self, batch: Batch
) -> Tuple[List[FinishedGeneration], Optional[CacheEntry]]:
with torch.no_grad():
outputs = self.forward(**batch.input_ids)
# List of indices to cache
cache_indices = []
cache_past_indices = []
# New input_ids for next forward; keep in cache
cache_next_input_ids = []
cache_all_input_ids = []
# Finished requests
finished_generations: List[FinishedGeneration] = []
# Zipped iterator
iterator = zip(
batch.request_ids,
outputs.logits,
batch.next_token_choosers,
batch.stopping_criterias,
batch.all_input_ids,
)
# For each member of the batch
for i, (
request_id,
logits,
next_token_chooser,
stopping_criteria,
all_tokens,
) in enumerate(iterator):
# Select next token
next_token = next_token_chooser(all_tokens, logits.unsqueeze(0)[:, -1])
# Append next token to all tokens
all_tokens = torch.cat([all_tokens, next_token])
# Evaluate stopping criteria
if stopping_criteria(all_tokens):
# Decode all tokens
output = self.tokenizer.decode(
all_tokens.squeeze(-1), skip_special_tokens=True
)
# Add to the list of finished generations with the original request id
finished_generations.append(FinishedGeneration(request_id, output))
# must be added to the cache
else:
cache_indices.append(i)
cache_past_indices.extend([j for j in range(i * self.num_heads, (i + 1) * self.num_heads)])
cache_next_input_ids.append(next_token)
cache_all_input_ids.append(all_tokens)
# No cache is needed, we finished all generations in the batch
if not cache_indices:
return finished_generations, None
        cache_input_ids = {"input_ids": torch.cat(cache_next_input_ids, dim=0)}
        # If at least one request finished, keep only the still-running
        # requests in the cached tensors
        if finished_generations:
# Apply indices to attention mask, past key values and other items that need to be cached
cache_input_ids["attention_mask"] = batch.input_ids["attention_mask"][
cache_indices
]
cache_input_ids["past_key_values"] = [
(keys[cache_past_indices], values[cache_past_indices])
for keys, values in outputs["past_key_values"]
]
cache_request_ids = [batch.request_ids[i] for i in cache_indices]
cache_next_token_choosers = [
batch.next_token_choosers[i] for i in cache_indices
]
cache_stopping_criterias = [
batch.stopping_criterias[i] for i in cache_indices
]
else:
cache_input_ids["attention_mask"] = batch.input_ids["attention_mask"]
cache_input_ids["past_key_values"] = outputs["past_key_values"]
cache_request_ids = batch.request_ids
cache_next_token_choosers = batch.next_token_choosers
cache_stopping_criterias = batch.stopping_criterias
# Update attention_mask with padding as we added a new token to input_ids
cache_input_ids["attention_mask"] = torch.cat(
[
cache_input_ids["attention_mask"],
torch.ones((cache_input_ids["attention_mask"].shape[0], 1)).to(
cache_input_ids["attention_mask"].device
),
],
dim=1,
)
cache_entry = CacheEntry(
batch.batch_id,
cache_request_ids,
cache_input_ids,
cache_all_input_ids,
cache_next_token_choosers,
cache_stopping_criterias,
)
return finished_generations, cache_entry
class BLOOMSharded(BLOOM):
def __init__(self, model_name: str, shard_directory: Path):
super(BLOOM, self).__init__()
self.process_group, self.rank, self.world_size = initialize_torch_distributed()
self.master = self.rank == 0
if torch.cuda.is_available():
self.device = torch.device(f"cuda:{self.rank}")
dtype = torch.bfloat16
else:
self.device = torch.device("cpu")
dtype = torch.float32
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
# shard state_dict
if self.master:
# TODO @thomasw21 do some caching
shard_state_dict_paths = shard_model(
model_name, shard_directory, tp_world_size=self.world_size, dtype=dtype
)
shard_state_dict_paths = [
str(path.absolute()) for path in shard_state_dict_paths
]
else:
shard_state_dict_paths = [None] * self.world_size
torch.distributed.broadcast_object_list(
shard_state_dict_paths, src=0, group=self.process_group
)
shard_state_dict_path = shard_state_dict_paths[self.rank]
config = AutoConfig.from_pretrained(
model_name, slow_but_exact=False, tp_parallel=True
)
config.pad_token_id = 3
# The flag below controls whether to allow TF32 on matmul. This flag defaults to False
# in PyTorch 1.12 and later.
torch.backends.cuda.matmul.allow_tf32 = True
# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
torch.backends.cudnn.allow_tf32 = True
with set_default_dtype(dtype):
with no_init_weights():
# we can probably set the device to `meta` here?
model = AutoModelForCausalLM.from_config(config).to(dtype)
torch.distributed.barrier(group=self.process_group)
# print_rank_0(f"Initialized model")
state_dict = torch.load(shard_state_dict_path)
# TODO @thomasw21: HACK in order to transpose all weight prior
for key in state_dict.keys():
do_transpose = False
if not match_suffix(key, "weight"):
continue
for potential_suffix in [
"self_attention.query_key_value.weight",
"self_attention.dense.weight",
"dense_h_to_4h.weight",
"dense_4h_to_h.weight",
]:
if match_suffix(key, potential_suffix):
do_transpose = True
if do_transpose:
state_dict[key] = state_dict[key].transpose(1, 0).contiguous()
model.load_state_dict(state_dict)
self.model = model.to(self.device).eval()
self.num_heads = config.n_head // self.process_group.size()
torch.distributed.barrier(group=self.process_group)
    def forward(self, input_ids, attention_mask, past_key_values: Optional[List] = None):
outputs = self.model.forward(
input_ids=input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
use_cache=True,
)
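        # Each rank only computes a vocabulary shard of the logits for the
        # last position; all-gather and concatenate them to recover the full
        # vocabulary before sampling.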
logits_shard = outputs.logits[:, -1, :].contiguous()
batch_size, vocab_shard_size = logits_shard.shape
vocab_size = self.world_size * vocab_shard_size
logits = [torch.empty_like(logits_shard) for _ in range(self.world_size)]
torch.distributed.all_gather(logits, logits_shard, group=self.process_group)
logits = torch.cat(logits, dim=1).view(batch_size, 1, vocab_size)
outputs.logits = logits
return outputs

43
server/bloom_inference/pb/generate_pb2.py Normal file

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: generate.proto
"""Generated protocol buffer code."""
from google.protobuf.internal import builder as _builder
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0egenerate.proto\x12\x0bgenerate.v1\"(\n\x18ServiceDiscoveryResponse\x12\x0c\n\x04urls\x18\x01 \x03(\t\"^\n\x16LogitsWarperParameters\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_k\x18\x02 \x01(\r\x12\r\n\x05top_p\x18\x03 \x01(\x02\x12\x11\n\tdo_sample\x18\x04 \x01(\x08\"v\n\x07Request\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06inputs\x18\x02 \x01(\t\x12\x37\n\nparameters\x18\x03 \x01(\x0b\x32#.generate.v1.LogitsWarperParameters\x12\x16\n\x0emax_new_tokens\x18\x04 \x01(\r\";\n\x05\x42\x61tch\x12\n\n\x02id\x18\x01 \x01(\x04\x12&\n\x08requests\x18\x02 \x03(\x0b\x32\x14.generate.v1.Request\"\x7f\n\x0b\x42\x61tchCached\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x13\n\x0brequest_ids\x18\x02 \x03(\x04\x12\x18\n\x10\x62\x61tch_cached_ids\x18\x03 \x03(\x04\x12\x18\n\x10total_batch_size\x18\x04 \x01(\r\x12\x1b\n\x13max_sequence_length\x18\x05 \x01(\r\"0\n\x12\x46inishedGeneration\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06output\x18\x02 \x01(\t\"F\n\nCacheEntry\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x13\n\x0brequest_ids\x18\x02 \x03(\x04\x12\x17\n\x0fsequence_length\x18\x03 \x01(\r\"\x80\x01\n\x08Response\x12\x31\n\x08\x66inished\x18\x01 \x03(\x0b\x32\x1f.generate.v1.FinishedGeneration\x12\x31\n\x0b\x63\x61\x63he_entry\x18\x02 \x01(\x0b\x32\x17.generate.v1.CacheEntryH\x00\x88\x01\x01\x42\x0e\n\x0c_cache_entry\"\x07\n\x05\x45mpty2\x94\x02\n\x0eTextGeneration\x12O\n\x10ServiceDiscovery\x12\x12.generate.v1.Empty\x1a%.generate.v1.ServiceDiscoveryResponse\"\x00\x12\x34\n\nClearCache\x12\x12.generate.v1.Empty\x1a\x12.generate.v1.Empty\x12\x35\n\x08Generate\x12\x12.generate.v1.Batch\x1a\x15.generate.v1.Response\x12\x44\n\x11GenerateWithCache\x12\x18.generate.v1.BatchCached\x1a\x15.generate.v1.Responseb\x06proto3')
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'generate_pb2', globals())
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_SERVICEDISCOVERYRESPONSE._serialized_start=31
_SERVICEDISCOVERYRESPONSE._serialized_end=71
_LOGITSWARPERPARAMETERS._serialized_start=73
_LOGITSWARPERPARAMETERS._serialized_end=167
_REQUEST._serialized_start=169
_REQUEST._serialized_end=287
_BATCH._serialized_start=289
_BATCH._serialized_end=348
_BATCHCACHED._serialized_start=350
_BATCHCACHED._serialized_end=477
_FINISHEDGENERATION._serialized_start=479
_FINISHEDGENERATION._serialized_end=527
_CACHEENTRY._serialized_start=529
_CACHEENTRY._serialized_end=599
_RESPONSE._serialized_start=602
_RESPONSE._serialized_end=730
_EMPTY._serialized_start=732
_EMPTY._serialized_end=739
_TEXTGENERATION._serialized_start=742
_TEXTGENERATION._serialized_end=1018
# @@protoc_insertion_point(module_scope)


@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: generate.proto
"""Generated protocol buffer code."""
from google.protobuf.internal import builder as _builder
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0egenerate.proto\x12\x0bgenerate.v1\"(\n\x18ServiceDiscoveryResponse\x12\x0c\n\x04urls\x18\x01 \x03(\t\"^\n\x16LogitsWarperParameters\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_k\x18\x02 \x01(\r\x12\r\n\x05top_p\x18\x03 \x01(\x02\x12\x11\n\tdo_sample\x18\x04 \x01(\x08\"v\n\x07Request\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06inputs\x18\x02 \x01(\t\x12\x37\n\nparameters\x18\x03 \x01(\x0b\x32#.generate.v1.LogitsWarperParameters\x12\x16\n\x0emax_new_tokens\x18\x04 \x01(\r\";\n\x05\x42\x61tch\x12\n\n\x02id\x18\x01 \x01(\x04\x12&\n\x08requests\x18\x02 \x03(\x0b\x32\x14.generate.v1.Request\"\x7f\n\x0b\x42\x61tchCached\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x13\n\x0brequest_ids\x18\x02 \x03(\x04\x12\x18\n\x10\x62\x61tch_cached_ids\x18\x03 \x03(\x04\x12\x18\n\x10total_batch_size\x18\x04 \x01(\r\x12\x1b\n\x13max_sequence_length\x18\x05 \x01(\r\"0\n\x12\x46inishedGeneration\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06output\x18\x02 \x01(\t\"F\n\nCacheEntry\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x13\n\x0brequest_ids\x18\x02 \x03(\x04\x12\x17\n\x0fsequence_length\x18\x03 \x01(\r\"\x80\x01\n\x08Response\x12\x31\n\x08\x66inished\x18\x01 \x03(\x0b\x32\x1f.generate.v1.FinishedGeneration\x12\x31\n\x0b\x63\x61\x63he_entry\x18\x02 \x01(\x0b\x32\x17.generate.v1.CacheEntryH\x00\x88\x01\x01\x42\x0e\n\x0c_cache_entry\"\x07\n\x05\x45mpty2\x94\x02\n\x0eTextGeneration\x12O\n\x10ServiceDiscovery\x12\x12.generate.v1.Empty\x1a%.generate.v1.ServiceDiscoveryResponse\"\x00\x12\x34\n\nClearCache\x12\x12.generate.v1.Empty\x1a\x12.generate.v1.Empty\x12\x35\n\x08Generate\x12\x12.generate.v1.Batch\x1a\x15.generate.v1.Response\x12\x44\n\x11GenerateWithCache\x12\x18.generate.v1.BatchCached\x1a\x15.generate.v1.Responseb\x06proto3')
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'generate_pb2', globals())
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_SERVICEDISCOVERYRESPONSE._serialized_start=31
_SERVICEDISCOVERYRESPONSE._serialized_end=71
_LOGITSWARPERPARAMETERS._serialized_start=73
_LOGITSWARPERPARAMETERS._serialized_end=167
_REQUEST._serialized_start=169
_REQUEST._serialized_end=287
_BATCH._serialized_start=289
_BATCH._serialized_end=348
_BATCHCACHED._serialized_start=350
_BATCHCACHED._serialized_end=477
_FINISHEDGENERATION._serialized_start=479
_FINISHEDGENERATION._serialized_end=527
_CACHEENTRY._serialized_start=529
_CACHEENTRY._serialized_end=599
_RESPONSE._serialized_start=602
_RESPONSE._serialized_end=730
_EMPTY._serialized_start=732
_EMPTY._serialized_end=739
_TEXTGENERATION._serialized_start=742
_TEXTGENERATION._serialized_end=1018
# @@protoc_insertion_point(module_scope)

169
server/bloom_inference/pb/generate_pb2_grpc.py Normal file

@ -0,0 +1,169 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
from . import generate_pb2 as generate__pb2
class TextGenerationStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.ServiceDiscovery = channel.unary_unary(
'/generate.v1.TextGeneration/ServiceDiscovery',
request_serializer=generate__pb2.Empty.SerializeToString,
response_deserializer=generate__pb2.ServiceDiscoveryResponse.FromString,
)
self.ClearCache = channel.unary_unary(
'/generate.v1.TextGeneration/ClearCache',
request_serializer=generate__pb2.Empty.SerializeToString,
response_deserializer=generate__pb2.Empty.FromString,
)
self.Generate = channel.unary_unary(
'/generate.v1.TextGeneration/Generate',
request_serializer=generate__pb2.Batch.SerializeToString,
response_deserializer=generate__pb2.Response.FromString,
)
self.GenerateWithCache = channel.unary_unary(
'/generate.v1.TextGeneration/GenerateWithCache',
request_serializer=generate__pb2.BatchCached.SerializeToString,
response_deserializer=generate__pb2.Response.FromString,
)
class TextGenerationServicer(object):
"""Missing associated documentation comment in .proto file."""
def ServiceDiscovery(self, request, context):
"""/ Service discovery
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def ClearCache(self, request, context):
"""/ Empties batch cache
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Generate(self, request, context):
"""/ Generate tokens for a batch without cache
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GenerateWithCache(self, request, context):
"""/ Generate tokens for a batch with cache
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_TextGenerationServicer_to_server(servicer, server):
rpc_method_handlers = {
'ServiceDiscovery': grpc.unary_unary_rpc_method_handler(
servicer.ServiceDiscovery,
request_deserializer=generate__pb2.Empty.FromString,
response_serializer=generate__pb2.ServiceDiscoveryResponse.SerializeToString,
),
'ClearCache': grpc.unary_unary_rpc_method_handler(
servicer.ClearCache,
request_deserializer=generate__pb2.Empty.FromString,
response_serializer=generate__pb2.Empty.SerializeToString,
),
'Generate': grpc.unary_unary_rpc_method_handler(
servicer.Generate,
request_deserializer=generate__pb2.Batch.FromString,
response_serializer=generate__pb2.Response.SerializeToString,
),
'GenerateWithCache': grpc.unary_unary_rpc_method_handler(
servicer.GenerateWithCache,
request_deserializer=generate__pb2.BatchCached.FromString,
response_serializer=generate__pb2.Response.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'generate.v1.TextGeneration', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class TextGeneration(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def ServiceDiscovery(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/ServiceDiscovery',
generate__pb2.Empty.SerializeToString,
generate__pb2.ServiceDiscoveryResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def ClearCache(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/ClearCache',
generate__pb2.Empty.SerializeToString,
generate__pb2.Empty.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Generate(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/Generate',
generate__pb2.Batch.SerializeToString,
generate__pb2.Response.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def GenerateWithCache(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/GenerateWithCache',
generate__pb2.BatchCached.SerializeToString,
generate__pb2.Response.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)


@ -0,0 +1,169 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import generate_pb2 as generate__pb2
class TextGenerationStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.ServiceDiscovery = channel.unary_unary(
'/generate.v1.TextGeneration/ServiceDiscovery',
request_serializer=generate__pb2.Empty.SerializeToString,
response_deserializer=generate__pb2.ServiceDiscoveryResponse.FromString,
)
self.ClearCache = channel.unary_unary(
'/generate.v1.TextGeneration/ClearCache',
request_serializer=generate__pb2.Empty.SerializeToString,
response_deserializer=generate__pb2.Empty.FromString,
)
self.Generate = channel.unary_unary(
'/generate.v1.TextGeneration/Generate',
request_serializer=generate__pb2.Batch.SerializeToString,
response_deserializer=generate__pb2.Response.FromString,
)
self.GenerateWithCache = channel.unary_unary(
'/generate.v1.TextGeneration/GenerateWithCache',
request_serializer=generate__pb2.BatchCached.SerializeToString,
response_deserializer=generate__pb2.Response.FromString,
)
class TextGenerationServicer(object):
"""Missing associated documentation comment in .proto file."""
def ServiceDiscovery(self, request, context):
"""/ Service discovery
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def ClearCache(self, request, context):
"""/ Empties batch cache
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Generate(self, request, context):
"""/ Generate tokens for a batch without cache
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GenerateWithCache(self, request, context):
"""/ Generate tokens for a batch with cache
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_TextGenerationServicer_to_server(servicer, server):
rpc_method_handlers = {
'ServiceDiscovery': grpc.unary_unary_rpc_method_handler(
servicer.ServiceDiscovery,
request_deserializer=generate__pb2.Empty.FromString,
response_serializer=generate__pb2.ServiceDiscoveryResponse.SerializeToString,
),
'ClearCache': grpc.unary_unary_rpc_method_handler(
servicer.ClearCache,
request_deserializer=generate__pb2.Empty.FromString,
response_serializer=generate__pb2.Empty.SerializeToString,
),
'Generate': grpc.unary_unary_rpc_method_handler(
servicer.Generate,
request_deserializer=generate__pb2.Batch.FromString,
response_serializer=generate__pb2.Response.SerializeToString,
),
'GenerateWithCache': grpc.unary_unary_rpc_method_handler(
servicer.GenerateWithCache,
request_deserializer=generate__pb2.BatchCached.FromString,
response_serializer=generate__pb2.Response.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'generate.v1.TextGeneration', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class TextGeneration(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def ServiceDiscovery(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/ServiceDiscovery',
generate__pb2.Empty.SerializeToString,
generate__pb2.ServiceDiscoveryResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def ClearCache(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/ClearCache',
generate__pb2.Empty.SerializeToString,
generate__pb2.Empty.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Generate(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/Generate',
generate__pb2.Batch.SerializeToString,
generate__pb2.Response.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def GenerateWithCache(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/generate.v1.TextGeneration/GenerateWithCache',
generate__pb2.BatchCached.SerializeToString,
generate__pb2.Response.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

124
server/bloom_inference/prepare_weights.py Normal file

@ -0,0 +1,124 @@
import torch
from pathlib import Path
from tqdm import tqdm
MODEL_NAME = "bigscience/bloom"
def match_suffix(text, suffix):
    return text.endswith(suffix)
def prepare_weights(hub_path: Path, save_path: Path, tp_world_size: int):
save_paths = [
save_path / f"{MODEL_NAME}_tp-rank-{tp_rank}-of-{tp_world_size}.pty"
for tp_rank in range(tp_world_size)
]
if all(save_path.exists() for save_path in save_paths):
print("Weights are already prepared")
return
shards_state_dicts = [{} for _ in range(tp_world_size)]
for weight_path in tqdm(hub_path.glob("*.bin")):
state_dict = torch.load(weight_path, map_location="cpu")
keys = list(state_dict.keys())
for state_name in keys:
state = state_dict[state_name]
if any(
match_suffix(state_name, candidate)
for candidate in [
"self_attention.query_key_value.weight",
"self_attention.query_key_value.bias",
"mlp.dense_h_to_4h.weight",
"mlp.dense_h_to_4h.bias",
"word_embeddings.weight",
"lm_head.weight",
]
):
output_size = state.shape[0]
assert output_size % tp_world_size == 0
block_size = output_size // tp_world_size
sharded_weights = torch.split(state, block_size, dim=0)
assert len(sharded_weights) == tp_world_size
for tp_rank, shard in enumerate(sharded_weights):
assert shard.shape[0] == block_size
if match_suffix(state_name, "lm_head.weight"):
shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
else:
shards_state_dicts[tp_rank][
"transformer." + state_name
] = shard.detach().clone()
elif any(
match_suffix(state_name, candidate)
for candidate in [
"self_attention.dense.weight",
"mlp.dense_4h_to_h.weight",
"lm_head.weight",
]
):
input_size = state.shape[1]
assert input_size % tp_world_size == 0
block_size = input_size // tp_world_size
sharded_weights = torch.split(state, block_size, dim=1)
assert len(sharded_weights) == tp_world_size
for tp_rank, shard in enumerate(sharded_weights):
assert shard.shape[1] == block_size
if match_suffix(state_name, "lm_head.weight"):
shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
else:
shards_state_dicts[tp_rank][
"transformer." + state_name
] = shard.detach().clone()
elif any(
match_suffix(state_name, candidate)
for candidate in [
"self_attention.dense.bias",
"mlp.dense_4h_to_h.bias",
]
):
shards_state_dicts[0][
"transformer." + state_name
] = state.detach().clone()
for tp_rank in range(1, tp_world_size):
shards_state_dicts[tp_rank][
"transformer." + state_name
] = torch.zeros_like(state)
else:
# We duplicate parameters across tp ranks
for tp_rank in range(tp_world_size):
shards_state_dicts[tp_rank][
"transformer." + state_name
] = state.detach().clone()
del state_dict[state_name] # delete key from state_dict
del state # delete tensor
# we save state_dict
    for save_path, shard_state_dict in zip(save_paths, shards_state_dicts):
        save_path.parent.mkdir(parents=True, exist_ok=True)
        if save_path.exists():
            print(f"Skipping {save_path} as it already exists")
        else:
            torch.save(shard_state_dict, save_path)
    return save_paths
if __name__ == "__main__":
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument("--hub-path", required=True, type=str)
parser.add_argument("--save-path", required=True, type=str)
parser.add_argument("--world-size", required=True, type=int)
args = parser.parse_args()
prepare_weights(Path(args.hub_path), Path(args.save_path), args.world_size)

91
server/bloom_inference/server.py Normal file

@ -0,0 +1,91 @@
import asyncio
from grpc import aio
from grpc_reflection.v1alpha import reflection
from pathlib import Path
from typing import Optional, List
from bloom_inference.cache import Cache
from bloom_inference.model import BLOOM, Batch, BLOOMSharded
from bloom_inference.pb import generate_pb2_grpc, generate_pb2
class TextGeneration(generate_pb2_grpc.TextGenerationServicer):
def __init__(self, model: BLOOM, cache: Cache, server_urls: List[str]):
self.cache = cache
self.model = model
self.server_urls = server_urls
async def ServiceDiscovery(self, request, context):
return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)
async def ClearCache(self, request, context):
self.cache.clear()
return generate_pb2.Empty()
async def Generate(self, request, context):
batch = Batch.from_batch_pb(request, self.model.tokenizer, self.model.device)
finished_generations, cache_entry = self.model.generate_token(batch)
self.cache.set(cache_entry)
return generate_pb2.Response(
finished=[
finished_generation.to_pb()
for finished_generation in finished_generations
],
cache_entry=cache_entry.to_pb() if cache_entry else None,
)
async def GenerateWithCache(self, request, context):
batch = Batch.from_batch_cached_pb(request, self.cache)
finished_generations, cache_entry = self.model.generate_token(batch)
self.cache.set(cache_entry)
return generate_pb2.Response(
finished=[
finished_generation.to_pb()
for finished_generation in finished_generations
],
cache_entry=cache_entry.to_pb() if cache_entry else None,
)
def serve(model_name, sharded, shard_directory):
async def serve_inner(
model_name: str,
sharded: bool = False,
shard_directory: Optional[Path] = None,
):
unix_socket_template = "unix:///tmp/bloom-inference-{}"
if sharded:
if shard_directory is None:
raise ValueError("shard_directory must be set when sharded is True")
model = BLOOMSharded(model_name, shard_directory)
server_urls = [
unix_socket_template.format(rank) for rank in range(model.world_size)
]
local_url = unix_socket_template.format(model.rank)
else:
model = BLOOM(model_name)
local_url = unix_socket_template.format(0)
server_urls = [local_url]
server = aio.server()
generate_pb2_grpc.add_TextGenerationServicer_to_server(
TextGeneration(model, Cache(), server_urls), server
)
SERVICE_NAMES = (
generate_pb2.DESCRIPTOR.services_by_name["TextGeneration"].full_name,
reflection.SERVICE_NAME,
)
reflection.enable_server_reflection(SERVICE_NAMES, server)
server.add_insecure_port(local_url)
await server.start()
print("Server started at {}".format(local_url))
await server.wait_for_termination()
asyncio.run(serve_inner(model_name, sharded, shard_directory))
if __name__ == "__main__":
serve("bigscience/bloom-560m", True, Path("/tmp/models"))

View File

@ -0,0 +1,102 @@
from pathlib import Path
import torch
from torch import nn
from transformers import AutoModelForCausalLM
def match_suffix(text, suffix):
    return text.endswith(suffix)
def shard_model(model_name: str, path: Path, tp_world_size: int, dtype: torch.dtype):
"""BLOOM specific sharding mechanism"""
save_paths = [
path / f"{model_name}_tp-rank-{tp_rank}-of-{tp_world_size}.pty"
for tp_rank in range(tp_world_size)
]
if all(save_path.exists() for save_path in save_paths):
print("Loading already cached values")
return save_paths
model: nn.Module = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype=dtype, local_files_only=True
)
shards_state_dicts = [{} for _ in range(tp_world_size)]
state_dict = model.state_dict()
keys = list(state_dict.keys())
for state_name in keys:
print(state_name)
state = state_dict[state_name]
if any(
match_suffix(state_name, candidate)
for candidate in [
"self_attention.query_key_value.weight",
"self_attention.query_key_value.bias",
"mlp.dense_h_to_4h.weight",
"mlp.dense_h_to_4h.bias",
"transformer.word_embeddings.weight",
"lm_head.weight",
]
):
output_size = state.shape[0]
assert output_size % tp_world_size == 0
block_size = output_size // tp_world_size
sharded_weights = torch.split(state, block_size, dim=0)
assert len(sharded_weights) == tp_world_size
for tp_rank, shard in enumerate(sharded_weights):
assert shard.shape[0] == block_size
shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
        elif any(
            match_suffix(state_name, candidate)
            for candidate in [
                "self_attention.dense.weight",
                "mlp.dense_4h_to_h.weight",
                # "lm_head.weight" is already caught by the dim=0 branch above,
                # so it is intentionally absent here
            ]
        ):
input_size = state.shape[1]
assert input_size % tp_world_size == 0
block_size = input_size // tp_world_size
sharded_weights = torch.split(state, block_size, dim=1)
assert len(sharded_weights) == tp_world_size
for tp_rank, shard in enumerate(sharded_weights):
assert shard.shape[1] == block_size
shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
elif any(
match_suffix(state_name, candidate)
for candidate in [
"self_attention.dense.bias",
"mlp.dense_4h_to_h.bias",
]
):
shards_state_dicts[0][state_name] = state.detach().clone()
for tp_rank in range(1, tp_world_size):
shards_state_dicts[tp_rank][state_name] = torch.zeros_like(state)
else:
# We duplicate parameters across tp ranks
for tp_rank in range(tp_world_size):
shards_state_dicts[tp_rank][state_name] = state.detach().clone()
del state_dict[state_name] # delete key from state_dict
del state # delete tensor
    # save one state dict per tensor-parallel rank
    for save_path, shard_state_dict in zip(save_paths, shards_state_dicts):
        save_path.parent.mkdir(parents=True, exist_ok=True)
        torch.save(shard_state_dict, save_path)
    return save_paths
if __name__ == "__main__":
model_name = "bigscience/bloom"
save_path = Path("/data/shards")
tp_world_size = 8
dtype = torch.bfloat16
shard_model(model_name, save_path, tp_world_size=tp_world_size, dtype=dtype)
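In `shard_model`, splitting on `dim=0` shards a layer's outputs (concatenated across ranks at runtime) while `dim=1` shards its inputs (summed across ranks). A quick check of the `dim=0` direction, with illustrative shapes:

```python
# Column-parallel check: concatenating per-rank outputs recovers the full layer.
import torch

tp = 4
torch.manual_seed(0)
weight = torch.randn(16, 8)  # e.g. a dense_h_to_4h-style weight, split along dim 0
x = torch.randn(2, 8)

shards = torch.split(weight, weight.shape[0] // tp, dim=0)
per_rank = [x @ shard.T for shard in shards]  # each rank computes a slice of the outputs
torch.testing.assert_close(torch.cat(per_rank, dim=1), x @ weight.T)
```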

View File

@ -0,0 +1,95 @@
import os
import contextlib
import torch
import torch.distributed
from transformers.generation_logits_process import (
LogitsProcessorList,
TemperatureLogitsWarper,
TopPLogitsWarper,
TopKLogitsWarper,
)
class Sampling:
def __call__(self, logits):
probs = torch.nn.functional.softmax(logits, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
return next_tokens
class Greedy:
def __call__(self, logits):
return logits.argmax(dim=-1)
class NextTokenChooser:
def __init__(self, temperature=1.0, top_k=None, top_p=None, do_sample=False):
warpers = LogitsProcessorList()
# the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
# all samplers can be found in `generation_utils_samplers.py`
sampling = do_sample
if temperature is not None and temperature != 1.0:
temperature = float(temperature)
warpers.append(TemperatureLogitsWarper(temperature))
sampling = True
if top_k is not None and top_k != 0:
warpers.append(TopKLogitsWarper(top_k=top_k))
sampling = True
if top_p is not None and top_p < 1.0:
warpers.append(TopPLogitsWarper(top_p=top_p))
sampling = True
self.warpers = warpers
self.choice = Sampling() if sampling else Greedy()
def __call__(self, input_ids, scores):
scores = self.warpers(input_ids, scores)
next_ids = self.choice(scores)
return next_ids.unsqueeze(-1)
class StoppingCriteria:
def __init__(self, max_new_tokens=20):
self.max_new_tokens = max_new_tokens
self.current_tokens = 0
    def __call__(self, all_ids):
        self.current_tokens += 1
        return self.current_tokens >= self.max_new_tokens
def initialize_torch_distributed():
rank = int(os.getenv("RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
    if torch.cuda.is_available():
        # Pin each process to its own GPU before initializing `torch.distributed`.
        assert world_size <= torch.cuda.device_count(), "Each process is one gpu"
        device = rank % torch.cuda.device_count()
        torch.cuda.set_device(device)
        backend = "nccl"
else:
backend = "gloo"
    # Initialize the process group; all ranks rendezvous on a fixed local port.
torch.distributed.init_process_group(
backend=backend,
world_size=world_size,
rank=rank,
init_method="tcp://localhost:6000",
)
return torch.distributed.distributed_c10d._get_default_group(), rank, world_size
@contextlib.contextmanager
def set_default_dtype(dtype):
saved_dtype = torch.get_default_dtype()
torch.set_default_dtype(dtype)
try:
yield
finally:
torch.set_default_dtype(saved_dtype)
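`NextTokenChooser` and `StoppingCriteria` together drive one token-by-token sampling loop. A toy usage with random logits (the vocabulary size below is a stand-in, not the model's):

```python
# Toy decode loop exercising the two helpers above.
import torch

chooser = NextTokenChooser(temperature=0.8, top_k=50, do_sample=True)
stop = StoppingCriteria(max_new_tokens=8)

input_ids = torch.zeros(1, 1, dtype=torch.long)  # a dummy one-token prompt
while True:
    logits = torch.randn(1, 32000)  # stand-in for the model's last-token logits
    next_ids = chooser(input_ids, logits)
    input_ids = torch.cat([input_ids, next_ids], dim=1)
    if stop(input_ids):
        break
print(input_ids.shape)  # torch.Size([1, 9]): the prompt token plus 8 generated tokens
```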

480 server/poetry.lock generated Normal file
View File

@ -0,0 +1,480 @@
[[package]]
name = "accelerate"
version = "0.12.0"
description = "Accelerate"
category = "main"
optional = false
python-versions = ">=3.7.0"
[package.dependencies]
numpy = ">=1.17"
packaging = ">=20.0"
psutil = "*"
pyyaml = "*"
torch = ">=1.4.0"
[package.extras]
dev = ["black (>=22.0,<23.0)", "datasets", "deepspeed (<0.7.0)", "evaluate", "flake8 (>=3.8.3)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "scipy", "sklearn", "tqdm", "transformers"]
quality = ["black (>=22.0,<23.0)", "flake8 (>=3.8.3)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)"]
sagemaker = ["sagemaker"]
test_dev = ["datasets", "deepspeed (<0.7.0)", "evaluate", "scipy", "sklearn", "tqdm", "transformers"]
test_prod = ["parameterized", "pytest", "pytest-subtests", "pytest-xdist"]
test_trackers = ["comet-ml", "tensorboard", "wandb"]
testing = ["datasets", "deepspeed (<0.7.0)", "evaluate", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "scipy", "sklearn", "tqdm", "transformers"]
[[package]]
name = "click"
version = "8.1.3"
description = "Composable command line interface toolkit"
category = "main"
optional = false
python-versions = ">=3.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.5"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "grpcio"
version = "1.49.1"
description = "HTTP/2-based RPC framework"
category = "main"
optional = false
python-versions = ">=3.7"
[package.dependencies]
six = ">=1.5.2"
[package.extras]
protobuf = ["grpcio-tools (>=1.49.1)"]
[[package]]
name = "grpcio-reflection"
version = "1.49.1"
description = "Standard Protobuf Reflection Service for gRPC"
category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
grpcio = ">=1.49.1"
protobuf = ">=4.21.3"
[[package]]
name = "grpcio-tools"
version = "1.49.1"
description = "Protobuf code generator for gRPC"
category = "dev"
optional = false
python-versions = ">=3.7"
[package.dependencies]
grpcio = ">=1.49.1"
protobuf = ">=4.21.3,<5.0dev"
setuptools = "*"
[[package]]
name = "numpy"
version = "1.23.3"
description = "NumPy is the fundamental package for array computing with Python."
category = "main"
optional = false
python-versions = ">=3.8"
[[package]]
name = "packaging"
version = "21.3"
description = "Core utilities for Python packages"
category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
pyparsing = ">=2.0.2,<3.0.5 || >3.0.5"
[[package]]
name = "protobuf"
version = "4.21.7"
description = ""
category = "main"
optional = false
python-versions = ">=3.7"
[[package]]
name = "psutil"
version = "5.9.2"
description = "Cross-platform lib for process and system monitoring in Python."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[package.extras]
test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
[[package]]
name = "pyparsing"
version = "3.0.9"
description = "pyparsing module - Classes and methods to define and execute parsing grammars"
category = "main"
optional = false
python-versions = ">=3.6.8"
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "PyYAML"
version = "6.0"
description = "YAML parser and emitter for Python"
category = "main"
optional = false
python-versions = ">=3.6"
[[package]]
name = "setuptools"
version = "65.4.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
category = "dev"
optional = false
python-versions = ">=3.7"
[package.extras]
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"
version = "1.16.0"
description = "Python 2 and 3 compatibility utilities"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "torch"
version = "1.12.1"
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
category = "main"
optional = false
python-versions = ">=3.7.0"
[package.dependencies]
typing-extensions = "*"
[[package]]
name = "typer"
version = "0.6.1"
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
click = ">=7.1.1,<9.0.0"
[package.extras]
all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"]
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
[[package]]
name = "typing-extensions"
version = "4.3.0"
description = "Backported and Experimental Type Hints for Python 3.7+"
category = "main"
optional = false
python-versions = ">=3.7"
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
content-hash = "cedd0aebeb3731e2bbddf017a2ee6074c285866354272f8dfe930e9606437a25"
[metadata.files]
accelerate = [
{file = "accelerate-0.12.0-py3-none-any.whl", hash = "sha256:7742ca5c9f15dd1e0a283305599c196e260af4717a561d1f544aeab27d828af6"},
{file = "accelerate-0.12.0.tar.gz", hash = "sha256:e8b119c94fac31877620d5f9de311164ec81fa9dc9e175f0d0d4f50fc8d79473"},
]
click = [
{file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
{file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
]
colorama = [
{file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"},
{file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"},
]
grpcio = [
{file = "grpcio-1.49.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:fd86040232e805b8e6378b2348c928490ee595b058ce9aaa27ed8e4b0f172b20"},
{file = "grpcio-1.49.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6fd0c9cede9552bf00f8c5791d257d5bf3790d7057b26c59df08be5e7a1e021d"},
{file = "grpcio-1.49.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:d0d402e158d4e84e49c158cb5204119d55e1baf363ee98d6cb5dce321c3a065d"},
{file = "grpcio-1.49.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:822ceec743d42a627e64ea266059a62d214c5a3cdfcd0d7fe2b7a8e4e82527c7"},
{file = "grpcio-1.49.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2106d9c16527f0a85e2eea6e6b91a74fc99579c60dd810d8690843ea02bc0f5f"},
{file = "grpcio-1.49.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:52dd02b7e7868233c571b49bc38ebd347c3bb1ff8907bb0cb74cb5f00c790afc"},
{file = "grpcio-1.49.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:120fecba2ec5d14b5a15d11063b39783fda8dc8d24addd83196acb6582cabd9b"},
{file = "grpcio-1.49.1-cp310-cp310-win32.whl", hash = "sha256:f1a3b88e3c53c1a6e6bed635ec1bbb92201bb6a1f2db186179f7f3f244829788"},
{file = "grpcio-1.49.1-cp310-cp310-win_amd64.whl", hash = "sha256:a7d0017b92d3850abea87c1bdec6ea41104e71c77bca44c3e17f175c6700af62"},
{file = "grpcio-1.49.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:9fb17ff8c0d56099ac6ebfa84f670c5a62228d6b5c695cf21c02160c2ac1446b"},
{file = "grpcio-1.49.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:075f2d06e3db6b48a2157a1bcd52d6cbdca980dd18988fe6afdb41795d51625f"},
{file = "grpcio-1.49.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46d93a1b4572b461a227f1db6b8d35a88952db1c47e5fadcf8b8a2f0e1dd9201"},
{file = "grpcio-1.49.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc79b2b37d779ac42341ddef40ad5bf0966a64af412c89fc2b062e3ddabb093f"},
{file = "grpcio-1.49.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5f8b3a971c7820ea9878f3fd70086240a36aeee15d1b7e9ecbc2743b0e785568"},
{file = "grpcio-1.49.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49b301740cf5bc8fed4fee4c877570189ae3951432d79fa8e524b09353659811"},
{file = "grpcio-1.49.1-cp311-cp311-win32.whl", hash = "sha256:1c66a25afc6c71d357867b341da594a5587db5849b48f4b7d5908d236bb62ede"},
{file = "grpcio-1.49.1-cp311-cp311-win_amd64.whl", hash = "sha256:6b6c3a95d27846f4145d6967899b3ab25fffc6ae99544415e1adcacef84842d2"},
{file = "grpcio-1.49.1-cp37-cp37m-linux_armv7l.whl", hash = "sha256:1cc400c8a2173d1c042997d98a9563e12d9bb3fb6ad36b7f355bc77c7663b8af"},
{file = "grpcio-1.49.1-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:34f736bd4d0deae90015c0e383885b431444fe6b6c591dea288173df20603146"},
{file = "grpcio-1.49.1-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:196082b9c89ebf0961dcd77cb114bed8171964c8e3063b9da2fb33536a6938ed"},
{file = "grpcio-1.49.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c9f89c42749890618cd3c2464e1fbf88446e3d2f67f1e334c8e5db2f3272bbd"},
{file = "grpcio-1.49.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64419cb8a5b612cdb1550c2fd4acbb7d4fb263556cf4625f25522337e461509e"},
{file = "grpcio-1.49.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:8a5272061826e6164f96e3255405ef6f73b88fd3e8bef464c7d061af8585ac62"},
{file = "grpcio-1.49.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ea9d0172445241ad7cb49577314e39d0af2c5267395b3561d7ced5d70458a9f3"},
{file = "grpcio-1.49.1-cp37-cp37m-win32.whl", hash = "sha256:2070e87d95991473244c72d96d13596c751cb35558e11f5df5414981e7ed2492"},
{file = "grpcio-1.49.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fcedcab49baaa9db4a2d240ac81f2d57eb0052b1c6a9501b46b8ae912720fbf"},
{file = "grpcio-1.49.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:afbb3475cf7f4f7d380c2ca37ee826e51974f3e2665613996a91d6a58583a534"},
{file = "grpcio-1.49.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:a4f9ba141380abde6c3adc1727f21529137a2552002243fa87c41a07e528245c"},
{file = "grpcio-1.49.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:cf0a1fb18a7204b9c44623dfbd1465b363236ce70c7a4ed30402f9f60d8b743b"},
{file = "grpcio-1.49.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:17bb6fe72784b630728c6cff9c9d10ccc3b6d04e85da6e0a7b27fb1d135fac62"},
{file = "grpcio-1.49.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18305d5a082d1593b005a895c10041f833b16788e88b02bb81061f5ebcc465df"},
{file = "grpcio-1.49.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b6a1b39e59ac5a3067794a0e498911cf2e37e4b19ee9e9977dc5e7051714f13f"},
{file = "grpcio-1.49.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e20d59aafc086b1cc68400463bddda6e41d3e5ed30851d1e2e0f6a2e7e342d3"},
{file = "grpcio-1.49.1-cp38-cp38-win32.whl", hash = "sha256:e1e83233d4680863a421f3ee4a7a9b80d33cd27ee9ed7593bc93f6128302d3f2"},
{file = "grpcio-1.49.1-cp38-cp38-win_amd64.whl", hash = "sha256:221d42c654d2a41fa31323216279c73ed17d92f533bc140a3390cc1bd78bf63c"},
{file = "grpcio-1.49.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:fa9e6e61391e99708ac87fc3436f6b7b9c6b845dc4639b406e5e61901e1aacde"},
{file = "grpcio-1.49.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:9b449e966ef518ce9c860d21f8afe0b0f055220d95bc710301752ac1db96dd6a"},
{file = "grpcio-1.49.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:aa34d2ad9f24e47fa9a3172801c676e4037d862247e39030165fe83821a7aafd"},
{file = "grpcio-1.49.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5207f4eed1b775d264fcfe379d8541e1c43b878f2b63c0698f8f5c56c40f3d68"},
{file = "grpcio-1.49.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b24a74651438d45619ac67004638856f76cc13d78b7478f2457754cbcb1c8ad"},
{file = "grpcio-1.49.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:fe763781669790dc8b9618e7e677c839c87eae6cf28b655ee1fa69ae04eea03f"},
{file = "grpcio-1.49.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2f2ff7ba0f8f431f32d4b4bc3a3713426949d3533b08466c4ff1b2b475932ca8"},
{file = "grpcio-1.49.1-cp39-cp39-win32.whl", hash = "sha256:08ff74aec8ff457a89b97152d36cb811dcc1d17cd5a92a65933524e363327394"},
{file = "grpcio-1.49.1-cp39-cp39-win_amd64.whl", hash = "sha256:274ffbb39717918c514b35176510ae9be06e1d93121e84d50b350861dcb9a705"},
{file = "grpcio-1.49.1.tar.gz", hash = "sha256:d4725fc9ec8e8822906ae26bb26f5546891aa7fbc3443de970cc556d43a5c99f"},
]
grpcio-reflection = [
{file = "grpcio-reflection-1.49.1.tar.gz", hash = "sha256:b755dfe61d5255a02fb8d0d845bd0027847dee68bf0763a2b286d664ed07ec4d"},
{file = "grpcio_reflection-1.49.1-py3-none-any.whl", hash = "sha256:70a325a83c1c1ab583d368711e5733cbef5e068ad2c17cbe77df6e47e0311d1f"},
]
grpcio-tools = [
{file = "grpcio-tools-1.49.1.tar.gz", hash = "sha256:84cc64e5b46bad43d5d7bd2fd772b656eba0366961187a847e908e2cb735db91"},
{file = "grpcio_tools-1.49.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:2dfb6c7ece84d46bd690b23d3e060d18115c8bc5047d2e8a33e6747ed323a348"},
{file = "grpcio_tools-1.49.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:8f452a107c054a04db2570f7851a07f060313c6e841b0d394ce6030d598290e6"},
{file = "grpcio_tools-1.49.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:6a198871b582287213c4d70792bf275e1d7cf34eed1d019f534ddf4cd15ab039"},
{file = "grpcio_tools-1.49.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a0cca67a7d0287bdc855d81fdd38dc949c4273273a74f832f9e520abe4f20bc6"},
{file = "grpcio_tools-1.49.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdaff4c89eecb37c247b93025410db68114d97fa093cbb028e9bd7cda5912473"},
{file = "grpcio_tools-1.49.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bb8773118ad315db317d7b22b5ff75d649ca20931733281209e7cbd8c0fad53e"},
{file = "grpcio_tools-1.49.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7cc5534023735b8a8f56760b7c533918f874ce5a9064d7c5456d2709ae2b31f9"},
{file = "grpcio_tools-1.49.1-cp310-cp310-win32.whl", hash = "sha256:d277642acbe305f5586f9597b78fb9970d6633eb9f89c61e429c92c296c37129"},
{file = "grpcio_tools-1.49.1-cp310-cp310-win_amd64.whl", hash = "sha256:eed599cf08fc1a06c72492d3c5750c32f58de3750eddd984af1f257c14326701"},
{file = "grpcio_tools-1.49.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:9e5c13809ab2f245398e8446c4c3b399a62d591db651e46806cccf52a700452e"},
{file = "grpcio_tools-1.49.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:ab3d0ee9623720ee585fdf3753b3755d3144a4a8ae35bca8e3655fa2f41056be"},
{file = "grpcio_tools-1.49.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ba87e3512bc91d78bf9febcfb522eadda171d2d4ddaf886066b0f01aa4929ad"},
{file = "grpcio_tools-1.49.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e13b3643e7577a3ec13b79689eb4d7548890b1e104c04b9ed6557a3c3dd452"},
{file = "grpcio_tools-1.49.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:324f67d9cb4b7058b6ce45352fb64c20cc1fa04c34d97ad44772cfe6a4ae0cf5"},
{file = "grpcio_tools-1.49.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a64bab81b220c50033f584f57978ebbea575f09c1ccee765cd5c462177988098"},
{file = "grpcio_tools-1.49.1-cp311-cp311-win32.whl", hash = "sha256:f632d376f92f23e5931697a3acf1b38df7eb719774213d93c52e02acd2d529ac"},
{file = "grpcio_tools-1.49.1-cp311-cp311-win_amd64.whl", hash = "sha256:28ff2b978d9509474928b9c096a0cce4eaa9c8f7046136aee1545f6211ed8126"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-linux_armv7l.whl", hash = "sha256:46afd3cb7e555187451a5d283f108cdef397952a662cb48680afc615b158864a"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:9284568b728e41fa8f7e9c2e7399545d605f75d8072ef0e9aa2a05655cb679eb"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:aa34442cf787732cb41f2aa6172007e24f480b8b9d3dc5166de80d63e9072ea4"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3b8c9eb5a4250905414cd53a68caea3eb8f0c515aadb689e6e81b71ebe9ab5c6"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab15db024051bf21feb21c29cb2c3ea0a2e4f5cf341d46ef76e17fcf6aaef164"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:502084b622f758bef620a9107c2db9fcdf66d26c7e0e481d6bb87db4dc917d70"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4085890b77c640085f82bf1e90a0ea166ce48000bc2f5180914b974783c9c0a8"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-win32.whl", hash = "sha256:da0edb984699769ce02e18e3392d54b59a7a3f93acd285a68043f5bde4fc028e"},
{file = "grpcio_tools-1.49.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9887cd622770271101a7dd1832845d64744c3f88fd11ccb2620394079197a42e"},
{file = "grpcio_tools-1.49.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:8440fe7dae6a40c279e3a24b82793735babd38ecbb0d07bb712ff9c8963185d9"},
{file = "grpcio_tools-1.49.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:b5de2bb7dd6b6231da9b1556ade981513330b740e767f1d902c71ceee0a7d196"},
{file = "grpcio_tools-1.49.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:1e6f06a763aea7836b63d9c117347f2bf7038008ceef72758815c9e09c5fb1fc"},
{file = "grpcio_tools-1.49.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e31562f90120318c5395aabec0f2f69ad8c14b6676996b7730d9d2eaf9415d57"},
{file = "grpcio_tools-1.49.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49ef9a4e389a618157a9daa9fafdfeeaef1ece9adda7f50f85db928f24d4b3e8"},
{file = "grpcio_tools-1.49.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b384cb8e8d9bcb55ee8f9b064374561c7a1a05d848249581403d36fc7060032f"},
{file = "grpcio_tools-1.49.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:73732f77943ac3e898879cbb29c27253aa3c47566b8a59780fd24c6a54de1b66"},
{file = "grpcio_tools-1.49.1-cp38-cp38-win32.whl", hash = "sha256:b594b2745a5ba9e7a76ce561bc5ab40bc65bb44743c505529b1e4f12af29104d"},
{file = "grpcio_tools-1.49.1-cp38-cp38-win_amd64.whl", hash = "sha256:680fbc88f8709ddcabb88f86749f2d8e429160890cff2c70680880a6970d4eef"},
{file = "grpcio_tools-1.49.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:e8c3869121860f6767eedb7d24fc54dfd71e737fdfbb26e1334684606f3274fd"},
{file = "grpcio_tools-1.49.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:73e9d7c886ba10e20c97d1dab0ff961ba5800757ae5e31be21b1cda8130c52f8"},
{file = "grpcio_tools-1.49.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:1760de2dd2c4f08de87b039043a4797f3c17193656e7e3eb84e92f0517083c0c"},
{file = "grpcio_tools-1.49.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd4b1e216dd04d9245ee8f4e601a1f98c25e6e417ea5cf8d825c50589a8b447e"},
{file = "grpcio_tools-1.49.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c28751ab5955cae563d07677e799233f0fe1c0fc49d9cbd61ff1957e83617f"},
{file = "grpcio_tools-1.49.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c24239c3ee9ed16314c14b4e24437b5079ebc344f343f33629a582f8699f583b"},
{file = "grpcio_tools-1.49.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:892d3dacf1942820f0b7a868a30e6fbcdf5bec08543b682c7274b0101cee632d"},
{file = "grpcio_tools-1.49.1-cp39-cp39-win32.whl", hash = "sha256:704d21509ec06efc9d034dbe70e7152715aac004941f4f0f553cf3a0aff15bd5"},
{file = "grpcio_tools-1.49.1-cp39-cp39-win_amd64.whl", hash = "sha256:1efa0c221c719433f441ac0e026fc3c4dbc9a1a08a552ecdc707775e2f2fbbae"},
]
numpy = [
{file = "numpy-1.23.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c9f707b5bb73bf277d812ded9896f9512a43edff72712f31667d0a8c2f8e71ee"},
{file = "numpy-1.23.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffcf105ecdd9396e05a8e58e81faaaf34d3f9875f137c7372450baa5d77c9a54"},
{file = "numpy-1.23.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ea3f98a0ffce3f8f57675eb9119f3f4edb81888b6874bc1953f91e0b1d4f440"},
{file = "numpy-1.23.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004f0efcb2fe1c0bd6ae1fcfc69cc8b6bf2407e0f18be308612007a0762b4089"},
{file = "numpy-1.23.3-cp310-cp310-win32.whl", hash = "sha256:98dcbc02e39b1658dc4b4508442a560fe3ca5ca0d989f0df062534e5ca3a5c1a"},
{file = "numpy-1.23.3-cp310-cp310-win_amd64.whl", hash = "sha256:39a664e3d26ea854211867d20ebcc8023257c1800ae89773cbba9f9e97bae036"},
{file = "numpy-1.23.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1f27b5322ac4067e67c8f9378b41c746d8feac8bdd0e0ffede5324667b8a075c"},
{file = "numpy-1.23.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ad3ec9a748a8943e6eb4358201f7e1c12ede35f510b1a2221b70af4bb64295c"},
{file = "numpy-1.23.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdc9febce3e68b697d931941b263c59e0c74e8f18861f4064c1f712562903411"},
{file = "numpy-1.23.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:301c00cf5e60e08e04d842fc47df641d4a181e651c7135c50dc2762ffe293dbd"},
{file = "numpy-1.23.3-cp311-cp311-win32.whl", hash = "sha256:7cd1328e5bdf0dee621912f5833648e2daca72e3839ec1d6695e91089625f0b4"},
{file = "numpy-1.23.3-cp311-cp311-win_amd64.whl", hash = "sha256:8355fc10fd33a5a70981a5b8a0de51d10af3688d7a9e4a34fcc8fa0d7467bb7f"},
{file = "numpy-1.23.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc6e8da415f359b578b00bcfb1d08411c96e9a97f9e6c7adada554a0812a6cc6"},
{file = "numpy-1.23.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:22d43376ee0acd547f3149b9ec12eec2f0ca4a6ab2f61753c5b29bb3e795ac4d"},
{file = "numpy-1.23.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a64403f634e5ffdcd85e0b12c08f04b3080d3e840aef118721021f9b48fc1460"},
{file = "numpy-1.23.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efd9d3abe5774404becdb0748178b48a218f1d8c44e0375475732211ea47c67e"},
{file = "numpy-1.23.3-cp38-cp38-win32.whl", hash = "sha256:f8c02ec3c4c4fcb718fdf89a6c6f709b14949408e8cf2a2be5bfa9c49548fd85"},
{file = "numpy-1.23.3-cp38-cp38-win_amd64.whl", hash = "sha256:e868b0389c5ccfc092031a861d4e158ea164d8b7fdbb10e3b5689b4fc6498df6"},
{file = "numpy-1.23.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:09f6b7bdffe57fc61d869a22f506049825d707b288039d30f26a0d0d8ea05164"},
{file = "numpy-1.23.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8c79d7cf86d049d0c5089231a5bcd31edb03555bd93d81a16870aa98c6cfb79d"},
{file = "numpy-1.23.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d5420053bbb3dd64c30e58f9363d7a9c27444c3648e61460c1237f9ec3fa14"},
{file = "numpy-1.23.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5422d6a1ea9b15577a9432e26608c73a78faf0b9039437b075cf322c92e98e7"},
{file = "numpy-1.23.3-cp39-cp39-win32.whl", hash = "sha256:c1ba66c48b19cc9c2975c0d354f24058888cdc674bebadceb3cdc9ec403fb5d1"},
{file = "numpy-1.23.3-cp39-cp39-win_amd64.whl", hash = "sha256:78a63d2df1d947bd9d1b11d35564c2f9e4b57898aae4626638056ec1a231c40c"},
{file = "numpy-1.23.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:17c0e467ade9bda685d5ac7f5fa729d8d3e76b23195471adae2d6a6941bd2c18"},
{file = "numpy-1.23.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91b8d6768a75247026e951dce3b2aac79dc7e78622fc148329135ba189813584"},
{file = "numpy-1.23.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:94c15ca4e52671a59219146ff584488907b1f9b3fc232622b47e2cf832e94fb8"},
{file = "numpy-1.23.3.tar.gz", hash = "sha256:51bf49c0cd1d52be0a240aa66f3458afc4b95d8993d2d04f0d91fa60c10af6cd"},
]
packaging = [
{file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"},
{file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"},
]
protobuf = [
{file = "protobuf-4.21.7-cp310-abi3-win32.whl", hash = "sha256:c7cb105d69a87416bd9023e64324e1c089593e6dae64d2536f06bcbe49cd97d8"},
{file = "protobuf-4.21.7-cp310-abi3-win_amd64.whl", hash = "sha256:3ec85328a35a16463c6f419dbce3c0fc42b3e904d966f17f48bae39597c7a543"},
{file = "protobuf-4.21.7-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:db9056b6a11cb5131036d734bcbf91ef3ef9235d6b681b2fc431cbfe5a7f2e56"},
{file = "protobuf-4.21.7-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:ca200645d6235ce0df3ccfdff1567acbab35c4db222a97357806e015f85b5744"},
{file = "protobuf-4.21.7-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:b019c79e23a80735cc8a71b95f76a49a262f579d6b84fd20a0b82279f40e2cc1"},
{file = "protobuf-4.21.7-cp37-cp37m-win32.whl", hash = "sha256:d3f89ccf7182293feba2de2739c8bf34fed1ed7c65a5cf987be00311acac57c1"},
{file = "protobuf-4.21.7-cp37-cp37m-win_amd64.whl", hash = "sha256:a74d96cd960b87b4b712797c741bb3ea3a913f5c2dc4b6cbe9c0f8360b75297d"},
{file = "protobuf-4.21.7-cp38-cp38-win32.whl", hash = "sha256:8e09d1916386eca1ef1353767b6efcebc0a6859ed7f73cb7fb974feba3184830"},
{file = "protobuf-4.21.7-cp38-cp38-win_amd64.whl", hash = "sha256:9e355f2a839d9930d83971b9f562395e13493f0e9211520f8913bd11efa53c02"},
{file = "protobuf-4.21.7-cp39-cp39-win32.whl", hash = "sha256:f370c0a71712f8965023dd5b13277444d3cdfecc96b2c778b0e19acbfd60df6e"},
{file = "protobuf-4.21.7-cp39-cp39-win_amd64.whl", hash = "sha256:9643684232b6b340b5e63bb69c9b4904cdd39e4303d498d1a92abddc7e895b7f"},
{file = "protobuf-4.21.7-py2.py3-none-any.whl", hash = "sha256:8066322588d4b499869bf9f665ebe448e793036b552f68c585a9b28f1e393f66"},
{file = "protobuf-4.21.7-py3-none-any.whl", hash = "sha256:58b81358ec6c0b5d50df761460ae2db58405c063fd415e1101209221a0a810e1"},
{file = "protobuf-4.21.7.tar.gz", hash = "sha256:71d9dba03ed3432c878a801e2ea51e034b0ea01cf3a4344fb60166cb5f6c8757"},
]
psutil = [
{file = "psutil-5.9.2-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:8f024fbb26c8daf5d70287bb3edfafa22283c255287cf523c5d81721e8e5d82c"},
{file = "psutil-5.9.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:b2f248ffc346f4f4f0d747ee1947963613216b06688be0be2e393986fe20dbbb"},
{file = "psutil-5.9.2-cp27-cp27m-win32.whl", hash = "sha256:b1928b9bf478d31fdffdb57101d18f9b70ed4e9b0e41af751851813547b2a9ab"},
{file = "psutil-5.9.2-cp27-cp27m-win_amd64.whl", hash = "sha256:404f4816c16a2fcc4eaa36d7eb49a66df2d083e829d3e39ee8759a411dbc9ecf"},
{file = "psutil-5.9.2-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:94e621c6a4ddb2573d4d30cba074f6d1aa0186645917df42c811c473dd22b339"},
{file = "psutil-5.9.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:256098b4f6ffea6441eb54ab3eb64db9ecef18f6a80d7ba91549195d55420f84"},
{file = "psutil-5.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:614337922702e9be37a39954d67fdb9e855981624d8011a9927b8f2d3c9625d9"},
{file = "psutil-5.9.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39ec06dc6c934fb53df10c1672e299145ce609ff0611b569e75a88f313634969"},
{file = "psutil-5.9.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3ac2c0375ef498e74b9b4ec56df3c88be43fe56cac465627572dbfb21c4be34"},
{file = "psutil-5.9.2-cp310-cp310-win32.whl", hash = "sha256:e4c4a7636ffc47b7141864f1c5e7d649f42c54e49da2dd3cceb1c5f5d29bfc85"},
{file = "psutil-5.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:f4cb67215c10d4657e320037109939b1c1d2fd70ca3d76301992f89fe2edb1f1"},
{file = "psutil-5.9.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:dc9bda7d5ced744622f157cc8d8bdd51735dafcecff807e928ff26bdb0ff097d"},
{file = "psutil-5.9.2-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75291912b945a7351d45df682f9644540d564d62115d4a20d45fa17dc2d48f8"},
{file = "psutil-5.9.2-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4018d5f9b6651f9896c7a7c2c9f4652e4eea53f10751c4e7d08a9093ab587ec"},
{file = "psutil-5.9.2-cp36-cp36m-win32.whl", hash = "sha256:f40ba362fefc11d6bea4403f070078d60053ed422255bd838cd86a40674364c9"},
{file = "psutil-5.9.2-cp36-cp36m-win_amd64.whl", hash = "sha256:9770c1d25aee91417eba7869139d629d6328a9422ce1cdd112bd56377ca98444"},
{file = "psutil-5.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:42638876b7f5ef43cef8dcf640d3401b27a51ee3fa137cb2aa2e72e188414c32"},
{file = "psutil-5.9.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91aa0dac0c64688667b4285fa29354acfb3e834e1fd98b535b9986c883c2ce1d"},
{file = "psutil-5.9.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4fb54941aac044a61db9d8eb56fc5bee207db3bc58645d657249030e15ba3727"},
{file = "psutil-5.9.2-cp37-cp37m-win32.whl", hash = "sha256:7cbb795dcd8ed8fd238bc9e9f64ab188f3f4096d2e811b5a82da53d164b84c3f"},
{file = "psutil-5.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:5d39e3a2d5c40efa977c9a8dd4f679763c43c6c255b1340a56489955dbca767c"},
{file = "psutil-5.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd331866628d18223a4265371fd255774affd86244fc307ef66eaf00de0633d5"},
{file = "psutil-5.9.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b315febaebae813326296872fdb4be92ad3ce10d1d742a6b0c49fb619481ed0b"},
{file = "psutil-5.9.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7929a516125f62399d6e8e026129c8835f6c5a3aab88c3fff1a05ee8feb840d"},
{file = "psutil-5.9.2-cp38-cp38-win32.whl", hash = "sha256:561dec454853846d1dd0247b44c2e66a0a0c490f937086930ec4b8f83bf44f06"},
{file = "psutil-5.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:67b33f27fc0427483b61563a16c90d9f3b547eeb7af0ef1b9fe024cdc9b3a6ea"},
{file = "psutil-5.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b3591616fa07b15050b2f87e1cdefd06a554382e72866fcc0ab2be9d116486c8"},
{file = "psutil-5.9.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:14b29f581b5edab1f133563272a6011925401804d52d603c5c606936b49c8b97"},
{file = "psutil-5.9.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4642fd93785a29353d6917a23e2ac6177308ef5e8be5cc17008d885cb9f70f12"},
{file = "psutil-5.9.2-cp39-cp39-win32.whl", hash = "sha256:ed29ea0b9a372c5188cdb2ad39f937900a10fb5478dc077283bf86eeac678ef1"},
{file = "psutil-5.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:68b35cbff92d1f7103d8f1db77c977e72f49fcefae3d3d2b91c76b0e7aef48b8"},
{file = "psutil-5.9.2.tar.gz", hash = "sha256:feb861a10b6c3bb00701063b37e4afc754f8217f0f09c42280586bd6ac712b5c"},
]
pyparsing = [
{file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
]
PyYAML = [
{file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
{file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"},
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"},
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
{file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
{file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
{file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
{file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
{file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
{file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
{file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"},
{file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"},
{file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"},
{file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"},
{file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"},
{file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"},
{file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"},
{file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"},
{file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"},
{file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"},
{file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"},
{file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"},
{file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"},
{file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"},
{file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"},
{file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"},
{file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"},
{file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"},
{file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"},
{file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"},
{file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"},
{file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},
{file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
]
setuptools = [
{file = "setuptools-65.4.1-py3-none-any.whl", hash = "sha256:1b6bdc6161661409c5f21508763dc63ab20a9ac2f8ba20029aaaa7fdb9118012"},
{file = "setuptools-65.4.1.tar.gz", hash = "sha256:3050e338e5871e70c72983072fe34f6032ae1cdeeeb67338199c2f74e083a80e"},
]
six = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
torch = [
{file = "torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:9c038662db894a23e49e385df13d47b2a777ffd56d9bcd5b832593fab0a7e286"},
{file = "torch-1.12.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:4e1b9c14cf13fd2ab8d769529050629a0e68a6fc5cb8e84b4a3cc1dd8c4fe541"},
{file = "torch-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:e9c8f4a311ac29fc7e8e955cfb7733deb5dbe1bdaabf5d4af2765695824b7e0d"},
{file = "torch-1.12.1-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:976c3f997cea38ee91a0dd3c3a42322785414748d1761ef926b789dfa97c6134"},
{file = "torch-1.12.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:68104e4715a55c4bb29a85c6a8d57d820e0757da363be1ba680fa8cc5be17b52"},
{file = "torch-1.12.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:743784ccea0dc8f2a3fe6a536bec8c4763bd82c1352f314937cb4008d4805de1"},
{file = "torch-1.12.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b5dbcca369800ce99ba7ae6dee3466607a66958afca3b740690d88168752abcf"},
{file = "torch-1.12.1-cp37-cp37m-win_amd64.whl", hash = "sha256:f3b52a634e62821e747e872084ab32fbcb01b7fa7dbb7471b6218279f02a178a"},
{file = "torch-1.12.1-cp37-none-macosx_10_9_x86_64.whl", hash = "sha256:8a34a2fbbaa07c921e1b203f59d3d6e00ed379f2b384445773bd14e328a5b6c8"},
{file = "torch-1.12.1-cp37-none-macosx_11_0_arm64.whl", hash = "sha256:42f639501928caabb9d1d55ddd17f07cd694de146686c24489ab8c615c2871f2"},
{file = "torch-1.12.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0b44601ec56f7dd44ad8afc00846051162ef9c26a8579dda0a02194327f2d55e"},
{file = "torch-1.12.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:cd26d8c5640c3a28c526d41ccdca14cf1cbca0d0f2e14e8263a7ac17194ab1d2"},
{file = "torch-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:42e115dab26f60c29e298559dbec88444175528b729ae994ec4c65d56fe267dd"},
{file = "torch-1.12.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:a8320ba9ad87e80ca5a6a016e46ada4d1ba0c54626e135d99b2129a4541c509d"},
{file = "torch-1.12.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:03e31c37711db2cd201e02de5826de875529e45a55631d317aadce2f1ed45aa8"},
{file = "torch-1.12.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9b356aea223772cd754edb4d9ecf2a025909b8615a7668ac7d5130f86e7ec421"},
{file = "torch-1.12.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:6cf6f54b43c0c30335428195589bd00e764a6d27f3b9ba637aaa8c11aaf93073"},
{file = "torch-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:f00c721f489089dc6364a01fd84906348fe02243d0af737f944fddb36003400d"},
{file = "torch-1.12.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:bfec2843daa654f04fda23ba823af03e7b6f7650a873cdb726752d0e3718dada"},
{file = "torch-1.12.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:69fe2cae7c39ccadd65a123793d30e0db881f1c1927945519c5c17323131437e"},
]
typer = [
{file = "typer-0.6.1-py3-none-any.whl", hash = "sha256:54b19e5df18654070a82f8c2aa1da456a4ac16a2a83e6dcd9f170e291c56338e"},
{file = "typer-0.6.1.tar.gz", hash = "sha256:2d5720a5e63f73eaf31edaa15f6ab87f35f0690f8ca233017d7d23d743a91d73"},
]
typing-extensions = [
{file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"},
{file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"},
]

21 server/pyproject.toml Normal file
View File

@ -0,0 +1,21 @@
[tool.poetry]
name = "bloom-inference"
version = "0.1.0"
description = "BLOOM Inference Python gRPC Server"
authors = ["Olivier Dehaene <olivier@huggingface.co>"]
[tool.poetry.dependencies]
python = "^3.9"
protobuf = "^4.21.7"
grpcio = "^1.49.1"
torch = "^1.12.1"
typer = "^0.6.1"
grpcio-reflection = "^1.49.1"
accelerate = "^0.12.0"
[tool.poetry.group.dev.dependencies]
grpcio-tools = "^1.49.1"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
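The `grpcio-tools` dev dependency is what generates the `generate_pb2`/`generate_pb2_grpc` modules the server imports. A sketch of regenerating them; the output package path is an assumption, not taken from this commit:

```python
# Sketch: regenerate the gRPC stubs from proto/generate.proto.
from grpc_tools import protoc

protoc.main(
    [
        "grpc_tools.protoc",
        "-Iproto",
        "--python_out=server/bloom_inference/pb",  # assumed output location
        "--grpc_python_out=server/bloom_inference/pb",
        "proto/generate.proto",
    ]
)
```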