From bf99afe9160f97124ba2ec28afaa97e4364846a8 Mon Sep 17 00:00:00 2001
From: Olivier Dehaene
Date: Fri, 14 Oct 2022 15:56:21 +0200
Subject: [PATCH] feat: Docker image

---
 .dockerignore                             |   1 +
 Dockerfile                                |  59 ++++++++
 README.md                                 |   2 -
 router/rust-toolchain.toml                |   3 +
 router/src/batcher.rs                     |   7 +-
 router/src/main.rs                        |   4 +-
 router/src/server.rs                      |  64 ++++++---
 run.sh                                    |  21 +++
 server/bloom_inference/model.py           |   7 +-
 server/bloom_inference/prepare_weights.py | 160 +++++++++++++++-------
 server/bloom_inference/shard_model.py     | 102 --------------
 server/poetry.lock                        |  14 +-
 server/pyproject.toml                     |   1 +
 13 files changed, 265 insertions(+), 180 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 Dockerfile
 create mode 100644 router/rust-toolchain.toml
 create mode 100755 run.sh
 delete mode 100644 server/bloom_inference/shard_model.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..d2704bf1
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+router/target
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..a5161020
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,59 @@
+FROM rust:1.64 as builder
+
+WORKDIR /usr/src
+
+COPY proto proto
+COPY router router
+
+WORKDIR /usr/src/router
+
+RUN cargo install --path .
+
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
+
+ENV LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    DEBIAN_FRONTEND=noninteractive \
+    MODEL_BASE_PATH=/var/azureml-model \
+    MODEL_NAME=bigscience/bloom \
+    NUM_GPUS=8 \
+    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+    CUDA_HOME=/usr/local/cuda \
+    LD_LIBRARY_PATH="/opt/miniconda/envs/text-generation/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \
+    CONDA_DEFAULT_ENV=text-generation \
+    PATH=$PATH:/opt/miniconda/envs/text-generation/bin:/opt/miniconda/bin:/usr/local/cuda/bin
+
+SHELL ["/bin/bash", "-c"]
+
+RUN apt-get update && apt-get install -y unzip wget libssl-dev && rm -rf /var/lib/apt/lists/*
+
+RUN cd ~ && \
+    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    chmod +x Miniconda3-latest-Linux-x86_64.sh && \
+    bash ./Miniconda3-latest-Linux-x86_64.sh -bf -p /opt/miniconda && \
+    conda create -n text-generation python=3.9 -y
+
+# Install specific version of torch
+RUN /opt/miniconda/envs/text-generation/bin/pip install torch --extra-index-url https://download.pytorch.org/whl/cu116 --no-cache-dir
+
+# Install specific version of transformers
+RUN wget https://github.com/huggingface/transformers/archive/46d37bece7d3ffdef97b1ee4a3170c0a0627d921.zip && \
+    unzip 46d37bece7d3ffdef97b1ee4a3170c0a0627d921.zip && \
+    rm 46d37bece7d3ffdef97b1ee4a3170c0a0627d921.zip && \
+    cd transformers-46d37bece7d3ffdef97b1ee4a3170c0a0627d921 && \
+    /opt/miniconda/envs/text-generation/bin/python setup.py install
+
+WORKDIR /usr/src
+
+# Install server
+COPY server server
+RUN cd server && \
+    /opt/miniconda/envs/text-generation/bin/pip install . --no-cache-dir
+
+# Install router
+COPY --from=builder /usr/local/cargo/bin/bloom-inference /usr/local/bin/bloom-inference
+
+COPY run.sh .
+RUN chmod +x run.sh
+
+CMD ["./run.sh"]
\ No newline at end of file
diff --git a/README.md b/README.md
index ad8adaa9..a62daca1 100644
--- a/README.md
+++ b/README.md
@@ -43,8 +43,6 @@ python server/bloom_inference/main.py bigscience/bloom --num-gpus 8 --shard-dire

 ## TODO:

-- [ ] Improve model download
-    - Store "shardable" layers separately and layer by layer
 - [ ] Add batching args to router CLI
 - [ ] Add docstrings + comments everywhere as the codebase is fairly complicated
 - [ ] Add tests
diff --git a/router/rust-toolchain.toml b/router/rust-toolchain.toml
new file mode 100644
index 00000000..30355814
--- /dev/null
+++ b/router/rust-toolchain.toml
@@ -0,0 +1,3 @@
+[toolchain]
+channel = "1.64.0"
+components = ["rustfmt", "clippy"]
\ No newline at end of file
diff --git a/router/src/batcher.rs b/router/src/batcher.rs
index a044e26c..2025cf62 100644
--- a/router/src/batcher.rs
+++ b/router/src/batcher.rs
@@ -83,7 +83,12 @@ async fn batching_task(client: ShardedClient, db: Db, shared: Arc) {

                 cached_batch = match batch_size {
                     size if size > 16 => {
-                        wrap_future(client.generate_until_finished_with_cache(batches), request_ids, &db).await
+                        wrap_future(
+                            client.generate_until_finished_with_cache(batches),
+                            request_ids,
+                            &db,
+                        )
+                        .await
                     }
                     _ => wrap_future(client.generate_with_cache(batches), request_ids, &db).await,
                 };
diff --git a/router/src/main.rs b/router/src/main.rs
index 803753b1..2fe02944 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -1,5 +1,5 @@
-use std::net::SocketAddr;
 use bloom_inference_client::ShardedClient;
+use std::net::SocketAddr;
 use std::time::Duration;
 use tokenizers::Tokenizer;

@@ -37,7 +37,7 @@ fn main() -> Result<(), std::io::Error> {
         .expect("Unable to clear cache");
     tracing::info!("Connected");

-    let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
+    let addr = SocketAddr::from(([0, 0, 0, 0], 3000));

     server::run(sharded_client, tokenizer, addr).await;
     Ok(())
diff --git a/router/src/server.rs b/router/src/server.rs
index 61c57069..1cbec333 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1,10 +1,10 @@
-use std::net::SocketAddr;
-use axum::{Router, Json};
-use axum::http::StatusCode;
-use axum::extract::Extension;
-use axum::routing::post;
 use crate::{Batcher, ShardedClient, Validation};
+use axum::extract::Extension;
+use axum::http::StatusCode;
+use axum::routing::post;
+use axum::{Json, Router};
 use serde::Deserialize;
+use std::net::SocketAddr;
 use tokenizers::Tokenizer;
 use tokio::time::Instant;
 use tracing::instrument;
@@ -60,6 +60,31 @@ pub(crate) struct GenerateRequest {
     pub parameters: GenerateParameters,
 }

+#[instrument(skip(state), fields(time, time_per_token))]
+async fn liveness(state: Extension) -> Result<(), StatusCode> {
+    let output = state
+        .infer
+        .infer(
+            1,
+            GenerateRequest {
+                inputs: "liveness".to_string(),
+                parameters: GenerateParameters {
+                    temperature: 1.0,
+                    top_k: 0,
+                    top_p: 1.0,
+                    do_sample: false,
+                    max_new_tokens: 1,
+                },
+            },
+        )
+        .await;
+
+    match output {
+        Ok(_) => Ok(()),
+        Err(_) => Err(StatusCode::INTERNAL_SERVER_ERROR),
+    }
+}
+
 #[instrument(skip(state), fields(time, time_per_token))]
 async fn generate(
     state: Extension,
@@ -67,14 +92,16 @@ async fn generate(
 ) -> Result, StatusCode> {
     let start = Instant::now();

-    let (input_length, validated_request) = match state.validation
+    let (input_length, validated_request) = match state
+        .validation
         .validate(GenerateRequest {
             inputs: req.inputs.clone(),
             parameters: req.parameters.clone(),
         })
-        .await {
+        .await
+    {
         Ok(result) => result,
-        Err(_) => return Err(StatusCode::INTERNAL_SERVER_ERROR)
+        Err(_) => return Err(StatusCode::INTERNAL_SERVER_ERROR),
     };

     let output = state.infer.infer(input_length, validated_request).await;
@@ -102,11 +129,7 @@ struct ServerState {
     infer: Batcher,
 }

-pub async fn run(
-    client: ShardedClient,
-    tokenizer: Tokenizer,
-    addr: SocketAddr,
-) {
+pub async fn run(client: ShardedClient, tokenizer: Tokenizer, addr: SocketAddr) {
     client.clear_cache().await.expect("Unable to clear cache");
     tracing::info!("Connected");

@@ -114,13 +137,16 @@ pub async fn run(

     let validation = Validation::new(tokenizer);

-    let shared_state = ServerState {
-        validation,
-        infer,
-    };
+    let shared_state = ServerState { validation, infer };

-    let app = Router::new().route("/generate", post(generate)).layer(Extension(shared_state));
+    let app = Router::new()
+        .route("/generate", post(generate))
+        .layer(Extension(shared_state.clone()))
+        .route("/health", post(liveness))
+        .layer(Extension(shared_state.clone()));

     axum::Server::bind(&addr)
-        .serve(app.into_make_service()).await.unwrap();
+        .serve(app.into_make_service())
+        .await
+        .unwrap();
 }
diff --git a/run.sh b/run.sh
new file mode 100755
index 00000000..3b095541
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+server_cmd="python server/bloom_inference/main.py $MODEL_NAME --num-gpus $NUM_GPUS --shard-directory $MODEL_BASE_PATH"
+$server_cmd &
+
+FILE=/tmp/bloom-inference-0
+
+while :
+  do
+    if test -S "$FILE"; then
+      echo "Text Generation Python gRPC server started"
+      break
+    else
+      echo "Waiting for Text Generation Python gRPC server to start"
+      sleep 5
+    fi
+  done
+
+sleep 1
+
+exec "bloom-inference"
diff --git a/server/bloom_inference/model.py b/server/bloom_inference/model.py
index 21cf1154..40d69e8b 100644
--- a/server/bloom_inference/model.py
+++ b/server/bloom_inference/model.py
@@ -220,12 +220,14 @@ class BLOOM:
     def __init__(self, model_name: str):
         if torch.cuda.is_available():
            self.device = torch.device("cuda")
+            dtype = torch.bfloat16
         else:
             self.device = torch.device("cpu")
+            dtype = torch.float32

         self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.model = (
-            AutoModelForCausalLM.from_pretrained(model_name).eval().to(self.device)
+            AutoModelForCausalLM.from_pretrained(model_name).eval().to(self.device).to(dtype)
         )
         self.num_heads = self.model.base_model.num_heads

@@ -427,7 +429,8 @@ class BLOOMSharded(BLOOM):
                     if do_transpose:
                         state_dict[key] = state_dict[key].transpose(1, 0).contiguous()

-        model.load_state_dict(state_dict)
+        model.load_state_dict(state_dict, strict=False)
+        model.tie_weights()
         self.model = model.to(self.device).eval()
         self.num_heads = config.n_head // self.process_group.size()
         torch.distributed.barrier(group=self.process_group)
diff --git a/server/bloom_inference/prepare_weights.py b/server/bloom_inference/prepare_weights.py
index 6fd4ec50..5fa2be51 100644
--- a/server/bloom_inference/prepare_weights.py
+++ b/server/bloom_inference/prepare_weights.py
@@ -1,18 +1,62 @@
 import torch
+import os
+import tempfile
+import json
+from typing import BinaryIO
+from joblib import Parallel, delayed
+from functools import partial
 from pathlib import Path

 from tqdm import tqdm

-MODEL_NAME = "bigscience/bloom"
+from huggingface_hub import hf_hub_url
+from huggingface_hub.file_download import _request_wrapper, hf_raise_for_status


 def match_suffix(text, suffix):
-    return text[-len(suffix) :] == suffix
+    return text[-len(suffix):] == suffix


-def prepare_weights(hub_path: Path, save_path: Path, tp_world_size: int):
+def http_get(
+    url: str,
+    temp_file: BinaryIO,
+    *,
+    timeout=10.0,
+    max_retries=0,
+):
+    """
+    Download a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.
+    """
+    r = _request_wrapper(
+        method="GET",
+        url=url,
+        stream=True,
+        timeout=timeout,
+        max_retries=max_retries,
+    )
+    hf_raise_for_status(r)
+    for chunk in r.iter_content(chunk_size=1024):
+        if chunk:  # filter out keep-alive new chunks
+            temp_file.write(chunk)
+
+
+def cache_download_url(url: str, root_dir: Path):
+    filename = root_dir / url.split("/")[-1]
+
+    if not filename.exists():
+        temp_file_manager = partial(
+            tempfile.NamedTemporaryFile, mode="wb", dir=root_dir, delete=False
+        )
+        with temp_file_manager() as temp_file:
+            http_get(url, temp_file)
+
+        os.replace(temp_file.name, filename)
+    return filename
+
+
+def prepare_weights(model_name: str, cache_path: Path, save_path: Path, tp_world_size: int):
     save_paths = [
-        save_path / f"{MODEL_NAME}_tp-rank-{tp_rank}-of-{tp_world_size}.pty"
+        save_path / f"{model_name}_tp-rank-{tp_rank}-of-{tp_world_size}.pty"
         for tp_rank in range(tp_world_size)
     ]

@@ -20,45 +64,67 @@ def prepare_weights(hub_path: Path, save_path: Path, tp_world_size: int):
         print("Weights are already prepared")
         return

+    cache_path.mkdir(parents=True, exist_ok=True)
+    if model_name == "bigscience/bloom-560m":
+        url = hf_hub_url(model_name, filename="pytorch_model.bin")
+        cache_download_url(url, cache_path)
+    elif model_name == "bigscience/bloom":
+        url = hf_hub_url(model_name, filename="pytorch_model.bin.index.json")
+        index_path = cache_download_url(url, cache_path)
+        with index_path.open("r") as f:
+            index = json.load(f)
+
+        # Get unique file names
+        weight_files = list(set([filename for filename in index["weight_map"].values()]))
+        urls = [hf_hub_url(model_name, filename=filename) for filename in weight_files]
+
+        Parallel(n_jobs=5)(delayed(cache_download_url)(url, cache_path) for url in tqdm(urls))
+    else:
+        raise ValueError(f"Unknown model name: {model_name}")
+
     shards_state_dicts = [{} for _ in range(tp_world_size)]

-    for weight_path in tqdm(hub_path.glob("*.bin")):
+    for weight_path in tqdm(Path(cache_path).glob("*.bin")):
         state_dict = torch.load(weight_path, map_location="cpu")
         keys = list(state_dict.keys())

         for state_name in keys:
             state = state_dict[state_name]
             if any(
-                match_suffix(state_name, candidate)
-                for candidate in [
-                    "self_attention.query_key_value.weight",
-                    "self_attention.query_key_value.bias",
-                    "mlp.dense_h_to_4h.weight",
-                    "mlp.dense_h_to_4h.bias",
-                    "word_embeddings.weight",
-                    "lm_head.weight",
-                ]
+                    match_suffix(state_name, candidate)
+                    for candidate in [
+                        "self_attention.query_key_value.weight",
+                        "self_attention.query_key_value.bias",
+                        "mlp.dense_h_to_4h.weight",
+                        "mlp.dense_h_to_4h.bias",
+                        "word_embeddings.weight",
+                    ]
             ):
                 output_size = state.shape[0]
                 assert output_size % tp_world_size == 0
                 block_size = output_size // tp_world_size
                 sharded_weights = torch.split(state, block_size, dim=0)
                 assert len(sharded_weights) == tp_world_size
+
                 for tp_rank, shard in enumerate(sharded_weights):
-                    assert shard.shape[0] == block_size
-                    if match_suffix(state_name, "lm_head.weight"):
-                        shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
-                    else:
-                        shards_state_dicts[tp_rank][
-                            "transformer." + state_name
-                        ] = shard.detach().clone()
+                    shards_state_dicts[tp_rank]["transformer." + state_name] = shard.detach().clone()
+
+            elif match_suffix(state_name, "lm_head.weight"):
+                output_size = state.shape[0]
+                assert output_size % tp_world_size == 0
+                block_size = output_size // tp_world_size
+                sharded_weights = torch.split(state, block_size, dim=0)
+                assert len(sharded_weights) == tp_world_size
+
+                for tp_rank, shard in enumerate(sharded_weights):
+                    shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
+
             elif any(
-                match_suffix(state_name, candidate)
-                for candidate in [
-                    "self_attention.dense.weight",
-                    "mlp.dense_4h_to_h.weight",
-                    "lm_head.weight",
-                ]
+                    match_suffix(state_name, candidate)
+                    for candidate in [
+                        "self_attention.dense.weight",
+                        "mlp.dense_4h_to_h.weight",
+                    ]
             ):
                 input_size = state.shape[1]
                 assert input_size % tp_world_size == 0
@@ -66,40 +132,31 @@ def prepare_weights(hub_path: Path, save_path: Path, tp_world_size: int):
                 block_size = input_size // tp_world_size
                 sharded_weights = torch.split(state, block_size, dim=1)
                 assert len(sharded_weights) == tp_world_size
                 for tp_rank, shard in enumerate(sharded_weights):
-                    assert shard.shape[1] == block_size
-                    if match_suffix(state_name, "lm_head.weight"):
-                        shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
-                    else:
-                        shards_state_dicts[tp_rank][
-                            "transformer." + state_name
-                        ] = shard.detach().clone()
+                    shards_state_dicts[tp_rank]["transformer." + state_name] = shard.detach().clone()
+
             elif any(
-                match_suffix(state_name, candidate)
-                for candidate in [
-                    "self_attention.dense.bias",
-                    "mlp.dense_4h_to_h.bias",
-                ]
+                    match_suffix(state_name, candidate)
+                    for candidate in [
+                        "self_attention.dense.bias",
+                        "mlp.dense_4h_to_h.bias",
+                    ]
             ):
-                shards_state_dicts[0][
-                    "transformer." + state_name
-                ] = state.detach().clone()
+                shards_state_dicts[0]["transformer." + state_name] = state.detach().clone()
                 for tp_rank in range(1, tp_world_size):
-                    shards_state_dicts[tp_rank][
-                        "transformer." + state_name
-                    ] = torch.zeros_like(state)
+                    shards_state_dicts[tp_rank]["transformer." + state_name] = torch.zeros_like(state)
+
             else:
                 # We duplicate parameters across tp ranks
                 for tp_rank in range(tp_world_size):
-                    shards_state_dicts[tp_rank][
-                        "transformer." + state_name
-                    ] = state.detach().clone()
+                    shards_state_dicts[tp_rank]["transformer." + state_name] = state.detach().clone()
             del state_dict[state_name]  # delete key from state_dict
             del state  # delete tensor
+        del state_dict

     # we save state_dict
     for tp_rank, (save_path, shard_state_dict) in enumerate(
-        zip(save_paths, shards_state_dicts)
+            zip(save_paths, shards_state_dicts)
     ):
         save_paths.append(save_path)
         save_path.parent.mkdir(parents=True, exist_ok=True)
@@ -116,9 +173,10 @@ if __name__ == "__main__":

     parser = ArgumentParser()

-    parser.add_argument("--hub-path", required=True, type=str)
+    parser.add_argument("--model-name", required=True, type=str)
+    parser.add_argument("--cache-path", required=True, type=str)
     parser.add_argument("--save-path", required=True, type=str)
     parser.add_argument("--world-size", required=True, type=int)

     args = parser.parse_args()
-    prepare_weights(Path(args.hub_path), Path(args.save_path), args.world_size)
+    prepare_weights(args.model_name, Path(args.cache_path), Path(args.save_path), args.world_size)
diff --git a/server/bloom_inference/shard_model.py b/server/bloom_inference/shard_model.py
deleted file mode 100644
index 360e8962..00000000
--- a/server/bloom_inference/shard_model.py
+++ /dev/null
@@ -1,102 +0,0 @@
-from pathlib import Path
-
-import torch
-from torch import nn
-from transformers import AutoModelForCausalLM
-
-
-def match_suffix(text, suffix):
-    return text[-len(suffix) :] == suffix
-
-
-def shard_model(model_name: str, path: Path, tp_world_size: int, dtype: torch.dtype):
-    """BLOOM specific sharding mechanism"""
-    save_paths = [
-        path / f"{model_name}_tp-rank-{tp_rank}-of-{tp_world_size}.pty"
-        for tp_rank in range(tp_world_size)
-    ]
-    if all(save_path.exists() for save_path in save_paths):
-        print("Loading already cached values")
-        return save_paths
-
-    model: nn.Module = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=dtype, local_files_only=True
-    )
-
-    shards_state_dicts = [{} for _ in range(tp_world_size)]
-    state_dict = model.state_dict()
-    keys = list(state_dict.keys())
-    for state_name in keys:
-        print(state_name)
-        state = state_dict[state_name]
-        if any(
-            match_suffix(state_name, candidate)
-            for candidate in [
-                "self_attention.query_key_value.weight",
-                "self_attention.query_key_value.bias",
-                "mlp.dense_h_to_4h.weight",
-                "mlp.dense_h_to_4h.bias",
-                "transformer.word_embeddings.weight",
-                "lm_head.weight",
-            ]
-        ):
-            output_size = state.shape[0]
-            assert output_size % tp_world_size == 0
-            block_size = output_size // tp_world_size
-            sharded_weights = torch.split(state, block_size, dim=0)
-            assert len(sharded_weights) == tp_world_size
-            for tp_rank, shard in enumerate(sharded_weights):
-                assert shard.shape[0] == block_size
-                shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
-        elif any(
-            match_suffix(state_name, candidate)
-            for candidate in [
-                "self_attention.dense.weight",
-                "mlp.dense_4h_to_h.weight",
-                "lm_head.weight",
-            ]
-        ):
-            input_size = state.shape[1]
-            assert input_size % tp_world_size == 0
-            block_size = input_size // tp_world_size
-            sharded_weights = torch.split(state, block_size, dim=1)
-            assert len(sharded_weights) == tp_world_size
-            for tp_rank, shard in enumerate(sharded_weights):
-                assert shard.shape[1] == block_size
-                shards_state_dicts[tp_rank][state_name] = shard.detach().clone()
-        elif any(
-            match_suffix(state_name, candidate)
-            for candidate in [
-                "self_attention.dense.bias",
-                "mlp.dense_4h_to_h.bias",
-            ]
-        ):
-            shards_state_dicts[0][state_name] = state.detach().clone()
-            for tp_rank in range(1, tp_world_size):
-                shards_state_dicts[tp_rank][state_name] = torch.zeros_like(state)
-        else:
-            # We duplicate parameters across tp ranks
-            for tp_rank in range(tp_world_size):
-                shards_state_dicts[tp_rank][state_name] = state.detach().clone()
-
-        del state_dict[state_name]  # delete key from state_dict
-        del state  # delete tensor
-
-    # we save state_dict
-    for tp_rank, (save_path, shard_state_dict) in enumerate(
-        zip(save_paths, shards_state_dicts)
-    ):
-        save_path.parent.mkdir(parents=True, exist_ok=True)
-        torch.save(shard_state_dict, save_path)
-        save_paths.append(save_path)
-
-    return save_paths
-
-
-if __name__ == "__main__":
-    model_name = "bigscience/bloom"
-    save_path = Path("/data/shards")
-    tp_world_size = 8
-    dtype = torch.bfloat16
-
-    shard_model(model_name, save_path, tp_world_size=tp_world_size, dtype=dtype)
diff --git a/server/poetry.lock b/server/poetry.lock
index ea20ef26..8100c200 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -80,6 +80,14 @@ grpcio = ">=1.49.1"
 protobuf = ">=4.21.3,<5.0dev"
 setuptools = "*"

+[[package]]
+name = "joblib"
+version = "1.2.0"
+description = "Lightweight pipelining with Python functions"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
 [[package]]
 name = "numpy"
 version = "1.23.3"
@@ -197,7 +205,7 @@ python-versions = ">=3.7"
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "cedd0aebeb3731e2bbddf017a2ee6074c285866354272f8dfe930e9606437a25"
+content-hash = "f3dc5b2420183f2e7e9257e372489409d7bd26d1dcc535fc2558ebca50c988c2"

 [metadata.files]
 accelerate = [
@@ -310,6 +318,10 @@ grpcio-tools = [
     {file = "grpcio_tools-1.49.1-cp39-cp39-win32.whl", hash = "sha256:704d21509ec06efc9d034dbe70e7152715aac004941f4f0f553cf3a0aff15bd5"},
     {file = "grpcio_tools-1.49.1-cp39-cp39-win_amd64.whl", hash = "sha256:1efa0c221c719433f441ac0e026fc3c4dbc9a1a08a552ecdc707775e2f2fbbae"},
 ]
+joblib = [
+    {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"},
+    {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"},
+]
 numpy = [
     {file = "numpy-1.23.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c9f707b5bb73bf277d812ded9896f9512a43edff72712f31667d0a8c2f8e71ee"},
     {file = "numpy-1.23.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffcf105ecdd9396e05a8e58e81faaaf34d3f9875f137c7372450baa5d77c9a54"},
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 9d14ce6c..80d95426 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -12,6 +12,7 @@ torch = "^1.12.1"
 typer = "^0.6.1"
 grpcio-reflection = "^1.49.1"
 accelerate = "^0.12.0"
+joblib = "^1.2.0"

 [tool.poetry.group.dev.dependencies]
 grpcio-tools = "^1.49.1"