diff --git a/.dockerignore b/.dockerignore
index 5aa1aa3..fcfaad0 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,2 +1,3 @@
 aml
-target
\ No newline at end of file
+target
+server/transformers
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 9b6ef83..214a0ac 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@ FROM rust:1.64 as router-builder
 
 WORKDIR /usr/src
 
+COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
 COPY router router
 
@@ -13,6 +14,7 @@ FROM rust:1.64 as launcher-builder
 
 WORKDIR /usr/src
 
+COPY rust-toolchain.toml rust-toolchain.toml
 COPY launcher launcher
 
 WORKDIR /usr/src/launcher
diff --git a/aml/deployment.yaml b/aml/deployment.yaml
index 35d1900..51e124b 100644
--- a/aml/deployment.yaml
+++ b/aml/deployment.yaml
@@ -8,7 +8,7 @@ environment_variables:
   MODEL_NAME: bigscience/bloom
   NUM_GPUS: 8
 environment:
-  image: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference:0.2
+  image: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference:0.3.1
 inference_config:
   liveness_route:
     port: 3000
@@ -25,14 +25,14 @@ request_settings:
   max_concurrent_requests_per_instance: 256
 liveness_probe:
   initial_delay: 600
-  timeout: 20
+  timeout: 90
   period: 120
   success_threshold: 1
-  failure_threshold: 3
+  failure_threshold: 5
 readiness_probe:
   initial_delay: 600
-  timeout: 20
+  timeout: 90
   period: 120
   success_threshold: 1
-  failure_threshold: 3
+  failure_threshold: 5
 instance_count: 1
diff --git a/server/Makefile b/server/Makefile
index 39a98b6..57dea48 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -7,13 +7,13 @@ gen-server:
 	touch text_generation/pb/__init__.py
 
 install-transformers:
-	# Install specific version of transformers
+	# Install specific version of transformers with custom cuda kernels
 	rm transformers || true
-	rm transformers-7302a24535e8dc5637ea5b4e4572fc971d404098 || true
-	curl -L -O https://github.com/OlivierDehaene/transformers/archive/7302a24535e8dc5637ea5b4e4572fc971d404098.zip
-	unzip 7302a24535e8dc5637ea5b4e4572fc971d404098.zip
-	rm 7302a24535e8dc5637ea5b4e4572fc971d404098.zip
-	mv transformers-7302a24535e8dc5637ea5b4e4572fc971d404098 transformers
+	rm transformers-b55f16c5b71aeef47a66a4270e19c154f050a7a7 || true
+	curl -L -O https://github.com/OlivierDehaene/transformers/archive/b55f16c5b71aeef47a66a4270e19c154f050a7a7.zip
+	unzip b55f16c5b71aeef47a66a4270e19c154f050a7a7.zip
+	rm b55f16c5b71aeef47a66a4270e19c154f050a7a7.zip
+	mv transformers-b55f16c5b71aeef47a66a4270e19c154f050a7a7 transformers
 	cd transformers && python setup.py install
 
 install-torch:
diff --git a/server/text_generation/models/bloom.py b/server/text_generation/models/bloom.py
index 730958c..38ef8ef 100644
--- a/server/text_generation/models/bloom.py
+++ b/server/text_generation/models/bloom.py
@@ -38,7 +38,7 @@ class BLOOMSharded(CausalLM):
         self.master = self.rank == 0
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{self.rank}")
-            dtype = torch.float16
+            dtype = torch.bfloat16
         else:
             device = torch.device("cpu")
             dtype = torch.float32
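
Note on the aml/deployment.yaml probe changes: with period: 120 unchanged, raising failure_threshold from 3 to 5 roughly extends the restart budget from 3 x 120 s = 360 s to 5 x 120 s = 600 s of consecutive failed probes, and raising the per-probe timeout from 20 s to 90 s means a slow but live model server (for example, one busy with a long generation) is less likely to be restarted prematurely.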
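
For the server/text_generation/models/bloom.py hunk, a minimal self-contained sketch of the device/dtype selection being changed (the select_device_and_dtype helper is illustrative, not part of the repo): switching from torch.float16 to torch.bfloat16 keeps float32's 8-bit exponent, trading mantissa precision for headroom against activation overflow in large models like BLOOM.

import torch

def select_device_and_dtype(rank: int = 0):
    # Hypothetical helper mirroring the logic in BLOOMSharded.__init__.
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{rank}")
        # bfloat16 shares float32's exponent range, so activations that would
        # overflow float16 (max finite value ~65504) remain representable.
        dtype = torch.bfloat16
    else:
        device = torch.device("cpu")
        dtype = torch.float32
    return device, dtype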