$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
# Azure ML managed online deployment serving BLOOM (176B) via the
# Hugging Face text-generation-inference (TGI) container.
name: bloom-deployment
endpoint_name: bloom-inference

# Registered model containing the safetensors weights; mounted read-only
# into the container at model_mount_path.
model: azureml:bloom-safetensors:1
model_mount_path: /var/azureml-model

environment_variables:
  # Point TGI directly at the mounted weights so it skips re-downloading.
  WEIGHTS_CACHE_OVERRIDE: /var/azureml-model/bloom-safetensors
  MODEL_ID: bigscience/bloom
  # Tensor-parallel shards; quoted so the value stays a string
  # (env vars are strings — avoids YAML retyping it as an int).
  NUM_SHARD: "8"

environment:
  image: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference:sha-cd5961b
  inference_config:
    # TGI exposes /health for probes and / for generation, all on port 80.
    liveness_route:
      port: 80
      path: /health
    readiness_route:
      port: 80
      path: /health
    scoring_route:
      port: 80
      path: /

# 8x A100 80GB SKU — matches NUM_SHARD above.
instance_type: Standard_ND96amsr_A100_v4

request_settings:
  # Generation can be slow for long outputs; allow up to 60 s per request.
  request_timeout_ms: 60000
  max_concurrent_requests_per_instance: 256

# Long initial_delay gives the server time to load/shard the 176B weights
# before the first probe fires.
liveness_probe:
  initial_delay: 140
  timeout: 60
  period: 60
  success_threshold: 1
  failure_threshold: 2
readiness_probe:
  initial_delay: 140
  timeout: 60
  period: 60
  success_threshold: 1
  failure_threshold: 2

instance_count: 1