From 7872b8c55b6cdbf97e30ba6e4cd700f2de7e9bc4 Mon Sep 17 00:00:00 2001
From: drbh <david.richard.holtz@gmail.com>
Date: Wed, 24 Jan 2024 11:41:28 -0500
Subject: [PATCH] Add messages api compatibility docs (#1478)

This PR adds a new page to the docs that describes the Messages API and
how to use it.

Additionally this page will contain cloud provider specific information
for enabling and using this feature. This PR includes a SageMaker
example/information.
---
 docs/source/_toctree.yml    |   2 +
 docs/source/messages_api.md | 134 ++++++++++++++++++++++++++++++++++++
 router/src/main.rs          |   6 +-
 router/src/server.rs        |   4 +-
 4 files changed, 141 insertions(+), 5 deletions(-)
 create mode 100644 docs/source/messages_api.md

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 6fa50a6a..d57a594d 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -7,6 +7,8 @@
     title: Installation
   - local: supported_models
     title: Supported Models and Hardware
+  - local: messages_api
+    title: Messages API
   title: Getting started
 - sections:
   - local: basic_tutorials/consuming_tgi
diff --git a/docs/source/messages_api.md b/docs/source/messages_api.md
new file mode 100644
index 00000000..899de865
--- /dev/null
+++ b/docs/source/messages_api.md
@@ -0,0 +1,134 @@
+# Messages API
+
+_The Messages API is compatible with the OpenAI Chat Completion API_
+
+Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This means you can use OpenAI's client libraries to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
+
+## Making a Request
+
+You can make a request to TGI's Messages API using `curl`. Here's an example:
+
+```bash
+curl localhost:3000/v1/chat/completions \
+    -X POST \
+    -d '{
+  "model": "tgi",
+  "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "What is deep learning?"
+    }
+  ],
+  "stream": true,
+  "max_tokens": 20
+}' \
+    -H 'Content-Type: application/json'
+```
+
+## Streaming
+
+You can also use OpenAI's Python client library to make a streaming request. Here's how:
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    base_url="http://localhost:3000/v1",
+    api_key="-"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=True
+)
+
+# iterate and print stream
+for message in chat_completion:
+    print(message)
+```
+
+## Synchronous
+
+If you prefer to make a synchronous request, you can do so like this:
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    base_url="http://localhost:3000/v1",
+    api_key="-"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=False
+)
+
+print(chat_completion)
+```
+
+## Cloud Providers
+
+TGI can be deployed on various cloud providers for scalable and robust text generation. One such provider is Amazon SageMaker, which has recently added support for TGI. Here's how you can deploy TGI on Amazon SageMaker:
+
+## Amazon SageMaker
+
+To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`. 
+
+This will modify the `/invocations` route to accept Messages dictionaries consisting of role and content. See the example below on how to deploy Llama with the new Messages API.
+
+```python
+import json
+import sagemaker
+import boto3
+from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
+
+try:
+	role = sagemaker.get_execution_role()
+except ValueError:
+	iam = boto3.client('iam')
+	role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
+
+# Hub Model configuration. https://huggingface.co/models
+hub = {
+	'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
+	'SM_NUM_GPUS': json.dumps(1),
+    'MESSAGES_API_ENABLED': True
+}
+
+# create Hugging Face Model Class
+huggingface_model = HuggingFaceModel(
+	image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"),
+	env=hub,
+	role=role, 
+)
+
+# deploy model to SageMaker Inference
+predictor = huggingface_model.deploy(
+	initial_instance_count=1,
+	instance_type="ml.g5.2xlarge",
+	container_startup_health_check_timeout=300,
+  )
+  
+# send request
+predictor.predict({
+"messages": [
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ]
+})
+```
\ No newline at end of file
diff --git a/router/src/main.rs b/router/src/main.rs
index bf987eb6..b6190908 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -72,7 +72,7 @@ struct Args {
     #[clap(long, env)]
     ngrok_edge: Option<String>,
     #[clap(long, env, default_value_t = false)]
-    chat_enabled_api: bool,
+    messages_api_enabled: bool,
 }
 
 #[tokio::main]
@@ -104,7 +104,7 @@ async fn main() -> Result<(), RouterError> {
         ngrok,
         ngrok_authtoken,
         ngrok_edge,
-        chat_enabled_api,
+        messages_api_enabled,
     } = args;
 
     // Launch Tokio runtime
@@ -348,7 +348,7 @@ async fn main() -> Result<(), RouterError> {
         ngrok_authtoken,
         ngrok_edge,
         tokenizer_config,
-        chat_enabled_api,
+        messages_api_enabled,
     )
     .await?;
     Ok(())
diff --git a/router/src/server.rs b/router/src/server.rs
index aa1ad202..ff48b4f0 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -708,7 +708,7 @@ pub async fn run(
     ngrok_authtoken: Option<String>,
     ngrok_edge: Option<String>,
     tokenizer_config: HubTokenizerConfig,
-    chat_enabled_api: bool,
+    messages_api_enabled: bool,
 ) -> Result<(), axum::BoxError> {
     // OpenAPI documentation
     #[derive(OpenApi)]
@@ -872,7 +872,7 @@ pub async fn run(
         .route("/metrics", get(metrics));
 
     // Conditional AWS Sagemaker route
-    let aws_sagemaker_route = if chat_enabled_api {
+    let aws_sagemaker_route = if messages_api_enabled {
         Router::new().route("/invocations", post(chat_completions)) // Use 'chat_completions' for OAI_ENABLED
     } else {
         Router::new().route("/invocations", post(compat_generate)) // Use 'compat_generate' otherwise