feat(server): add watermarking tests (#248)

Ehsan M. Kermani 2023-04-27 10:16:35 -07:00 committed by GitHub
parent b9ae7e5da1
commit f092ba9b22
10 changed files with 98 additions and 16 deletions

View File

@@ -22,4 +22,4 @@ jobs:
       - name: Run tests
         run: |
           pip install pytest pytest-asyncio
-          cd clients/python && pytest tests
+          make python-client-tests

View File

@@ -66,7 +66,7 @@ jobs:
       - name: Run server tests
         run: |
           pip install pytest
-          HF_HUB_ENABLE_HF_TRANSFER=1 pytest -sv server/tests
+          make python-server-tests
       - name: Run Rust fmt
         run: |
           cargo fmt --check

View File

@@ -21,8 +21,13 @@ router-dev:
 integration-tests: install-router install-launcher
 	cargo test

-python-tests:
-	cd server && HF_HUB_ENABLE_HF_TRANSFER=1 pytest tests
+python-server-tests:
+	HF_HUB_ENABLE_HF_TRANSFER=1 pytest server/tests
+
+python-client-tests:
+	pytest clients/python/tests
+
+python-tests: python-server-tests python-client-tests

 run-bloom-560m:
 	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --port 8080

View File

@@ -36,7 +36,7 @@ to power LLMs api-inference widgets.
 - [Quantization](#quantization)
 - [Develop](#develop)
 - [Testing](#testing)

 ## Features

 - Serve the most popular Large Language Models with a simple launcher
@@ -131,7 +131,7 @@ by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
 ### A note on Shared Memory (shm)

 [`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
 `PyTorch` to do distributed training/inference. `text-generation-inference` make
 use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.
@@ -152,14 +152,14 @@ creating a volume with:
 and mounting it to `/dev/shm`.

 Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that
 this will impact performance.

 ### Local install

 You can also opt to install `text-generation-inference` locally.

 First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
 Python 3.9, e.g. using `conda`:

 ```shell
@@ -181,7 +181,7 @@ sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
 rm -f $PROTOC_ZIP
 ```

 On MacOS, using Homebrew:

 ```shell
 brew install protobuf
@@ -241,6 +241,11 @@ make router-dev
 ## Testing

 ```shell
+# python
+make python-server-tests
+make python-client-tests
+# or both server and client tests
 make python-tests
+# rust cargo tests
 make integration-tests
 ```

View File

@@ -21,6 +21,9 @@ pytest = "^6.2.5"
 pytest-asyncio = "^0.17.2"
 pytest-cov = "^3.0.0"

+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+
 [build-system]
-requires = ["poetry-core"]
+requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
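
The `asyncio_mode = "auto"` setting tells pytest-asyncio to collect and run `async def` tests directly, without requiring an explicit marker on each one. A minimal sketch of the effect (the test name and body here are hypothetical, for illustration only):

```python
# With [tool.pytest.ini_options] asyncio_mode = "auto" in pyproject.toml,
# pytest-asyncio runs this coroutine test as-is; without the setting, it
# would need an explicit @pytest.mark.asyncio marker to be executed.
async def test_health_endpoint_responds():
    # Hypothetical async assertion, standing in for any coroutine test body.
    assert True
```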

View File

@@ -87,6 +87,19 @@ async def test_generate_async(flan_t5_xxl_url, hf_headers):
     )


+@pytest.mark.asyncio
+async def test_generate_async_best_of(flan_t5_xxl_url, hf_headers):
+    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+    response = await client.generate(
+        "test", max_new_tokens=1, best_of=2, do_sample=True
+    )
+
+    assert response.details.seed is not None
+    assert response.details.best_of_sequences is not None
+    assert len(response.details.best_of_sequences) == 1
+    assert response.details.best_of_sequences[0].seed is not None
+
+
 @pytest.mark.asyncio
 async def test_generate_async_not_found(fake_url, hf_headers):
     client = AsyncClient(fake_url, hf_headers)
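
For context: with `best_of=2` the server samples two candidate generations and returns the higher-scoring one, while the runner-up is reported in `details.best_of_sequences`, which is why the test expects exactly one entry there. A rough usage sketch against a running endpoint (the URL is a placeholder; the test gets its URL and auth headers from fixtures instead):

```python
import asyncio

from text_generation import AsyncClient


async def main():
    # Placeholder endpoint; point this at a live text-generation-inference server.
    client = AsyncClient("http://localhost:8080")
    response = await client.generate(
        "test", max_new_tokens=1, best_of=2, do_sample=True
    )
    # The returned text is the best of the two samples; the other candidate,
    # with its own sampling seed, is listed separately in the details.
    print(response.generated_text)
    print(len(response.details.best_of_sequences))  # 1


asyncio.run(main())
```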

View File

@@ -20,7 +20,7 @@ impl ShardedClient {
     /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
     async fn from_master_client(mut master_client: Client) -> Result<Self> {
         // Get all uris/unix sockets from the master client
-        let uris = master_client.service_discovery().await.unwrap();
+        let uris = master_client.service_discovery().await?;
         let futures = uris.into_iter().map(Client::connect_uds);
         let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
         Ok(Self::new(clients?))

View File

@@ -4,6 +4,10 @@ use text_generation_client::{
     Batch, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters,
 };

+// Note: Request ids and batch ids cannot collide.
+const LIVENESS_ID: u64 = u64::MAX;
+const BATCH_ID: u64 = u64::MAX;
+
 #[derive(Clone, Debug)]
 pub(crate) struct Health {
     client: ShardedClient,
@@ -27,7 +31,7 @@ impl Health {
         // Dummy batch of 1 token and 1 generated token
         let liveness_request = Request {
-            id: u64::MAX,
+            id: LIVENESS_ID,
             inputs: "liveness".to_string(),
             truncate: 10,
             parameters: Some(NextTokenChooserParameters {
@@ -47,7 +51,7 @@ impl Health {
             }),
         };
         let batch = Batch {
-            id: u64::MAX,
+            id: BATCH_ID,
             requests: vec![liveness_request],
             size: 1,
             max_tokens: 2,

View File

@@ -0,0 +1,54 @@
+# test_watermark_logits_processor.py
+
+import os
+import numpy as np
+import torch
+
+from text_generation_server.utils.watermark import WatermarkLogitsProcessor
+
+GAMMA = os.getenv("WATERMARK_GAMMA", 0.5)
+DELTA = os.getenv("WATERMARK_DELTA", 2.0)
+
+
+def test_seed_rng():
+    input_ids = [101, 2036, 3731, 102, 2003, 103]
+    processor = WatermarkLogitsProcessor()
+    processor._seed_rng(input_ids)
+    assert isinstance(processor.rng, torch.Generator)
+
+
+def test_get_greenlist_ids():
+    input_ids = [101, 2036, 3731, 102, 2003, 103]
+    processor = WatermarkLogitsProcessor()
+    result = processor._get_greenlist_ids(input_ids, 10, torch.device("cpu"))
+    assert max(result) <= 10
+    assert len(result) == int(10 * 0.5)
+
+
+def test_calc_greenlist_mask():
+    processor = WatermarkLogitsProcessor()
+    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
+    greenlist_token_ids = torch.tensor([2, 3])
+    result = processor._calc_greenlist_mask(scores, greenlist_token_ids)
+    assert result.tolist() == [[False, False, False, False], [False, False, True, True]]
+    assert result.shape == scores.shape
+
+
+def test_bias_greenlist_logits():
+    processor = WatermarkLogitsProcessor()
+    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
+    green_tokens_mask = torch.tensor(
+        [[False, False, True, True], [False, False, False, True]]
+    )
+    greenlist_bias = 2.0
+    result = processor._bias_greenlist_logits(scores, green_tokens_mask, greenlist_bias)
+    assert np.allclose(result.tolist(), [[0.5, 0.3, 2.2, 2.8], [0.1, 0.2, 0.7, 2.9]])
+    assert result.shape == scores.shape
+
+
+def test_call():
+    input_ids = [101, 2036, 3731, 102, 2003, 103]
+    processor = WatermarkLogitsProcessor()
+    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
+    result = processor(input_ids, scores)
+    assert result.shape == scores.shape
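
These tests pin down the surface of `WatermarkLogitsProcessor`: seed an RNG from the prompt, draw a "green list" covering a `gamma` fraction of the vocabulary, and add a `delta` bias to green-list logits. Below is a minimal sketch of a processor that would satisfy them, loosely following the Kirchenbauer et al. green-list watermarking scheme; the real implementation lives in `text_generation_server/utils/watermark.py` and may differ in its seeding and masking details, so treat every body here as an illustrative assumption:

```python
import os

import torch

GAMMA = float(os.getenv("WATERMARK_GAMMA", 0.5))
DELTA = float(os.getenv("WATERMARK_DELTA", 2.0))


class WatermarkLogitsProcessor:
    def __init__(self, gamma: float = GAMMA, delta: float = DELTA):
        self.gamma = gamma  # fraction of the vocabulary put on the green list
        self.delta = delta  # logit bias added to green-list tokens
        self.rng = None

    def _seed_rng(self, input_ids):
        # Reseed deterministically from the last token so the same prefix
        # always yields the same green list (an assumed seeding rule).
        self.rng = torch.Generator()
        self.rng.manual_seed(int(input_ids[-1]))

    def _get_greenlist_ids(self, input_ids, vocab_size, device):
        self._seed_rng(input_ids)
        n_green = int(vocab_size * self.gamma)
        perm = torch.randperm(vocab_size, generator=self.rng)
        return perm[:n_green].to(device)

    def _calc_greenlist_mask(self, scores, greenlist_token_ids):
        # The tests only expect the final (current-step) row to be masked.
        mask = torch.zeros_like(scores, dtype=torch.bool)
        mask[-1, greenlist_token_ids] = True
        return mask

    def _bias_greenlist_logits(self, scores, green_mask, bias):
        # Raise green-list logits by the bias, leaving the rest untouched.
        return scores + bias * green_mask

    def __call__(self, input_ids, scores):
        ids = self._get_greenlist_ids(input_ids, scores.shape[-1], scores.device)
        mask = self._calc_greenlist_mask(scores, ids)
        return self._bias_greenlist_logits(scores, mask, self.delta)
```

Under this scheme, a detector that knows the seeding rule can recompute the green list at each position and flag text whose green-token fraction is improbably high.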

View File

@@ -58,8 +58,6 @@ __all__ = [
     "GalacticaSharded",
     "GPTNeoxSharded",
     "Seq2SeqLM",
-    "Galactica",
-    "GalacticaSharded",
     "SantaCoder",
     "OPT",
     "OPTSharded",