feat(server): add watermarking tests (#248)

parent b9ae7e5da1
commit f092ba9b22

@@ -22,4 +22,4 @@ jobs:
       - name: Run tests
         run: |
           pip install pytest pytest-asyncio
-          cd clients/python && pytest tests
+          make python-client-tests
@@ -66,7 +66,7 @@ jobs:
       - name: Run server tests
         run: |
           pip install pytest
-          HF_HUB_ENABLE_HF_TRANSFER=1 pytest -sv server/tests
+          make python-server-tests
       - name: Run Rust fmt
         run: |
           cargo fmt --check

Makefile (9 changed lines)

@@ -21,8 +21,13 @@ router-dev:
 integration-tests: install-router install-launcher
 	cargo test

-python-tests:
-	cd server && HF_HUB_ENABLE_HF_TRANSFER=1 pytest tests
+python-server-tests:
+	HF_HUB_ENABLE_HF_TRANSFER=1 pytest server/tests
+
+python-client-tests:
+	pytest clients/python/tests
+
+python-tests: python-server-tests python-client-tests

 run-bloom-560m:
 	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --port 8080

README.md (17 changed lines)

@@ -36,7 +36,7 @@ to power LLMs api-inference widgets.
 - [Quantization](#quantization)
 - [Develop](#develop)
 - [Testing](#testing)

 ## Features

 - Serve the most popular Large Language Models with a simple launcher
@@ -131,7 +131,7 @@ by setting the address to an OTLP collector with the `--otlp-endpoint` argument.

 ### A note on Shared Memory (shm)

 [`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
 `PyTorch` to do distributed training/inference. `text-generation-inference` make
 use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.

@@ -152,14 +152,14 @@ creating a volume with:

 and mounting it to `/dev/shm`.

 Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that
 this will impact performance.

 ### Local install

 You can also opt to install `text-generation-inference` locally.

 First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
 Python 3.9, e.g. using `conda`:

 ```shell
@@ -181,7 +181,7 @@ sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
 rm -f $PROTOC_ZIP
 ```

 On MacOS, using Homebrew:

 ```shell
 brew install protobuf
@@ -241,6 +241,11 @@ make router-dev
 ## Testing

 ```shell
+# python
+make python-server-tests
+make python-client-tests
+# or both server and client tests
 make python-tests
+# rust cargo tests
 make integration-tests
 ```

@@ -21,6 +21,9 @@ pytest = "^6.2.5"
 pytest-asyncio = "^0.17.2"
 pytest-cov = "^3.0.0"

+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+
 [build-system]
-requires = ["poetry-core"]
+requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
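
The `[tool.pytest.ini_options]` addition above (this hunk appears to belong to the Python client's `pyproject.toml`) switches pytest-asyncio into auto mode, so coroutine test functions are collected and awaited even without an explicit `@pytest.mark.asyncio` marker; the markers already present in the test suite keep working but become optional. A minimal sketch of what auto mode allows, assuming `pytest-asyncio` is installed; the test itself is illustrative and not part of this commit:

```python
# Illustrative only: under asyncio_mode = "auto", pytest-asyncio treats this bare
# coroutine function as an async test, no explicit @pytest.mark.asyncio marker needed.
import asyncio


async def test_runs_inside_an_event_loop():
    # A trivial await proves the body executes inside a running event loop.
    await asyncio.sleep(0)
    assert asyncio.get_running_loop() is not None
```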

@@ -87,6 +87,19 @@ async def test_generate_async(flan_t5_xxl_url, hf_headers):
     )


+@pytest.mark.asyncio
+async def test_generate_async_best_of(flan_t5_xxl_url, hf_headers):
+    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+    response = await client.generate(
+        "test", max_new_tokens=1, best_of=2, do_sample=True
+    )
+
+    assert response.details.seed is not None
+    assert response.details.best_of_sequences is not None
+    assert len(response.details.best_of_sequences) == 1
+    assert response.details.best_of_sequences[0].seed is not None
+
+
 @pytest.mark.asyncio
 async def test_generate_async_not_found(fake_url, hf_headers):
     client = AsyncClient(fake_url, hf_headers)
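
The new `test_generate_async_best_of` test above checks that with `best_of=2` and sampling enabled, the response exposes a sampling seed and exactly one alternative sequence in `details.best_of_sequences`, which carries its own seed. As a rough usage sketch outside the pytest fixtures (the endpoint URL below is a placeholder, not something this commit defines):

```python
# Illustrative only: calling generate() with best_of against a locally running server.
# The URL is hypothetical; in the test suite, fixtures supply the real endpoint and headers.
import asyncio

from text_generation import AsyncClient


async def main():
    client = AsyncClient("http://127.0.0.1:8080")  # placeholder endpoint
    response = await client.generate(
        "test", max_new_tokens=1, best_of=2, do_sample=True
    )
    # best_of=2 returns the best generation as the main response and exposes the
    # other candidate, with its seed, in details.best_of_sequences.
    print(response.details.seed, len(response.details.best_of_sequences))


asyncio.run(main())
```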

@@ -20,7 +20,7 @@ impl ShardedClient {
     /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
     async fn from_master_client(mut master_client: Client) -> Result<Self> {
         // Get all uris/unix sockets from the master client
-        let uris = master_client.service_discovery().await.unwrap();
+        let uris = master_client.service_discovery().await?;
         let futures = uris.into_iter().map(Client::connect_uds);
         let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
         Ok(Self::new(clients?))

@@ -4,6 +4,10 @@ use text_generation_client::{
     Batch, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters,
 };

+// Note: Request ids and batch ids cannot collide.
+const LIVENESS_ID: u64 = u64::MAX;
+const BATCH_ID: u64 = u64::MAX;
+
 #[derive(Clone, Debug)]
 pub(crate) struct Health {
     client: ShardedClient,
@@ -27,7 +31,7 @@ impl Health {

         // Dummy batch of 1 token and 1 generated token
         let liveness_request = Request {
-            id: u64::MAX,
+            id: LIVENESS_ID,
             inputs: "liveness".to_string(),
             truncate: 10,
             parameters: Some(NextTokenChooserParameters {
@@ -47,7 +51,7 @@ impl Health {
             }),
         };
         let batch = Batch {
-            id: u64::MAX,
+            id: BATCH_ID,
             requests: vec![liveness_request],
             size: 1,
             max_tokens: 2,

@@ -0,0 +1,54 @@
+# test_watermark_logits_processor.py
+
+import os
+import numpy as np
+import torch
+from text_generation_server.utils.watermark import WatermarkLogitsProcessor
+
+
+GAMMA = os.getenv("WATERMARK_GAMMA", 0.5)
+DELTA = os.getenv("WATERMARK_DELTA", 2.0)
+
+
+def test_seed_rng():
+    input_ids = [101, 2036, 3731, 102, 2003, 103]
+    processor = WatermarkLogitsProcessor()
+    processor._seed_rng(input_ids)
+    assert isinstance(processor.rng, torch.Generator)
+
+
+def test_get_greenlist_ids():
+    input_ids = [101, 2036, 3731, 102, 2003, 103]
+    processor = WatermarkLogitsProcessor()
+    result = processor._get_greenlist_ids(input_ids, 10, torch.device("cpu"))
+    assert max(result) <= 10
+    assert len(result) == int(10 * 0.5)
+
+
+def test_calc_greenlist_mask():
+    processor = WatermarkLogitsProcessor()
+    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
+    greenlist_token_ids = torch.tensor([2, 3])
+    result = processor._calc_greenlist_mask(scores, greenlist_token_ids)
+    assert result.tolist() == [[False, False, False, False], [False, False, True, True]]
+    assert result.shape == scores.shape
+
+
+def test_bias_greenlist_logits():
+    processor = WatermarkLogitsProcessor()
+    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
+    green_tokens_mask = torch.tensor(
+        [[False, False, True, True], [False, False, False, True]]
+    )
+    greenlist_bias = 2.0
+    result = processor._bias_greenlist_logits(scores, green_tokens_mask, greenlist_bias)
+    assert np.allclose(result.tolist(), [[0.5, 0.3, 2.2, 2.8], [0.1, 0.2, 0.7, 2.9]])
+    assert result.shape == scores.shape
+
+
+def test_call():
+    input_ids = [101, 2036, 3731, 102, 2003, 103]
+    processor = WatermarkLogitsProcessor()
+    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
+    result = processor(input_ids, scores)
+    assert result.shape == scores.shape
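
The new test module above exercises `WatermarkLogitsProcessor` from `text_generation_server.utils.watermark` step by step: seeding the RNG from the input ids, drawing a greenlist over the vocabulary, building a boolean mask over the scores, and biasing the greenlisted logits. As a reading aid, here is a minimal sketch of a processor shaped to satisfy those assertions; the class name, seeding scheme, and defaults are assumptions for illustration, not the actual implementation:

```python
# Illustrative sketch only: a minimal processor consistent with the assertions in the
# tests above. The real WatermarkLogitsProcessor lives in
# text_generation_server.utils.watermark and may differ in its details.
import torch


class SketchWatermarkProcessor:
    def __init__(self, gamma: float = 0.5, delta: float = 2.0, device: str = "cpu"):
        self.gamma = gamma  # fraction of the vocabulary placed on the greenlist
        self.delta = delta  # bias added to greenlisted logits
        self.rng = torch.Generator(device=device)

    def _seed_rng(self, input_ids):
        # Seed deterministically from the last token so the greenlist is
        # reproducible for a given context.
        self.rng.manual_seed(int(input_ids[-1]))

    def _get_greenlist_ids(self, input_ids, max_value, device):
        # Keep the first gamma fraction of a seeded permutation of the vocabulary.
        self._seed_rng(input_ids)
        greenlist_size = int(max_value * self.gamma)
        vocab_permutation = torch.randperm(max_value, device=device, generator=self.rng)
        return vocab_permutation[:greenlist_size]

    def _calc_greenlist_mask(self, scores, greenlist_token_ids):
        # Only the last row (the current decoding step) gets greenlist positions set.
        mask = torch.zeros_like(scores, dtype=torch.bool)
        mask[-1, greenlist_token_ids] = True
        return mask

    def _bias_greenlist_logits(self, scores, greenlist_mask, greenlist_bias):
        # Add the bias wherever the mask is True.
        scores[greenlist_mask] = scores[greenlist_mask] + greenlist_bias
        return scores

    def __call__(self, input_ids, scores):
        greenlist_ids = self._get_greenlist_ids(input_ids, scores.shape[-1], scores.device)
        mask = self._calc_greenlist_mask(scores, greenlist_ids)
        return self._bias_greenlist_logits(scores, mask, self.delta)
```

Under these assumptions, `test_get_greenlist_ids` passes because `torch.randperm(10)` yields ids in `[0, 9]` and the greenlist keeps `int(10 * 0.5)` of them, and `test_calc_greenlist_mask` passes because only the last row of the mask is populated.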

@@ -58,8 +58,6 @@ __all__ = [
     "GalacticaSharded",
     "GPTNeoxSharded",
     "Seq2SeqLM",
-    "Galactica",
-    "GalacticaSharded",
     "SantaCoder",
    "OPT",
     "OPTSharded",
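
The last hunk removes a repeated `"GalacticaSharded"` entry, along with a `"Galactica"` entry, from the models package's `__all__`. Duplicates in `__all__` are harmless at runtime but misleading; an illustrative way to check for them (the module path `text_generation_server.models` is an assumption inferred from the import style used elsewhere in this commit, not stated in the hunk):

```python
# Illustrative only: flag duplicate names in a package's __all__ list.
# The module path below is assumed, not given by this diff.
import importlib
from collections import Counter

models = importlib.import_module("text_generation_server.models")
duplicates = [name for name, count in Counter(models.__all__).items() if count > 1]
assert not duplicates, f"duplicate __all__ entries: {duplicates}"
```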