hf_text-generation-inference/server/text_generation_server/cli.py

import os
import sys
import typer

from pathlib import Path
from loguru import logger
from typing import Optional


app = typer.Typer()


@app.command()
def serve(
    model_id: str,
    revision: Optional[str] = None,
    sharded: bool = False,
    quantize: bool = False,
    uds_path: Path = "/tmp/text-generation-server",
    logger_level: str = "INFO",
    json_output: bool = False,
    otlp_endpoint: Optional[str] = None,
):
    """Configure logging and optional tracing, then start the model server.

    When sharded is enabled, the RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT
    environment variables must all be set.
    """
    if sharded:
        # Raised explicitly instead of using `assert` so the validation is not
        # stripped when Python runs with -O. The exception type, the messages
        # and the order of the checks are the same as before.
        for env_var in ("RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT"):
            if os.getenv(env_var) is None:
                raise AssertionError(f"{env_var} must be set when sharded is True")

    # Replace loguru's default handler with one that writes to stdout and only
    # lets through records coming from the `text_generation_server` package.
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    # Import here after the logger is added to log potential import exceptions
    from text_generation_server import server
    from text_generation_server.tracing import setup_tracing

    # Setup OpenTelemetry distributed tracing
    if otlp_endpoint is not None:
        # NOTE(review): `shard` is a str when RANK is set and the int 0
        # otherwise; confirm which type `setup_tracing` expects.
        setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint)

    server.serve(model_id, revision, sharded, quantize, uds_path)


def _safetensors_path(pt_file: Path) -> Path:
    """Return the ``.safetensors`` path located next to ``pt_file``.

    A leading ``pytorch_`` is removed from the file stem when present.
    ``str.lstrip`` must not be used for this: it strips any leading run of the
    given *characters*, which would mangle stems such as ``consolidated.00``.
    """
    prefix = "pytorch_"
    stem = pt_file.stem
    if stem.startswith(prefix):
        stem = stem[len(prefix) :]
    return pt_file.parent / f"{stem}.safetensors"


@app.command()
def download_weights(
    model_id: str,
    revision: Optional[str] = None,
    extension: str = ".safetensors",
    auto_convert: bool = True,
    logger_level: str = "INFO",
    json_output: bool = False,
):
    """Make the model weights available locally, downloading them if needed.

    Falls back to PyTorch ``.bin`` weights when no safetensors weights are
    found on the hub, and converts them to safetensors when auto-convert is
    enabled.
    """
    # Replace loguru's default handler with one that writes to stdout and only
    # lets through records coming from the `text_generation_server` package.
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    # Import here after the logger is added to log potential import exceptions
    from text_generation_server import utils

    # Test if files were already downloaded
    try:
        utils.weight_files(model_id, revision, extension)
        logger.info("Files are already present on the host. Skipping download.")
        return
    # Local files not found
    except (utils.LocalEntryNotFoundError, FileNotFoundError):
        pass

    # A model counts as local when `model_id` is an existing directory or when
    # WEIGHTS_CACHE_OVERRIDE is set; the hub is not queried in that case.
    is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv(
        "WEIGHTS_CACHE_OVERRIDE", None
    ) is not None

    if not is_local_model:
        # Try to download weights from the hub
        try:
            filenames = utils.weight_hub_files(model_id, revision, extension)
            utils.download_weights(filenames, model_id, revision)
            # Successfully downloaded weights
            return

        # No weights found on the hub with this extension
        except utils.EntryNotFoundError:
            # Only fall through to the PyTorch weights when safetensors were
            # requested and automatic conversion is allowed.
            if extension != ".safetensors" or not auto_convert:
                # Bare `raise` keeps the original traceback intact.
                raise

    # Try to see if there are local pytorch weights
    try:
        # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE
        local_pt_files = utils.weight_files(model_id, revision, ".bin")

    # No local pytorch weights
    except utils.LocalEntryNotFoundError:
        if extension == ".safetensors":
            logger.warning(
                f"No safetensors weights found for model {model_id} at revision {revision}. "
                "Downloading PyTorch weights."
            )

        # Try to see if there are pytorch weights on the hub
        pt_filenames = utils.weight_hub_files(model_id, revision, ".bin")
        # Download pytorch weights
        local_pt_files = utils.download_weights(pt_filenames, model_id, revision)

    if auto_convert:
        logger.warning(
            f"No safetensors weights found for model {model_id} at revision {revision}. "
            "Converting PyTorch weights to safetensors."
        )

        # Safetensors final filenames, one per PyTorch weight file
        local_st_files = [_safetensors_path(p) for p in local_pt_files]
        # Convert pytorch weights to safetensors
        utils.convert_files(local_pt_files, local_st_files)


# Run the Typer application when this file is executed as a script.
if __name__ == "__main__":
    app()
v0.1.0 2022-10-18 07:19:03 -06:00			`import os`
feat(launcher): Log server stdout (#19) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-01-05 04:01:23 -07:00			`import sys`
feat: Improve error handling 2022-10-17 06:59:00 -06:00			`import typer`

			`from pathlib import Path`
feat(launcher): Log server stdout (#19) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-01-05 04:01:23 -07:00			`from loguru import logger`
feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`from typing import Optional`
feat: Improve error handling 2022-10-17 06:59:00 -06:00

			`app = typer.Typer()`


			`@app.command()`
v0.1.0 2022-10-18 07:19:03 -06:00			`def serve(`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`model_id: str,`
feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`revision: Optional[str] = None,`
v0.1.0 2022-10-18 07:19:03 -06:00			`sharded: bool = False,`
feat(server): Support bitsandbytes 2022-10-27 06:25:29 -06:00			`quantize: bool = False,`
feat(benchmark): tui based benchmarking tool (#149) 2023-03-30 07:26:27 -06:00			`uds_path: Path = "/tmp/text-generation-server",`
feat(launcher): Log server stdout (#19) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-01-05 04:01:23 -07:00			`logger_level: str = "INFO",`
			`json_output: bool = False,`
feat: add distributed tracing (#62) 2023-02-13 05:02:45 -07:00			`otlp_endpoint: Optional[str] = None,`
feat: Improve error handling 2022-10-17 06:59:00 -06:00			`):`
v0.1.0 2022-10-18 07:19:03 -06:00			`if sharded:`
			`assert (`
			`os.getenv("RANK", None) is not None`
			`), "RANK must be set when sharded is True"`
			`assert (`
			`os.getenv("WORLD_SIZE", None) is not None`
			`), "WORLD_SIZE must be set when sharded is True"`
			`assert (`
			`os.getenv("MASTER_ADDR", None) is not None`
			`), "MASTER_ADDR must be set when sharded is True"`
			`assert (`
			`os.getenv("MASTER_PORT", None) is not None`
			`), "MASTER_PORT must be set when sharded is True"`

feat: add distributed tracing (#62) 2023-02-13 05:02:45 -07:00			`# Remove default handler`
			`logger.remove()`
			`logger.add(`
			`sys.stdout,`
			`format="{message}",`
feat(clients): Python client (#103) 2023-03-07 10:52:22 -07:00			`filter="text_generation_server",`
feat: add distributed tracing (#62) 2023-02-13 05:02:45 -07:00			`level=logger_level,`
			`serialize=json_output,`
			`backtrace=True,`
			`diagnose=False,`
			`)`
fix(docker): fix docker image dependencies (#187) 2023-04-16 16:26:47 -06:00
			`# Import here after the logger is added to log potential import exceptions`
			`from text_generation_server import server`
			`from text_generation_server.tracing import setup_tracing`

feat: add distributed tracing (#62) 2023-02-13 05:02:45 -07:00			`# Setup OpenTelemetry distributed tracing`
			`if otlp_endpoint is not None:`
			`setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint)`

feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`server.serve(model_id, revision, sharded, quantize, uds_path)`
feat: Improve error handling 2022-10-17 06:59:00 -06:00

			`@app.command()`
feat(server): Use safetensors Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2022-10-22 12:00:15 -06:00			`def download_weights(`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`model_id: str,`
feat(server): Support GPT-Neox (#39) 2023-01-31 10:53:56 -07:00			`revision: Optional[str] = None,`
feat(server): Support all AutoModelForCausalLM on a best effort basis 2022-10-28 11:24:00 -06:00			`extension: str = ".safetensors",`
feat(server): support hf endpoint weight layout (#266) 2023-05-03 03:36:24 -06:00			`auto_convert: bool = True,`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`logger_level: str = "INFO",`
			`json_output: bool = False,`
feat: Improve error handling 2022-10-17 06:59:00 -06:00			`):`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`# Remove default handler`
			`logger.remove()`
			`logger.add(`
			`sys.stdout,`
			`format="{message}",`
feat(clients): Python client (#103) 2023-03-07 10:52:22 -07:00			`filter="text_generation_server",`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`level=logger_level,`
			`serialize=json_output,`
			`backtrace=True,`
			`diagnose=False,`
			`)`

fix(docker): fix docker image dependencies (#187) 2023-04-16 16:26:47 -06:00			`# Import here after the logger is added to log potential import exceptions`
			`from text_generation_server import utils`

feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`# Test if files were already download`
			`try:`
			`utils.weight_files(model_id, revision, extension)`
feat(server): support hf endpoint weight layout (#266) 2023-05-03 03:36:24 -06:00			`logger.info("Files are already present on the host. " "Skipping download.")`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`return`
			`# Local files not found`
feat(server): support hf endpoint weight layout (#266) 2023-05-03 03:36:24 -06:00			`except (utils.LocalEntryNotFoundError, FileNotFoundError):`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`pass`

feat(server): support hf endpoint weight layout (#266) 2023-05-03 03:36:24 -06:00			`is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv(`
			`"WEIGHTS_CACHE_OVERRIDE", None`
			`) is not None`

			`if not is_local_model:`
			`# Try to download weights from the hub`
			`try:`
			`filenames = utils.weight_hub_files(model_id, revision, extension)`
			`utils.download_weights(filenames, model_id, revision)`
			`# Successfully downloaded weights`
			`return`

			`# No weights found on the hub with this extension`
			`except utils.EntryNotFoundError as e:`
			`# Check if we want to automatically convert to safetensors or if we can use .bin weights instead`
			`if not extension == ".safetensors" or not auto_convert:`
			`raise e`

			`# Try to see if there are local pytorch weights`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`try:`
feat(server): support hf endpoint weight layout (#266) 2023-05-03 03:36:24 -06:00			`# Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE`
			`local_pt_files = utils.weight_files(model_id, revision, ".bin")`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00
feat(server): support hf endpoint weight layout (#266) 2023-05-03 03:36:24 -06:00			`# No local pytorch weights`
			`except utils.LocalEntryNotFoundError:`
			`if extension == ".safetensors":`
			`logger.warning(`
			`f"No safetensors weights found for model {model_id} at revision {revision}. "`
			`f"Downloading PyTorch weights."`
			`)`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00
feat(server): support hf endpoint weight layout (#266) 2023-05-03 03:36:24 -06:00			`# Try to see if there are pytorch weights on the hub`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`pt_filenames = utils.weight_hub_files(model_id, revision, ".bin")`
			`# Download pytorch weights`
			`local_pt_files = utils.download_weights(pt_filenames, model_id, revision)`
feat(server): support hf endpoint weight layout (#266) 2023-05-03 03:36:24 -06:00
			`if auto_convert:`
			`logger.warning(`
			`f"No safetensors weights found for model {model_id} at revision {revision}. "`
			`f"Converting PyTorch weights to safetensors."`
			`)`

			`# Safetensors final filenames`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`local_st_files = [`
			`p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors"`
			`for p in local_pt_files`
			`]`
			`# Convert pytorch weights to safetensors`
			`utils.convert_files(local_pt_files, local_st_files)`
feat: Improve error handling 2022-10-17 06:59:00 -06:00

			`if __name__ == "__main__":`
			`app()`