hf_text-generation-inference/server/text_generation_server/utils/dist.py

import os
import torch

from datetime import timedelta
from loguru import logger
from text_generation_server.utils.import_utils import SYSTEM

# Tensor-parallel topology, read once at import time from the environment.
# Unset variables fall back to rank 0 in a world of size 1.
RANK = int(os.environ.get("RANK", "0"))
WORLD_SIZE = int(os.environ.get("WORLD_SIZE", "1"))

# Value later passed to torch.cuda.set_per_process_memory_fraction;
# defaults to 1.0 when CUDA_MEMORY_FRACTION is unset.
MEMORY_FRACTION = float(os.environ.get("CUDA_MEMORY_FRACTION", "1.0"))


class FakeBarrier:
    """No-op handle exposing a ``wait`` method that does nothing."""

    def wait(self):
        """Return immediately; there is nothing to wait for."""
        return None


class FakeGroup:
    """In-process stand-in for a process group.

    It stores a rank and a size, and answers every collective call with a
    ``FakeBarrier`` instead of communicating with other processes.
    """

    def __init__(self, rank, size):
        self._size = size
        self._rank = rank

    def rank(self):
        """Return the rank supplied at construction."""
        return self._rank

    def size(self):
        """Return the group size supplied at construction."""
        return self._size

    def barrier(self, *args, **kwargs):
        """Ignore all arguments and return a no-op handle."""
        return FakeBarrier()

    def allreduce(self, *args, **kwargs):
        """Ignore all arguments and return a no-op handle."""
        return FakeBarrier()

    def allgather(self, inputs, local_tensor, **kwargs):
        """Point the first entry of every list in ``inputs`` at the local data.

        Both ``inputs[0]`` and ``local_tensor`` must have length 1; the
        ``.data`` of ``local_tensor[0]`` is assigned to the ``.data`` of the
        first element of each list in ``inputs``. Returns a no-op handle.
        """
        first_len = len(inputs[0])
        local_len = len(local_tensor)
        assert (
            first_len == local_len == 1
        ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"
        for gathered in inputs:
            gathered[0].data = local_tensor[0].data
        return FakeBarrier()


def initialize_torch_distributed():
    """Prepare the process group used for tensor parallelism.

    When CUDA is available, the current device is set to
    ``RANK % torch.cuda.device_count()``, the per-process memory fraction is
    applied to it, and the ``nccl`` backend is selected with high-priority
    stream options. Otherwise the ``gloo`` backend is selected without
    options.

    Returns:
        A ``(process_group, RANK, WORLD_SIZE)`` tuple. ``process_group`` is a
        ``FakeGroup`` when ``WORLD_SIZE`` is 1 or the ``DEBUG`` environment
        variable equals ``"1"``; otherwise it is
        ``torch.distributed.group.WORLD``.
    """
    # One timeout value is shared by the NCCL options and the init call.
    init_timeout = timedelta(seconds=60)

    if not torch.cuda.is_available():
        backend, options = "gloo", None
    else:
        from torch.distributed import ProcessGroupNCCL

        # Set the device id.
        gpu_count = torch.cuda.device_count()
        assert WORLD_SIZE <= gpu_count, "Each process is one gpu"
        device = RANK % gpu_count
        torch.cuda.set_device(device)
        torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device)

        backend = "nccl"
        options = ProcessGroupNCCL.Options()
        options.is_high_priority_stream = True
        options._timeout = init_timeout

    # Single-process runs, and multi-process runs with DEBUG=1, skip
    # torch.distributed entirely. DEBUG is only read when WORLD_SIZE != 1.
    if WORLD_SIZE == 1 or os.getenv("DEBUG", None) == "1":
        return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE

    if torch.distributed.is_initialized():
        logger.warning("torch.distributed is already initialized.")
        return torch.distributed.group.WORLD, RANK, WORLD_SIZE

    # Call the init process. Everything but the backend is shared between
    # the ipex and the plain torch entry points.
    shared_kwargs = {
        "world_size": WORLD_SIZE,
        "rank": RANK,
        "timeout": init_timeout,
        "pg_options": options,
    }
    if SYSTEM == "ipex":
        import intel_extension_for_pytorch as ipex

        ipex.distributed.init_process_group(backend="ccl", **shared_kwargs)
    else:
        torch.distributed.init_process_group(backend=backend, **shared_kwargs)

    return torch.distributed.group.WORLD, RANK, WORLD_SIZE
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`import os`
			`import torch`

			`from datetime import timedelta`
fix(server): Do not init process group if already initialized (#388) 2023-06-26 04:32:54 -06:00			`from loguru import logger`
Removing IPEX_AVAIL. (#2115) * Removing IPEX_AVAIL. Chose to unify CPU and XPU under `ipex`. Most code is exactly similar except for a very few spots. The biggest number of spots is the kv-cache layout and the flash_xxx.py files. Since those files should be removed soon and factored away, we should not need them. * Forgot a few places. * Unrelated change. * Fixing HF_TOKEN. * HF_TOKEN 2024-06-25 05:20:57 -06:00			`from text_generation_server.utils.import_utils import SYSTEM`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00
feat: add cuda memory fraction (#659) Close #673 2023-07-24 03:43:58 -06:00			`# Tensor Parallelism settings`
			`RANK = int(os.getenv("RANK", "0"))`
			`WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1"))`

			`# CUDA memory fraction`
			`MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0"))`

feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00
feat(server): Rework model loading (#344) # What does this PR do? Reworked the loading logic. Idea is to use cleaner loading code: - Remove need for `no_init_weights` - Remove all weird `bnb_linear` and `load_weights` and `post_load_weights`. New code layout: - New class `Weights` in charge of handling loading the weights from multiple files into appropiate tensors (potentially sharded) - TP layers now are "shells", they contain the code to know what kind of sharding we need + eventual `all_reduce`. They do not inherit from linear, but they contain some kind of Linear instead - the contained linear can be either FastLinear, BnbLinear or GPTq Linear next. - All modeling code is explictly made for sharding, process group is just no-ops for non sharded code (removes a lot of test cases) ![Screenshot from 2023-05-19 23-19-59](https://github.com/huggingface/text-generation-inference/assets/204321/9a802654-74a3-488c-87a8-073743a6143f) --------- Co-authored-by: Ubuntu <ubuntu@ip-172-31-41-161.taildb5d.ts.net> Co-authored-by: Ubuntu <ubuntu@ip-172-31-41-161.ec2.internal> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2023-06-08 06:51:52 -06:00			`class FakeBarrier:`
			`def wait(self):`
			`pass`


			`class FakeGroup:`
			`def __init__(self, rank, size):`
			`self._rank = rank`
			`self._size = size`

			`def allreduce(self, *args, **kwargs):`
			`return FakeBarrier()`

			`def allgather(self, inputs, local_tensor, **kwargs):`
			`assert (`
			`len(inputs[0]) == len(local_tensor) == 1`
			`), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"`
			`for input_ in inputs:`
			`input_[0].data = local_tensor[0].data`
			`return FakeBarrier()`

			`def barrier(self, *args, **kwargs):`
			`return FakeBarrier()`

			`def size(self):`
			`return self._size`

			`def rank(self):`
			`return self._rank`


feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`def initialize_torch_distributed():`
			`if torch.cuda.is_available():`
			`from torch.distributed import ProcessGroupNCCL`

			`# Set the device id.`
feat: add cuda memory fraction (#659) Close #673 2023-07-24 03:43:58 -06:00			`assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu"`
			`device = RANK % torch.cuda.device_count()`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`torch.cuda.set_device(device)`
feat: add cuda memory fraction (#659) Close #673 2023-07-24 03:43:58 -06:00			`torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device)`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`backend = "nccl"`
			`options = ProcessGroupNCCL.Options()`
			`options.is_high_priority_stream = True`
			`options._timeout = timedelta(seconds=60)`
			`else:`
Cpu tgi (#1936) * add CPU tgi support Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * ipex distributed ops support Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Co-authored-by: Funtowicz Morgan <mfuntowicz@users.noreply.github.com> 2024-06-25 04:21:29 -06:00			`backend = "gloo"`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00			`options = None`

feat: add cuda memory fraction (#659) Close #673 2023-07-24 03:43:58 -06:00			`if WORLD_SIZE == 1:`
			`return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE`
feat(server): Rework model loading (#344) # What does this PR do? Reworked the loading logic. Idea is to use cleaner loading code: - Remove need for `no_init_weights` - Remove all weird `bnb_linear` and `load_weights` and `post_load_weights`. New code layout: - New class `Weights` in charge of handling loading the weights from multiple files into appropiate tensors (potentially sharded) - TP layers now are "shells", they contain the code to know what kind of sharding we need + eventual `all_reduce`. They do not inherit from linear, but they contain some kind of Linear instead - the contained linear can be either FastLinear, BnbLinear or GPTq Linear next. - All modeling code is explictly made for sharding, process group is just no-ops for non sharded code (removes a lot of test cases) ![Screenshot from 2023-05-19 23-19-59](https://github.com/huggingface/text-generation-inference/assets/204321/9a802654-74a3-488c-87a8-073743a6143f) --------- Co-authored-by: Ubuntu <ubuntu@ip-172-31-41-161.taildb5d.ts.net> Co-authored-by: Ubuntu <ubuntu@ip-172-31-41-161.ec2.internal> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2023-06-08 06:51:52 -06:00			`else:`
			`if os.getenv("DEBUG", None) == "1":`
feat: add cuda memory fraction (#659) Close #673 2023-07-24 03:43:58 -06:00			`return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE`
fix(server): Do not init process group if already initialized (#388) 2023-06-26 04:32:54 -06:00
			`if not torch.distributed.is_initialized():`
			`# Call the init process.`
Removing IPEX_AVAIL. (#2115) * Removing IPEX_AVAIL. Chose to unify CPU and XPU under `ipex`. Most code is exactly similar except for a very few spots. The biggest number of spots is the kv-cache layout and the flash_xxx.py files. Since those files should be removed soon and factored away, we should not need them. * Forgot a few places. * Unrelated change. * Fixing HF_TOKEN. * HF_TOKEN 2024-06-25 05:20:57 -06:00			`if SYSTEM == "ipex":`
Cpu tgi (#1936) * add CPU tgi support Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * ipex distributed ops support Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Co-authored-by: Funtowicz Morgan <mfuntowicz@users.noreply.github.com> 2024-06-25 04:21:29 -06:00			`import intel_extension_for_pytorch as ipex`

			`ipex.distributed.init_process_group(`
			`backend="ccl",`
			`world_size=WORLD_SIZE,`
			`rank=RANK,`
			`timeout=timedelta(seconds=60),`
			`pg_options=options,`
			`)`
			`else:`
			`torch.distributed.init_process_group(`
			`backend=backend,`
			`world_size=WORLD_SIZE,`
			`rank=RANK,`
			`timeout=timedelta(seconds=60),`
			`pg_options=options,`
			`)`
fix(server): Do not init process group if already initialized (#388) 2023-06-26 04:32:54 -06:00			`else:`
			`logger.warning("torch.distributed is already initialized.")`
feat: add safetensors conversion (#63) 2023-02-14 05:02:16 -07:00
feat: add cuda memory fraction (#659) Close #673 2023-07-24 03:43:58 -06:00			`return torch.distributed.group.WORLD, RANK, WORLD_SIZE`