feat(server): empty cache on errors
parent 67347950b7
commit f2f0289fb9
@@ -1,3 +1,4 @@
+import torch
 import grpc
 
 from google.rpc import status_pb2, code_pb2
@@ -22,6 +23,9 @@ class ExceptionInterceptor(AsyncServerInterceptor):
             method_name = method_name.split("/")[-1]
             logger.exception(f"Method {method_name} encountered an error.")
 
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
             await context.abort_with_status(
                 rpc_status.to_status(
                     status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
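For context, a minimal sketch of what the intercepted error path looks like after this change. It assumes the class is built on grpc-interceptor's AsyncServerInterceptor (consistent with the base class named in the hunk header) and a loguru-style logger; everything outside the lines shown in the diff is a reconstruction, not the exact upstream code.

import torch
from google.rpc import status_pb2, code_pb2
from grpc_interceptor.server import AsyncServerInterceptor
from grpc_status import rpc_status
from loguru import logger


class ExceptionInterceptor(AsyncServerInterceptor):
    async def intercept(self, method, request_or_iterator, context, method_name):
        try:
            # Run the wrapped handler; any exception it raises lands below.
            return await method(request_or_iterator, context)
        except Exception as err:
            method_name = method_name.split("/")[-1]
            logger.exception(f"Method {method_name} encountered an error.")

            # New in this commit: hand cached allocator blocks back to the
            # driver before reporting the failure, so a request that died
            # (e.g. on CUDA OOM) does not keep GPU memory reserved.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            await context.abort_with_status(
                rpc_status.to_status(
                    status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
                )
            )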
@@ -639,7 +639,6 @@ class FlashCausalLMBatch(Batch):
         for b in batches:
             b.block_tables = None
             del b
-        torch.cuda.empty_cache()
 
         return FlashCausalLMBatch(
             batch_id=batches[0].batch_id,
@@ -733,7 +732,6 @@ class FlashCausalLM(Model):
                 f"You need to decrease `--max-batch-total-tokens` or `--max-batch-prefill-tokens`"
             ) from e
         del batch
-        torch.cuda.empty_cache()
 
     def decode(self, generated_ids: Union[torch.Tensor, List[int]]) -> str:
         return self.tokenizer.decode(
@@ -790,7 +788,6 @@ class FlashCausalLM(Model):
             )
         except Exception as e:
             del batch
-            torch.cuda.empty_cache()
             raise e
 
         if prefill:
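The three removals above take torch.cuda.empty_cache() out of FlashCausalLMBatch.concatenate and out of two places in FlashCausalLM where a batch is deleted. Those call sites now only drop their references; on the error path the cache is emptied once, in the ExceptionInterceptor, when the exception propagates out of the servicer. A minimal sketch of the resulting pattern (the function and the generate_token method name are illustrative, not taken from the diff):

def run_with_cleanup(model, batch):
    # Sketch of the pattern after this change: the call site releases its own
    # reference to the failed batch and re-raises; it no longer calls
    # torch.cuda.empty_cache() itself, because the gRPC ExceptionInterceptor
    # does that once the exception escapes the servicer method.
    try:
        return model.generate_token(batch)  # illustrative method name
    except Exception:
        del batch  # drop the tensors held by the failed batch
        raise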
@@ -51,6 +51,9 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         filtered_batch = batch.filter(request.request_ids)
         self.cache.set(filtered_batch)
 
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
         return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
 
     async def Warmup(self, request, context):
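For background on what the added guard buys: deleting the filtered-out requests only returns their tensors to PyTorch's caching allocator, which keeps the memory reserved by the process; empty_cache() is what hands the unused blocks back to the CUDA driver. A standalone illustration using only public PyTorch calls:

import torch

if torch.cuda.is_available():
    x = torch.empty(1024, 1024, 256, device="cuda")  # ~1 GiB of float32
    del x
    # The tensor is gone, but the caching allocator still holds its blocks.
    print("reserved after del:        ", torch.cuda.memory_reserved())
    torch.cuda.empty_cache()
    # The unused blocks are now released back to the driver.
    print("reserved after empty_cache:", torch.cuda.memory_reserved())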
@@ -58,6 +61,10 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
             request.batch, self.model.tokenizer, self.model.dtype, self.model.device
         )
         self.model.warmup(batch, request.max_total_tokens)
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
         return generate_pb2.WarmupResponse()
 
     async def Prefill(self, request, context):
@@ -89,6 +96,8 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
 
         if len(batches) > 1:
             batch = self.model.batch_type.concatenate(batches)
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
         else:
             batch = batches[0]
 
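The same two-line guard now appears after FilterBatch, after Warmup, and after the concatenation above. A possible follow-up, not part of this commit, would be to factor it into a tiny helper so the call sites cannot drift apart; a sketch using only the public PyTorch API:

import torch


def empty_cache_if_available() -> None:
    # Release unused blocks held by the CUDA caching allocator.
    # The is_available() guard keeps CPU-only deployments on the same code
    # path, where the whole call is simply skipped.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Each servicer method would then call empty_cache_if_available() once it has dropped its references, instead of repeating the guard inline.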