diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index 6b96d99..43511db 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -63,29 +63,14 @@ def stream(ws):
             except:
                 response_status_code = 0
 
-            # details = {}
-
-            # Initialize an empty byte string to store parts of the response
             partial_response = b''
 
-            # Process each part of the response as it's received
             for chunk in response.iter_content(chunk_size=1):
-                # Add the chunk to the partial response
                 partial_response += chunk
-
-                # If the partial response ends with a null character, parse it as JSON
                 if partial_response.endswith(b'\x00'):
-                    # Remove the null character and decode the byte string to a string
-                    json_str = partial_response[:-1].decode()
-
-                    # Parse the string as JSON
+                    json_str = partial_response[:-1].decode()  # Remove the null character and decode the byte string to a string
                     json_obj = json.loads(json_str)
-
-                    # Strip the input prompt from the response
-                    if generated_text:
-                        new = json_obj['text'][0].split(generated_text)[1]
-                    else:
-                        new = json_obj['text'][0].split(input_prompt)[1]
+                    new = json_obj['text'][0].split(input_prompt + generated_text)[1]
 
                     ws.send(json.dumps({
                         'event': 'text_stream',
@@ -95,9 +80,7 @@ def stream(ws):
                     message_num += 1
 
                     generated_text = generated_text + new
-
-                    # Reset the partial response
-                    partial_response = b''
+                    partial_response = b''  # Reset the partial response
 
                 # If there is no more data, break the loop
                 if not chunk:
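
For reference, below is a minimal, self-contained sketch of the parsing loop this diff arrives at. The endpoint URL, the prompt value, and the response shape (a 'text' field holding a one-element list) are assumptions for illustration; the real backend address and variable setup live elsewhere in generate_stream.py.

# A minimal sketch of the null-delimited streaming parse, under the
# assumptions stated above; not the project's exact code.
import json

import requests

input_prompt = 'Once upon a time'  # hypothetical prompt
generated_text = ''

response = requests.post('http://localhost:7000/api/v1/generate',  # hypothetical backend
                         json={'prompt': input_prompt}, stream=True)

partial_response = b''
for chunk in response.iter_content(chunk_size=1):
    partial_response += chunk
    # Each message from the backend is a null-terminated JSON blob.
    if partial_response.endswith(b'\x00'):
        json_obj = json.loads(partial_response[:-1].decode())
        # The backend returns the prompt plus everything generated so far, so
        # splitting on input_prompt + generated_text leaves only the new text.
        # This single expression covers both the first message
        # (generated_text == '') and every later one, which is what lets the
        # diff collapse the old if/else on generated_text.
        new = json_obj['text'][0].split(input_prompt + generated_text)[1]
        generated_text += new
        partial_response = b''  # Reset for the next message
    if not chunk:
        break  # No more data

print(generated_text)

One caveat on the split-based approach: split(...)[1] assumes the prompt-plus-generated prefix occurs exactly once in the returned text. If the model happens to repeat that prefix verbatim, the split produces more than two pieces and [1] yields only the segment between the first two occurrences.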