feat(router): add header option to disable buffering for the generate_stream response (#498)
# This PR adds an HTTP header option to disable buffering for the generate_stream endpoint response stream. Problem: If a model is run behind a proxy server such as nginx that has buffering enabled, the response stream from generate_stream gets aggregated into a single response, which effectively disables streaming: instead of a chunked response in which each token arrives over time, the client receives everything at once. Solution: This change adds the `X-Accel-Buffering: no` HTTP header to the generate_stream response, which disables proxy buffering for that response and allows it to stream properly.
This commit is contained in:
parent
ae466a8736
commit
70f485bf9f
|
@ -351,6 +351,7 @@ async fn generate_stream(
|
|||
"x-compute-characters",
|
||||
compute_characters.to_string().parse().unwrap(),
|
||||
);
|
||||
headers.insert("X-Accel-Buffering", "no".parse().unwrap());
|
||||
|
||||
let stream = async_stream::stream! {
|
||||
// Inference
|
||||
|
|
Loading…
Reference in New Issue