feat(router): add header option to disable buffering for the generate_stream response (#498)
# This PR adds an HTTP header option to disable buffering for the `generate_stream` endpoint's response stream. Problem: If a model is run behind a proxy server such as nginx that has buffering enabled, the response stream from `generate_stream` gets aggregated into a single response, which effectively disables streaming. Instead of receiving a chunked response in which each token arrives over time, the client receives everything at once. Solution: This change adds the `X-Accel-Buffering: no` HTTP header to the `generate_stream` response, which disables buffering for that response and allows it to stream properly.
This commit is contained in:
parent
ae466a8736
commit
70f485bf9f
|
@ -351,6 +351,7 @@ async fn generate_stream(
|
||||||
"x-compute-characters",
|
"x-compute-characters",
|
||||||
compute_characters.to_string().parse().unwrap(),
|
compute_characters.to_string().parse().unwrap(),
|
||||||
);
|
);
|
||||||
|
headers.insert("X-Accel-Buffering", "no".parse().unwrap());
|
||||||
|
|
||||||
let stream = async_stream::stream! {
|
let stream = async_stream::stream! {
|
||||||
// Inference
|
// Inference
|
||||||
|
|
Loading…
Reference in New Issue