Fix after rebase.
parent 1b86d0f31d
commit cacba5f21f
@@ -1,5 +1,6 @@
 import torch
 from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import FLASH_DECODING
 
 _PARTITION_SIZE = 512
 
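The added import pulls a module-level feature flag from text_generation_server.models.globals. Its definition is not part of this commit; the sketch below is only a hypothetical illustration of such a flag, assuming it is toggled by an environment variable of the same name:

# Hypothetical sketch of an environment-driven feature flag; the real
# FLASH_DECODING definition lives in models/globals.py and may differ.
import os

# Enable the flash-decoding code path when the FLASH_DECODING env var
# is set to a truthy value.
FLASH_DECODING = os.getenv("FLASH_DECODING", "").lower() in {"1", "true", "yes"}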
@@ -125,7 +126,9 @@ def attention(
     else:
         from vllm._C import ops
 
-        use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
+        use_v1 = max_s <= 8192 and (
+            max_num_partitions == 1 or num_seqs * num_heads > 512
+        )
         if use_v1:
             ops.paged_attention_v1(
                 out,
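The reformatted use_v1 condition is the heuristic that picks between vLLM's single-pass paged_attention_v1 kernel and the partitioned split-and-reduce v2 kernel. Below is a minimal, self-contained sketch of that dispatch, assuming max_num_partitions is derived from max_s and _PARTITION_SIZE by ceiling division (vLLM's convention; the derivation is not shown in this diff, and the helper name should_use_v1 is hypothetical):

_PARTITION_SIZE = 512


def should_use_v1(max_s: int, num_seqs: int, num_heads: int) -> bool:
    """Return True when the single-pass paged_attention_v1 kernel should run.

    v1 handles each sequence in one pass, so it is preferred when the
    sequence fits in a single partition or when there is already enough
    parallel work (num_seqs * num_heads) to keep the GPU busy; otherwise
    the partitioned v2 kernel is used.
    """
    # Assumed: partitions cover the sequence in _PARTITION_SIZE chunks,
    # rounded up.
    max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
    return max_s <= 8192 and (
        max_num_partitions == 1 or num_seqs * num_heads > 512
    )


if __name__ == "__main__":
    # Short sequence, single partition -> v1.
    print(should_use_v1(max_s=300, num_seqs=1, num_heads=32))  # True
    # Long sequence, little parallelism -> v2.
    print(should_use_v1(max_s=4096, num_seqs=2, num_heads=8))  # False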