Adding more cards.
This commit is contained in:
parent
fa912440b1
commit
23c0a20dc9
|
@ -33,7 +33,7 @@ fn compute_optimal(config: Option<&Config>, compute: Option<&ComputeType>) -> Op
|
||||||
if let (Some(config), Some(compute)) = (config, compute) {
|
if let (Some(config), Some(compute)) = (config, compute) {
|
||||||
if let (Some(f16_max_compute), Some(model_compute)) = (compute.f16_flop(), config.flop()) {
|
if let (Some(f16_max_compute), Some(model_compute)) = (compute.f16_flop(), config.flop()) {
|
||||||
tracing::debug!("MAx compute {f16_max_compute} model compute {model_compute}");
|
tracing::debug!("MAx compute {f16_max_compute} model compute {model_compute}");
|
||||||
let optimal_size = (f16_max_compute / model_compute) as usize;
|
let optimal_size = (f16_max_compute / model_compute / 2) as usize;
|
||||||
if optimal_size > 100 {
|
if optimal_size > 100 {
|
||||||
// Ignore calculations that's too low
|
// Ignore calculations that's too low
|
||||||
// Most likely an error
|
// Most likely an error
|
||||||
|
@ -1500,6 +1500,8 @@ impl ComputeType {
|
||||||
let card_flop = match &self.card[..] {
|
let card_flop = match &self.card[..] {
|
||||||
// https://www.nvidia.com/en-us/data-center/l4/
|
// https://www.nvidia.com/en-us/data-center/l4/
|
||||||
"nvidia-l4" => Some(121 * 10u64.pow(12)),
|
"nvidia-l4" => Some(121 * 10u64.pow(12)),
|
||||||
|
// https://www.nvidia.com/en-us/data-center/products/a10-gpu/
|
||||||
|
"nvidia-a10g" => Some(125 * 10u64.pow(12)),
|
||||||
card => {
|
card => {
|
||||||
tracing::warn!("Unkown compute for card {card}");
|
tracing::warn!("Unkown compute for card {card}");
|
||||||
None
|
None
|
||||||
|
|
|
@ -1554,12 +1554,16 @@ class FlashCausalLM(Model):
|
||||||
)
|
)
|
||||||
batch_num_blocks = batch.num_blocks
|
batch_num_blocks = batch.num_blocks
|
||||||
|
|
||||||
|
num_tokens = batch.to_pb().current_tokens
|
||||||
|
logger.info(f"BLOCKS {batch.num_blocks}")
|
||||||
|
free_memory = get_free_memory(self.device, MEMORY_FRACTION)
|
||||||
|
logger.info(f"Free memory {free_memory}")
|
||||||
if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
|
if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
|
||||||
torch.cuda.tunable.tuning_enable(False)
|
torch.cuda.tunable.tuning_enable(False)
|
||||||
_, _batch, _ = self.generate_token(batch)
|
_, _batch, _ = self.generate_token(batch)
|
||||||
except torch.cuda.OutOfMemoryError as e:
|
except torch.cuda.OutOfMemoryError as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Not enough memory to handle {batch.to_pb().current_tokens} prefill tokens. "
|
f"Not enough memory to handle {num_tokens} prefill tokens. "
|
||||||
f"You need to decrease `--max-batch-prefill-tokens`"
|
f"You need to decrease `--max-batch-prefill-tokens`"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
|
@ -2106,6 +2110,10 @@ class FlashCausalLM(Model):
|
||||||
|
|
||||||
if prefill and prefill_logprobs:
|
if prefill and prefill_logprobs:
|
||||||
# Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size))
|
# Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size))
|
||||||
|
free_memory = get_free_memory(self.device, MEMORY_FRACTION)
|
||||||
|
logger.info(f"Free memory {free_memory / 1e9}GB")
|
||||||
|
logmemory = out.nelement() * out.element_size()
|
||||||
|
logger.info(f"Log memory {logmemory / 1e9}GB")
|
||||||
torch.log_softmax(out, -1, out=out)
|
torch.log_softmax(out, -1, out=out)
|
||||||
prefill_logprobs_tensor = out
|
prefill_logprobs_tensor = out
|
||||||
prefill_logprobs = torch.gather(
|
prefill_logprobs = torch.gather(
|
||||||
|
|
Loading…
Reference in New Issue