From a6e4d63c86f4eeaae2ba1337a39f19d03bbd2277 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Tue, 11 Jun 2024 13:30:29 +0200 Subject: [PATCH 01/15] Update LLMM1 bound (#2050) update commit --- server/Makefile-vllm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/Makefile-vllm b/server/Makefile-vllm index 8c0437ea..2f2b5ef6 100644 --- a/server/Makefile-vllm +++ b/server/Makefile-vllm @@ -1,5 +1,5 @@ commit_cuda := b5dfc61db88a81069e45b44f7cc99bd9e62a60fa -commit_rocm := 559200c1a028de990c1ddea761b0ccd62109e3a0 +commit_rocm := c6ee53b1be97e3bbc791b95f22827501297f8921 build-vllm-cuda: if [ ! -d 'vllm' ]; then \ pip install -U ninja packaging --no-cache-dir && \ From 376a0b7ada91548a68798383cb008ea01c728b30 Mon Sep 17 00:00:00 2001 From: drbh Date: Tue, 11 Jun 2024 10:44:56 -0400 Subject: [PATCH 02/15] Support chat response format (#2046) * feat: support response_format in chat * fix: adjust typos * fix: add trufflehog lint --- .github/workflows/trufflehog.yml | 1 - ...st_grammar_response_format_llama_json.json | 23 ++++ .../test_grammar_response_format_llama.py | 101 ++++++++++++++++++ router/src/lib.rs | 8 ++ router/src/server.rs | 30 ++++-- 5 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json create mode 100644 integration-tests/models/test_grammar_response_format_llama.py diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml index 8bc60eff..b406d43b 100644 --- a/.github/workflows/trufflehog.yml +++ b/.github/workflows/trufflehog.yml @@ -16,4 +16,3 @@ jobs: fetch-depth: 0 - name: Secret Scanning uses: trufflesecurity/trufflehog@main - diff --git a/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json b/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json new file mode 100644 index 00000000..83390832 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json @@ -0,0 +1,23 @@ +{ + "choices": [ + { + "finish_reason": "eos_token", + "index": 0, + "logprobs": null, + "message": { + "content": "{\n \"temperature\": [\n 35,\n 34,\n 36\n ],\n \"unit\": \"°c\"\n}", + "role": "assistant" + } + } + ], + "created": 1718044128, + "id": "", + "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "object": "text_completion", + "system_fingerprint": "2.0.5-dev0-native", + "usage": { + "completion_tokens": 39, + "prompt_tokens": 136, + "total_tokens": 175 + } +} diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py new file mode 100644 index 00000000..9c4c048e --- /dev/null +++ b/integration-tests/models/test_grammar_response_format_llama.py @@ -0,0 +1,101 @@ +import pytest +import requests +from pydantic import BaseModel +from typing import List + + +@pytest.fixture(scope="module") +def llama_grammar_handle(launcher): + with launcher( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + num_shard=1, + disable_grammar_support=False, + use_flash_attention=False, + max_batch_prefill_tokens=3000, + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def llama_grammar(llama_grammar_handle): + await llama_grammar_handle.health(300) + return llama_grammar_handle.client + + +@pytest.mark.asyncio +async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot): + + class Weather(BaseModel): + unit: str + temperature: List[int] + + # send the request + response = requests.post( + f"{llama_grammar.base_url}/v1/chat/completions", + headers=llama_grammar.headers, + json={ + "model": "tgi", + "messages": [ + { + "role": "system", + "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}", + }, + { + "role": "user", + "content": "What's the weather like the next 3 days in San Francisco, CA?", + }, + ], + "seed": 42, + "max_tokens": 500, + "response_format": {"type": "json_object", "value": Weather.schema()}, + }, + ) + + chat_completion = response.json() + called = chat_completion["choices"][0]["message"]["content"] + + assert response.status_code == 200 + assert ( + called + == '{\n "temperature": [\n 35,\n 34,\n 36\n ],\n "unit": "°c"\n}' + ) + assert chat_completion == response_snapshot + + +@pytest.mark.asyncio +async def test_grammar_response_format_llama_error_if_tools_not_installed( + llama_grammar, +): + class Weather(BaseModel): + unit: str + temperature: List[int] + + # send the request + response = requests.post( + f"{llama_grammar.base_url}/v1/chat/completions", + headers=llama_grammar.headers, + json={ + "model": "tgi", + "messages": [ + { + "role": "system", + "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}", + }, + { + "role": "user", + "content": "What's the weather like the next 3 days in San Francisco, CA?", + }, + ], + "seed": 42, + "max_tokens": 500, + "tools": [], + "response_format": {"type": "json_object", "value": Weather.schema()}, + }, + ) + + # 422 means the server was unable to process the request because it contains invalid data. + assert response.status_code == 422 + assert response.json() == { + "error": "Grammar and tools are mutually exclusive", + "error_type": "grammar and tools", + } diff --git a/router/src/lib.rs b/router/src/lib.rs index b6902c49..1016019d 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -89,6 +89,7 @@ pub(crate) enum GrammarType { /// JSON Schema is a declarative language that allows to annotate JSON documents /// with types and descriptions. #[serde(rename = "json")] + #[serde(alias = "json_object")] #[schema(example = json ! ({"properties": {"location":{"type": "string"}}}))] Json(serde_json::Value), #[serde(rename = "regex")] @@ -791,6 +792,13 @@ pub(crate) struct ChatRequest { #[schema(nullable = true, example = "null")] #[serde(deserialize_with = "deserialize_tool_choice::deserialize")] pub tool_choice: Option, + + /// Response format constraints for the generation. + /// + /// NOTE: A request can use `response_format` OR `tools` but not both. + #[serde(default)] + #[schema(nullable = true, default = "null", example = "null")] + pub response_format: Option, } fn default_tool_prompt() -> Option { diff --git a/router/src/server.rs b/router/src/server.rs index 30479b0e..e3c2c4f9 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1016,6 +1016,7 @@ async fn chat_completions( tool_choice, tool_prompt, temperature, + response_format, .. } = req; @@ -1030,6 +1031,18 @@ async fn chat_completions( other => (true, other), }; + // response_format and tools are mutually exclusive + if response_format.is_some() && tools.as_ref().is_some() { + metrics::increment_counter!("tgi_request_failure", "err" => "validation"); + return Err(( + StatusCode::UNPROCESSABLE_ENTITY, + Json(ErrorResponse { + error: "Grammar and tools are mutually exclusive".to_string(), + error_type: "grammar and tools".to_string(), + }), + )); + } + // extract tool grammar if present let tool_grammar = match ToolGrammar::apply(tools, tool_choice) { Ok(grammar) => grammar, @@ -1046,16 +1059,21 @@ async fn chat_completions( } }; - let grammar_with_prompt = tool_grammar + // determine the appropriate arguments for apply_chat_template + let tools_grammar_prompt = tool_grammar .as_ref() .map(|t| (GrammarType::Json(serde_json::json!(t)), tool_prompt)); - let typed_grammar = grammar_with_prompt - .as_ref() - .map(|(grammar, _)| grammar.clone()); + let (tools_grammar_prompt, grammar) = match response_format { + Some(response_format) => (None, Some(response_format)), + None => ( + tools_grammar_prompt.clone(), + tools_grammar_prompt.map(|(grammar, _)| grammar.clone()), + ), + }; // apply chat template to flatten the request into a single input - let inputs = match infer.apply_chat_template(messages, grammar_with_prompt) { + let inputs = match infer.apply_chat_template(messages, tools_grammar_prompt) { Ok(inputs) => inputs, Err(err) => { metrics::increment_counter!("tgi_request_failure", "err" => "validation"); @@ -1091,7 +1109,7 @@ async fn chat_completions( decoder_input_details: !stream, seed, top_n_tokens: req.top_logprobs, - grammar: typed_grammar, + grammar, }, }; From 521de6cacd2af42caa1f93c75a34460a6ecddf9e Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 12 Jun 2024 18:22:20 +0200 Subject: [PATCH 03/15] fix(server): fix OPT implementation (#2061) --- .../models/custom_modeling/opt_modeling.py | 2 +- server/text_generation_server/models/gpt_neox.py | 3 +-- server/text_generation_server/models/opt.py | 4 ++-- server/text_generation_server/models/rw.py | 8 +++++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/opt_modeling.py b/server/text_generation_server/models/custom_modeling/opt_modeling.py index 83d62dea..9b2d01e0 100644 --- a/server/text_generation_server/models/custom_modeling/opt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py @@ -792,7 +792,7 @@ class OPTForCausalLM(OPTPreTrainedModel): return_dict=return_dict, ) - logits, speculative_logits = self.lm_head(outputs) + logits, speculative_logits = self.lm_head(outputs.last_hidden_state) loss = None diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index c0e1adf2..d1f8f5be 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -85,5 +85,4 @@ class GPTNeoxSharded(CausalLM): use_cache=True, ) - logits = outputs.logits - return logits, speculative_logits, outputs.past_key_values + return outputs.logits, speculative_logits, outputs.past_key_values diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 5b84f4ff..87319ef0 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -75,11 +75,11 @@ class OPTSharded(CausalLM): def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): - outputs = self.model.forward( + outputs, speculative_logits = self.model.forward( input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, use_cache=True, ) - return outputs.logits, outputs.past_key_values + return outputs.logits, speculative_logits, outputs.past_key_values diff --git a/server/text_generation_server/models/rw.py b/server/text_generation_server/models/rw.py index d4764ded..50f6ead8 100644 --- a/server/text_generation_server/models/rw.py +++ b/server/text_generation_server/models/rw.py @@ -71,11 +71,13 @@ class RW(CausalLM): def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None - ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: + ): # Model Forward - outputs = self.model.forward( + outputs, speculative_logits = self.model.forward( input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, + use_cache=True, ) - return outputs.logits, outputs.past_key_values + + return outputs.logits, speculative_logits, outputs.past_key_values From 90184df79c12ec2aa9111248077e237ca2ba9ee9 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 12 Jun 2024 18:24:47 +0200 Subject: [PATCH 04/15] fix(layers): fix SuRotaryEmbedding (#2060) * fix(layers): fix SuRotaryEmbedding * change arange * remove logs --- .../text_generation_server/layers/rotary.py | 26 ++++++++++--------- .../models/flash_phi.py | 3 +-- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index 648d28ab..c2f12189 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -267,19 +267,21 @@ class SuRotaryEmbedding(PositionRotaryEmbedding): or self._cos_cached.dtype != dtype ): self._seq_len_cached = seqlen - if seqlen > self.original_max_position_embeddings: - inv_freq = self.long_inv_freq - else: - inv_freq = self.short_inv_freq - t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype) - if self.scaling_factor is not None: - t /= self.scaling_factor - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - freqs = torch.outer(t, inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) + t = torch.arange(seqlen, device=device, dtype=self.short_inv_freq.dtype) + short_freqs = torch.outer( + t[: self.original_max_position_embeddings], + self.short_inv_freq.to(device=t.device), + ) + long_freqs = torch.outer( + t[self.original_max_position_embeddings :], + self.long_inv_freq.to(device=t.device), + ) + + freqs = torch.cat([short_freqs, long_freqs]) + + self._cos_cached = (torch.cos(freqs) * self.scaling_factor).to(dtype) + self._sin_cached = (torch.sin(freqs) * self.scaling_factor).to(dtype) class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): diff --git a/server/text_generation_server/models/flash_phi.py b/server/text_generation_server/models/flash_phi.py index 32b573a9..6a52c1d7 100644 --- a/server/text_generation_server/models/flash_phi.py +++ b/server/text_generation_server/models/flash_phi.py @@ -8,7 +8,6 @@ from typing import Optional from text_generation_server.models import FlashCausalLM from text_generation_server.models.custom_modeling.flash_phi_modeling import ( FlashPhiForCausalLM, - PhiConfig, ) from text_generation_server.utils import ( initialize_torch_distributed, @@ -44,7 +43,7 @@ class FlashPhi(FlashCausalLM): trust_remote_code=trust_remote_code, ) - config = PhiConfig.from_pretrained( + config = AutoConfig.from_pretrained( model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize From 42aa8ee1bb57ca5ae453c3feeb2485112664b46c Mon Sep 17 00:00:00 2001 From: drbh Date: Thu, 13 Jun 2024 11:53:49 -0400 Subject: [PATCH 05/15] PR #2049 CI run (#2054) * Use minijinja's pycompat mode for python methods * fix: cargo fmt lint for pre commit --------- Co-authored-by: Armin Ronacher --- Cargo.lock | 16 ++++++++++++++-- router/Cargo.toml | 3 ++- router/src/infer/mod.rs | 13 +++++-------- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5de8576..b9bd7363 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1856,12 +1856,23 @@ dependencies = [ [[package]] name = "minijinja" -version = "1.0.12" -source = "git+https://github.com/mitsuhiko/minijinja.git?rev=5cd4efb#5cd4efb9e2639247df275fe6e22a5dbe0ce71b28" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e136ef580d7955019ab0a407b68d77c292a9976907e217900f3f76bc8f6dc1a4" dependencies = [ "serde", ] +[[package]] +name = "minijinja-contrib" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15ee37078c98d31e510d6a7af488031a2c3ccacdb76c5c4fc98ddfe6d0e9da07" +dependencies = [ + "minijinja", + "serde", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -3604,6 +3615,7 @@ dependencies = [ "metrics", "metrics-exporter-prometheus", "minijinja", + "minijinja-contrib", "ngrok", "nohash-hasher", "once_cell", diff --git a/router/Cargo.toml b/router/Cargo.toml index 2e6264be..3262e7e6 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -44,7 +44,8 @@ utoipa = { version = "4.2.0", features = ["axum_extras"] } utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] } ngrok = { version = "0.13.1", features = ["axum"], optional = true } init-tracing-opentelemetry = { version = "0.14.1", features = ["opentelemetry-otlp"] } -minijinja = { git = "https://github.com/mitsuhiko/minijinja.git", rev = "5cd4efb" } +minijinja = { version = "2.0.2" } +minijinja-contrib = { version = "2.0.2", features = ["pycompat"] } futures-util = "0.3.30" regex = "1.10.3" once_cell = "1.19.0" diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs index 20630c1b..07c334a3 100644 --- a/router/src/infer/mod.rs +++ b/router/src/infer/mod.rs @@ -12,6 +12,8 @@ use crate::{ use crate::{FunctionRef, FunctionsMap, GrammarType, Properties, Tool, ToolType, Tools}; use futures::future::try_join_all; use minijinja::{Environment, ErrorKind, Template}; +use minijinja_contrib::pycompat; + use serde_json::{json, Map, Value}; use std::collections::HashMap; use std::sync::Arc; @@ -62,14 +64,7 @@ impl Infer { .find(|t| t.name == "default") .map(|t| t.template), }) - .map(|t| { - // .strip() is not supported in minijinja - // .capitalize() is not supported in minijinja but we can use | capitalize - let t = t - .replace(".strip()", " | trim") - .replace(".capitalize()", " | capitalize"); - ChatTemplate::new(t, tokenizer_config.bos_token, tokenizer_config.eos_token) - }); + .map(|t| ChatTemplate::new(t, tokenizer_config.bos_token, tokenizer_config.eos_token)); // Inference limit with a semaphore let semaphore = Arc::new(Semaphore::new(max_concurrent_requests)); @@ -277,6 +272,8 @@ struct ChatTemplate { impl ChatTemplate { fn new(template: String, bos_token: Option, eos_token: Option) -> Self { let mut env = Box::new(Environment::new()); + // enable things like .strip() or .capitalize() + env.set_unknown_method_callback(pycompat::unknown_method_callback); let template_str = template.into_boxed_str(); env.add_function("raise_exception", raise_exception); From f433f1f7705ba5d9110532a223d340effef059de Mon Sep 17 00:00:00 2001 From: drbh Date: Thu, 13 Jun 2024 12:51:51 -0400 Subject: [PATCH 06/15] implement Open Inference Protocol endpoints (#1942) * feat: add kserve feature and basic routes * feat: implement infer endpoint wrapper around generate * fix: refactor and improve types * fix: improve infer and simplify * fix: cleanup and improve api docs * fix: refactor and encapsulate kserve feat in file * fix: remove typos after rebase --- router/Cargo.toml | 1 + router/src/kserve.rs | 247 +++++++++++++++++++++++++++++++++++++++++++ router/src/lib.rs | 3 + router/src/server.rs | 98 +++++++++++++---- 4 files changed, 328 insertions(+), 21 deletions(-) create mode 100644 router/src/kserve.rs diff --git a/router/Cargo.toml b/router/Cargo.toml index 3262e7e6..5bf4c00c 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -59,3 +59,4 @@ vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] } default = ["ngrok"] ngrok = ["dep:ngrok"] google = [] +kserve = [] diff --git a/router/src/kserve.rs b/router/src/kserve.rs new file mode 100644 index 00000000..b64efd38 --- /dev/null +++ b/router/src/kserve.rs @@ -0,0 +1,247 @@ +use crate::{ + default_parameters, + server::{generate_internal, ComputeType}, + Deserialize, ErrorResponse, GenerateParameters, GenerateRequest, Infer, Serialize, ToSchema, +}; +use axum::extract::{Extension, Path}; +use axum::response::{IntoResponse, Response}; +use axum::Json; +use futures::stream::FuturesUnordered; +use futures::TryStreamExt; +use reqwest::header::HeaderMap; +use reqwest::StatusCode; + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct OutputChunk { + pub name: String, + pub shape: Vec, + pub datatype: String, + pub data: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct InferenceOutput { + pub id: String, + pub outputs: Vec, +} + +#[derive(Debug, Deserialize, ToSchema)] +pub(crate) struct InferenceRequest { + pub id: String, + #[serde(default = "default_parameters")] + pub parameters: GenerateParameters, + pub inputs: Vec, + pub outputs: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub(crate) struct Input { + pub name: String, + pub shape: Vec, + pub datatype: String, + pub data: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub(crate) struct Output { + pub name: String, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct LiveResponse { + pub live: bool, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct ReadyResponse { + pub live: bool, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct MetadataServerResponse { + pub name: String, + pub version: String, + pub extensions: Vec, +} + +// Routes + +#[utoipa::path( + post, + tag = "Text Generation Inference", + path = "/v2/health/live", + responses( + (status = 200, description = "Service is live", body = LiveReponse), + (status = 404, description = "Service not found", body = ErrorResponse, + example = json!({"error": "No response"})) + ) +)] +pub async fn kserve_health_live() -> Result)> { + let data = LiveResponse { live: true }; + Ok((HeaderMap::new(), Json(data)).into_response()) +} + +#[utoipa::path( + post, + tag = "Text Generation Inference", + path = "/v2/health/ready", + responses( + (status = 200, description = "Service is ready", body = ReadyResponse), + (status = 404, description = "Service not found", body = ErrorResponse, + example = json!({"error": "No response"})) + ) +)] +pub async fn kserve_health_ready() -> Result)> { + let data = ReadyResponse { live: true }; + Ok((HeaderMap::new(), Json(data)).into_response()) +} + +#[utoipa::path( + get, + tag = "Text Generation Inference", + path = "/v2", + responses( + (status = 200, description = "Metadata retrieved", body = MetadataServerResponse), + (status = 404, description = "Service not found", body = ErrorResponse, + example = json!({"error": "No response"})) + ) +)] +pub async fn kerve_server_metadata() -> Result)> { + let data = MetadataServerResponse { + name: "text-generation-inference".to_string(), + version: env!("CARGO_PKG_VERSION").to_string(), + extensions: vec![ + "health".to_string(), + "models".to_string(), + "metrics".to_string(), + ], + }; + Ok((HeaderMap::new(), Json(data)).into_response()) +} + +#[utoipa::path( + get, + tag = "Text Generation Inference", + path = "/v2/models/{model_name}/versions/{model_version}", + responses( + (status = 200, description = "Model version metadata retrieved", body = MetadataServerResponse), + (status = 404, description = "Model or version not found", body = ErrorResponse, + example = json!({"error": "No response"})) + ) +)] +pub async fn kserve_model_metadata( + Path((model_name, model_version)): Path<(String, String)>, +) -> Result)> { + let data = MetadataServerResponse { + name: model_name, + version: model_version, + extensions: vec!["infer".to_string(), "ready".to_string()], + }; + Ok((HeaderMap::new(), Json(data)).into_response()) +} + +#[utoipa::path( + post, + tag = "Text Generation Inference", + path = "/v2/models/{model_name}/versions/{model_version}/infer", + request_body = Json, + responses( + (status = 200, description = "Inference executed successfully", body = InferenceOutput), + (status = 404, description = "Model or version not found", body = ErrorResponse, + example = json!({"error": "No response"})) + ) +)] +pub async fn kserve_model_infer( + infer: Extension, + Extension(compute_type): Extension, + Json(payload): Json, +) -> Result)> { + let id = payload.id.clone(); + let str_inputs = payload + .inputs + .iter() + .map(|input| { + std::str::from_utf8(&input.data).map_err(|e| { + ( + StatusCode::UNPROCESSABLE_ENTITY, + Json(ErrorResponse { + error: e.to_string(), + error_type: "utf8".to_string(), + }), + ) + }) + }) + .collect::, _>>()?; + + if str_inputs.len() != payload.outputs.len() { + return Err(( + StatusCode::UNPROCESSABLE_ENTITY, + Json(ErrorResponse { + error: "Inputs and outputs length mismatch".to_string(), + error_type: "length mismatch".to_string(), + }), + )); + } + + let output_chunks = str_inputs + .iter() + .zip(&payload.outputs) + .map(|(str_input, output)| { + let generate_request = GenerateRequest { + inputs: str_input.to_string(), + parameters: payload.parameters.clone(), + }; + let infer = infer.clone(); + let compute_type = compute_type.clone(); + let span = tracing::Span::current(); + async move { + generate_internal(infer, compute_type, Json(generate_request), span) + .await + .map(|(_, Json(generation))| { + let generation_as_bytes = generation.generated_text.as_bytes().to_vec(); + OutputChunk { + name: output.name.clone(), + shape: vec![1, generation_as_bytes.len()], + datatype: "BYTES".to_string(), + data: generation_as_bytes, + } + }) + .map_err(|_| { + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: "Incomplete generation".into(), + error_type: "Incomplete generation".into(), + }), + ) + }) + } + }) + .collect::>() + .try_collect::>() + .await?; + + let inference_output = InferenceOutput { + id: id.clone(), + outputs: output_chunks, + }; + + Ok((HeaderMap::new(), Json(inference_output)).into_response()) +} + +#[utoipa::path( + get, + tag = "Text Generation Inference", + path = "/v2/models/{model_name}/versions/{model_version}/ready", + responses( + (status = 200, description = "Model version is ready", body = ReadyResponse), + (status = 404, description = "Model or version not found", body = ErrorResponse, + example = json!({"error": "No response"})) + ) +)] +pub async fn kserve_model_metadata_ready( + Path((_model_name, _model_version)): Path<(String, String)>, +) -> Result)> { + let data = ReadyResponse { live: true }; + Ok((HeaderMap::new(), Json(data)).into_response()) +} diff --git a/router/src/lib.rs b/router/src/lib.rs index 1016019d..b0b93c13 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -4,6 +4,9 @@ mod infer; pub mod server; mod validation; +#[cfg(feature = "kserve")] +mod kserve; + use serde::{Deserialize, Serialize}; use tracing::warn; use utoipa::ToSchema; diff --git a/router/src/server.rs b/router/src/server.rs index e3c2c4f9..aa872df9 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -4,6 +4,11 @@ use crate::infer::v2::SchedulerV2; use crate::infer::v3::SchedulerV3; use crate::infer::{HealthCheck, Scheduler}; use crate::infer::{Infer, InferError, InferResponse, InferStreamResponse, ToolGrammar}; +#[cfg(feature = "kserve")] +use crate::kserve::{ + kerve_server_metadata, kserve_health_live, kserve_health_ready, kserve_model_infer, + kserve_model_metadata, kserve_model_metadata_ready, +}; use crate::validation::ValidationError; use crate::{ BestOfSequence, Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest, @@ -172,7 +177,7 @@ async fn generate( generate_internal(infer, ComputeType(compute_type), Json(req), span).await } -async fn generate_internal( +pub(crate) async fn generate_internal( infer: Extension, ComputeType(compute_type): ComputeType, Json(req): Json, @@ -1727,28 +1732,58 @@ pub async fn run( docker_label: option_env!("DOCKER_LABEL"), }; - // Define VertextApiDoc conditionally only if the "google" feature is enabled - let doc = { - // avoid `mut` if possible - #[cfg(feature = "google")] - { - use crate::VertexInstance; + #[allow(unused_mut)] // mut is needed for conditional compilation + let mut doc = ApiDoc::openapi(); - #[derive(OpenApi)] - #[openapi( - paths(vertex_compatibility), - components(schemas(VertexInstance, VertexRequest, VertexResponse)) - )] - struct VertextApiDoc; + #[cfg(feature = "google")] + { + use crate::VertexInstance; - // limiting mutability to the smallest scope necessary - let mut doc = ApiDoc::openapi(); - doc.merge(VertextApiDoc::openapi()); - doc - } - #[cfg(not(feature = "google"))] - ApiDoc::openapi() - }; + #[derive(OpenApi)] + #[openapi( + paths(vertex_compatibility), + components(schemas(VertexInstance, VertexRequest, VertexResponse)) + )] + struct VertexApiDoc; + + doc.merge(VertexApiDoc::openapi()); + } + + #[cfg(feature = "kserve")] + { + use crate::kserve::{ + InferenceOutput, InferenceRequest, LiveResponse, MetadataServerResponse, OutputChunk, + ReadyResponse, + }; + use crate::kserve::{ + __path_kerve_server_metadata, __path_kserve_health_live, __path_kserve_health_ready, + __path_kserve_model_infer, __path_kserve_model_metadata, + __path_kserve_model_metadata_ready, + }; + + #[derive(OpenApi)] + #[openapi( + paths( + kserve_model_infer, + kserve_health_live, + kserve_health_ready, + kerve_server_metadata, + kserve_model_metadata, + kserve_model_metadata_ready, + ), + components(schemas( + InferenceOutput, + InferenceRequest, + LiveResponse, + MetadataServerResponse, + OutputChunk, + ReadyResponse, + )) + )] + struct KServeApiDoc; + + doc.merge(KServeApiDoc::openapi()); + } // Configure Swagger UI let swagger_ui = SwaggerUi::new("/docs").url("/api-doc/openapi.json", doc); @@ -1798,6 +1833,27 @@ pub async fn run( } } + #[cfg(feature = "kserve")] + { + tracing::info!("Built with `kserve` feature"); + app = app + .route( + "/v2/models/:model_name/versions/:model_version/infer", + post(kserve_model_infer), + ) + .route( + "/v2/models/:model_name/versions/:model_version", + get(kserve_model_metadata), + ) + .route("/v2/health/ready", get(kserve_health_ready)) + .route("/v2/health/live", get(kserve_health_live)) + .route("/v2", get(kerve_server_metadata)) + .route( + "/v2/models/:model_name/versions/:model_version/ready", + get(kserve_model_metadata_ready), + ); + } + // add layers after routes app = app .layer(Extension(info)) From 093a27c528dccefe83316d3ef1ff03b85cacdb94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 14 Jun 2024 09:45:42 +0200 Subject: [PATCH 07/15] Add support for GPTQ Marlin (#2052) Add support for GPTQ Marlin kernels GPTQ Marlin extends the Marlin kernels to support common GPTQ configurations: - bits: 4 or 8 - groupsize: -1, 32, 64, or 128 - desc_act: true/false Using the GPTQ Marlin kernels requires repacking the parameters in the Marlin quantizer format. The kernels were contributed by Neural Magic to VLLM. We vendor them here for convenience. --- Dockerfile | 6 +- .../test_flash_llama_gptq_marlin.json | 84 + ...st_flash_llama_gptq_marlin_all_params.json | 84 + .../test_flash_llama_gptq_marlin_load.json | 338 +++ .../models/test_flash_llama_gptq_marlin.py | 65 + server/Makefile | 1 - server/Makefile-marlin | 11 - server/marlin/COPYRIGHT | 20 + server/marlin/marlin_kernels/__init__.pyi | 44 + server/marlin/marlin_kernels/ext.cpp | 11 + server/marlin/marlin_kernels/ext.hh | 23 + server/marlin/marlin_kernels/gptq_marlin.cu | 1870 +++++++++++++++++ server/marlin/marlin_kernels/gptq_marlin.cuh | 76 + .../marlin_kernels/gptq_marlin_dtypes.cuh | 77 + .../marlin_kernels/gptq_marlin_repack.cu | 350 +++ .../marlin_kernels/marlin_cuda_kernel.cu | 1136 ++++++++++ server/marlin/marlin_kernels/py.typed | 0 server/marlin/setup.py | 21 + .../text_generation_server/layers/linear.py | 17 +- .../text_generation_server/layers/marlin.py | 256 ++- server/text_generation_server/models/bloom.py | 2 +- .../custom_modeling/flash_cohere_modeling.py | 2 +- .../flash_santacoder_modeling.py | 15 +- .../flash_starcoder2_modeling.py | 2 +- .../models/flash_cohere.py | 2 +- .../models/flash_dbrx.py | 2 +- .../models/flash_gemma.py | 2 +- .../models/flash_llama.py | 2 +- .../models/flash_mistral.py | 2 +- .../models/flash_neox.py | 2 +- .../models/flash_phi.py | 2 +- .../models/flash_qwen2.py | 2 +- .../text_generation_server/models/flash_rw.py | 2 +- .../models/flash_santacoder.py | 2 +- .../models/flash_starcoder2.py | 2 +- .../models/galactica.py | 2 +- .../text_generation_server/models/gpt_neox.py | 2 +- server/text_generation_server/models/mpt.py | 2 +- server/text_generation_server/models/opt.py | 2 +- .../text_generation_server/utils/weights.py | 253 ++- 40 files changed, 4654 insertions(+), 140 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin.json create mode 100644 integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin_all_params.json create mode 100644 integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin_load.json create mode 100644 integration-tests/models/test_flash_llama_gptq_marlin.py delete mode 100644 server/Makefile-marlin create mode 100644 server/marlin/COPYRIGHT create mode 100644 server/marlin/marlin_kernels/__init__.pyi create mode 100644 server/marlin/marlin_kernels/ext.cpp create mode 100644 server/marlin/marlin_kernels/ext.hh create mode 100644 server/marlin/marlin_kernels/gptq_marlin.cu create mode 100644 server/marlin/marlin_kernels/gptq_marlin.cuh create mode 100644 server/marlin/marlin_kernels/gptq_marlin_dtypes.cuh create mode 100644 server/marlin/marlin_kernels/gptq_marlin_repack.cu create mode 100644 server/marlin/marlin_kernels/marlin_cuda_kernel.cu create mode 100644 server/marlin/marlin_kernels/py.typed create mode 100644 server/marlin/setup.py diff --git a/Dockerfile b/Dockerfile index f2f6df5f..14628339 100644 --- a/Dockerfile +++ b/Dockerfile @@ -140,9 +140,9 @@ RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq # Build marlin kernels FROM kernel-builder as marlin-kernels-builder WORKDIR /usr/src -COPY server/Makefile-marlin Makefile +COPY server/marlin/ . # Build specific version of transformers -RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-marlin +RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build # Build Transformers CUDA kernels FROM kernel-builder as custom-kernels-builder @@ -213,7 +213,7 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86 # Copy build artifacts from eetq kernels builder COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # Copy build artifacts from marlin kernels builder -COPY --from=marlin-kernels-builder /usr/src/marlin/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # Copy builds artifacts from vllm builder COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin.json new file mode 100644 index 00000000..0f99d259 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin.json @@ -0,0 +1,84 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2323, + "logprob": null, + "text": "Test" + }, + { + "id": 1715, + "logprob": -11.34375, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -2.5742188, + "special": false, + "text": "\n" + }, + { + "id": 262, + "logprob": -1.6230469, + "special": false, + "text": " " + }, + { + "id": 3270, + "logprob": -2.046875, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1425781, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.9238281, + "special": false, + "text": " request" + }, + { + "id": 13204, + "logprob": -0.076660156, + "special": false, + "text": ".method" + }, + { + "id": 624, + "logprob": -0.021987915, + "special": false, + "text": " ==" + }, + { + "id": 364, + "logprob": -0.39208984, + "special": false, + "text": " '" + }, + { + "id": 3019, + "logprob": -0.10821533, + "special": false, + "text": "POST" + } + ], + "top_tokens": null + }, + "generated_text": "\n \"\"\"\n if request.method == 'POST" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin_all_params.json new file mode 100644 index 00000000..4152b5b3 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin_all_params.json @@ -0,0 +1,84 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2323, + "logprob": null, + "text": "Test" + }, + { + "id": 1715, + "logprob": -11.34375, + "text": " request" + } + ], + "seed": 0, + "tokens": [ + { + "id": 13, + "logprob": -2.2539062, + "special": false, + "text": "." + }, + { + "id": 578, + "logprob": -0.15563965, + "special": false, + "text": " The" + }, + { + "id": 3622, + "logprob": -0.8203125, + "special": false, + "text": " server" + }, + { + "id": 706, + "logprob": 0.0, + "special": false, + "text": " has" + }, + { + "id": 539, + "logprob": 0.0, + "special": false, + "text": " not" + }, + { + "id": 3686, + "logprob": 0.0, + "special": false, + "text": " yet" + }, + { + "id": 3288, + "logprob": 0.0, + "special": false, + "text": " sent" + }, + { + "id": 904, + "logprob": 0.0, + "special": false, + "text": " any" + }, + { + "id": 828, + "logprob": 0.0, + "special": false, + "text": " data" + }, + { + "id": 382, + "logprob": -1.5517578, + "special": false, + "text": ".\n\n" + } + ], + "top_tokens": null + }, + "generated_text": "Test request. The server has not yet sent any data.\n\n" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin_load.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin_load.json new file mode 100644 index 00000000..75e90303 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq_marlin/test_flash_llama_gptq_marlin_load.json @@ -0,0 +1,338 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2323, + "logprob": null, + "text": "Test" + }, + { + "id": 1715, + "logprob": -11.34375, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -2.5742188, + "special": false, + "text": "\n" + }, + { + "id": 262, + "logprob": -1.6220703, + "special": false, + "text": " " + }, + { + "id": 3270, + "logprob": -2.0410156, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1445312, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.92333984, + "special": false, + "text": " request" + }, + { + "id": 13204, + "logprob": -0.07672119, + "special": false, + "text": ".method" + }, + { + "id": 624, + "logprob": -0.021987915, + "special": false, + "text": " ==" + }, + { + "id": 364, + "logprob": -0.39208984, + "special": false, + "text": " '" + }, + { + "id": 3019, + "logprob": -0.10638428, + "special": false, + "text": "POST" + } + ], + "top_tokens": null + }, + "generated_text": "\n \"\"\"\n if request.method == 'POST" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2323, + "logprob": null, + "text": "Test" + }, + { + "id": 1715, + "logprob": -11.34375, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -2.5742188, + "special": false, + "text": "\n" + }, + { + "id": 262, + "logprob": -1.6220703, + "special": false, + "text": " " + }, + { + "id": 3270, + "logprob": -2.0410156, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1445312, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.92333984, + "special": false, + "text": " request" + }, + { + "id": 13204, + "logprob": -0.07672119, + "special": false, + "text": ".method" + }, + { + "id": 624, + "logprob": -0.021987915, + "special": false, + "text": " ==" + }, + { + "id": 364, + "logprob": -0.39208984, + "special": false, + "text": " '" + }, + { + "id": 3019, + "logprob": -0.10638428, + "special": false, + "text": "POST" + } + ], + "top_tokens": null + }, + "generated_text": "\n \"\"\"\n if request.method == 'POST" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2323, + "logprob": null, + "text": "Test" + }, + { + "id": 1715, + "logprob": -11.34375, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -2.5742188, + "special": false, + "text": "\n" + }, + { + "id": 262, + "logprob": -1.6220703, + "special": false, + "text": " " + }, + { + "id": 3270, + "logprob": -2.0410156, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1445312, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.92333984, + "special": false, + "text": " request" + }, + { + "id": 13204, + "logprob": -0.07672119, + "special": false, + "text": ".method" + }, + { + "id": 624, + "logprob": -0.021987915, + "special": false, + "text": " ==" + }, + { + "id": 364, + "logprob": -0.39208984, + "special": false, + "text": " '" + }, + { + "id": 3019, + "logprob": -0.10638428, + "special": false, + "text": "POST" + } + ], + "top_tokens": null + }, + "generated_text": "\n \"\"\"\n if request.method == 'POST" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2323, + "logprob": null, + "text": "Test" + }, + { + "id": 1715, + "logprob": -11.34375, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -2.5742188, + "special": false, + "text": "\n" + }, + { + "id": 262, + "logprob": -1.6220703, + "special": false, + "text": " " + }, + { + "id": 3270, + "logprob": -2.0410156, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1445312, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.92333984, + "special": false, + "text": " request" + }, + { + "id": 13204, + "logprob": -0.07672119, + "special": false, + "text": ".method" + }, + { + "id": 624, + "logprob": -0.021987915, + "special": false, + "text": " ==" + }, + { + "id": 364, + "logprob": -0.39208984, + "special": false, + "text": " '" + }, + { + "id": 3019, + "logprob": -0.10638428, + "special": false, + "text": "POST" + } + ], + "top_tokens": null + }, + "generated_text": "\n \"\"\"\n if request.method == 'POST" + } +] diff --git a/integration-tests/models/test_flash_llama_gptq_marlin.py b/integration-tests/models/test_flash_llama_gptq_marlin.py new file mode 100644 index 00000000..9c37a644 --- /dev/null +++ b/integration-tests/models/test_flash_llama_gptq_marlin.py @@ -0,0 +1,65 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_llama_gptq_marlin_handle(launcher): + with launcher( + "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit", num_shard=2, quantize="marlin" + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_llama_gptq_marlin(flash_llama_gptq_marlin_handle): + await flash_llama_gptq_marlin_handle.health(300) + return flash_llama_gptq_marlin_handle.client + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_gptq_marlin(flash_llama_gptq_marlin, response_snapshot): + response = await flash_llama_gptq_marlin.generate( + "Test request", max_new_tokens=10, decoder_input_details=True + ) + + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_gptq_marlin_all_params( + flash_llama_gptq_marlin, response_snapshot +): + response = await flash_llama_gptq_marlin.generate( + "Test request", + max_new_tokens=10, + repetition_penalty=1.2, + return_full_text=True, + temperature=0.5, + top_p=0.9, + top_k=10, + truncate=5, + typical_p=0.9, + watermark=True, + decoder_input_details=True, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_gptq_marlin_load( + flash_llama_gptq_marlin, generate_load, response_snapshot +): + responses = await generate_load( + flash_llama_gptq_marlin, "Test request", max_new_tokens=10, n=4 + ) + + assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) + + assert responses == response_snapshot diff --git a/server/Makefile b/server/Makefile index f2a45cc0..5257b876 100644 --- a/server/Makefile +++ b/server/Makefile @@ -3,7 +3,6 @@ include Makefile-flash-att-v2 include Makefile-vllm include Makefile-awq include Makefile-eetq -include Makefile-marlin include Makefile-selective-scan unit-tests: diff --git a/server/Makefile-marlin b/server/Makefile-marlin deleted file mode 100644 index 816546af..00000000 --- a/server/Makefile-marlin +++ /dev/null @@ -1,11 +0,0 @@ -marlin_commit := 2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c - -build-marlin: - if [ ! -d 'marlin' ]; then \ - pip install -U ninja packaging --no-cache-dir && \ - git clone https://github.com/IST-DASLab/marlin.git marlin; \ - fi - cd marlin && git fetch && git checkout $(marlin_commit) && python setup.py build - -install-marlin: build-marlin - cd marlin && git fetch && git checkout $(marlin_commit) && pip install -e . diff --git a/server/marlin/COPYRIGHT b/server/marlin/COPYRIGHT new file mode 100644 index 00000000..69f3b8e6 --- /dev/null +++ b/server/marlin/COPYRIGHT @@ -0,0 +1,20 @@ +These kernels were vendored from VLLM. The Marlin kernels were developed +by Elias Frantar and extended by Neural Magic. + +--- + +Copyright (C) Marlin.2024 Elias Frantar +Modified by Neural Magic +Copyright 2024 The vLLM team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/server/marlin/marlin_kernels/__init__.pyi b/server/marlin/marlin_kernels/__init__.pyi new file mode 100644 index 00000000..73597f0c --- /dev/null +++ b/server/marlin/marlin_kernels/__init__.pyi @@ -0,0 +1,44 @@ +import torch + +def gptq_marlin_gemm( + a: torch.Tensor, + b_q_weight: torch.Tensor, + b_scales: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + num_bits: int, + size_m: int, + size_n: int, + size_k: int, + is_k_full: bool, +) -> torch.Tensor: + """ + Matrix multiplication using Marlin kernels. This is an extension of + `marlin_gemm` that supports converted GPTQ kernels. + """ + ... + +def gptq_marlin_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + """Repack GPTQ parameters for Marlin kernels.""" + ... + +def marlin_gemm( + a: torch.Tensor, + b_q_weight: torch.Tensor, + b_scales: torch.Tensor, + workspace: torch.Tensor, + size_m: int, + size_n: int, + size_k: int, +) -> torch.Tensor: + """ + Matrix multiplication using Marlin kernels. + """ + ... diff --git a/server/marlin/marlin_kernels/ext.cpp b/server/marlin/marlin_kernels/ext.cpp new file mode 100644 index 00000000..5855714d --- /dev/null +++ b/server/marlin/marlin_kernels/ext.cpp @@ -0,0 +1,11 @@ +#include + +#include "ext.hh" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("gptq_marlin_gemm", &gptq_marlin_gemm, + "Marlin gemm with GPTQ compatibility"); + m.def("gptq_marlin_repack", &gptq_marlin_repack, + "Repack GPTQ parameters for Marlin"); + m.def("marlin_gemm", &marlin_gemm, "Marlin gemm"); +} diff --git a/server/marlin/marlin_kernels/ext.hh b/server/marlin/marlin_kernels/ext.hh new file mode 100644 index 00000000..9ea01a3f --- /dev/null +++ b/server/marlin/marlin_kernels/ext.hh @@ -0,0 +1,23 @@ +#pragma once + +#include + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 +// No support for async +#else + +torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, + torch::Tensor &b_scales, torch::Tensor &g_idx, + torch::Tensor &perm, torch::Tensor &workspace, + int64_t num_bits, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full); + +torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm, + int64_t size_k, int64_t size_n, + int64_t num_bits); + +torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, + torch::Tensor &b_scales, torch::Tensor &workspace, + int64_t size_m, int64_t size_n, int64_t size_k); + +#endif diff --git a/server/marlin/marlin_kernels/gptq_marlin.cu b/server/marlin/marlin_kernels/gptq_marlin.cu new file mode 100644 index 00000000..0beb9de1 --- /dev/null +++ b/server/marlin/marlin_kernels/gptq_marlin.cu @@ -0,0 +1,1870 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#include "gptq_marlin.cuh" +#include "gptq_marlin_dtypes.cuh" + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +template +inline std::string str(T x) { + return std::to_string(x); +} + +namespace gptq_marlin { + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, int size_m, + int size_k, int block_rows) {} + +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int* __restrict__ g_idx, // int32 group indices of shape k + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization +) {} + +} // namespace gptq_marlin + +torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& g_idx, + torch::Tensor& perm, torch::Tensor& workspace, + int64_t num_bits, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full) { + TORCH_CHECK_NOT_IMPLEMENTED(false, + "marlin_gemm(..) requires CUDA_ARCH >= 8.0"); + return torch::empty({1, 1}); +} + +#else + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. +template +__device__ inline void mma(const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +template +__device__ inline void ldsm4(typename ScalarType::FragA& frag_a, + const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); +} + +// Lookup-table based 3-input logical operation; explicitly used for +// dequantization as the compiler does not seem to automatically recognize it in +// all cases. +template +__device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +// Constructs destination register by taking bytes from 2 sources (based on +// mask) +template +__device__ inline uint32_t prmt(uint32_t a) { + uint32_t res; + asm volatile("prmt.b32 %0, %1, %2, %3;\n" + : "=r"(res) + : "r"(a), "n"(start_byte), "n"(mask)); + return res; +} + +// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 +// values. We mostly follow the strategy in the link below, with some small +// changes: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385 +template +__device__ inline typename ScalarType::FragB dequant_4bit(int q) { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); +} + +template <> +__device__ inline typename ScalarType::FragB dequant_4bit(int q) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. + const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + typename ScalarType::FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +template <> +__device__ inline typename ScalarType::FragB +dequant_4bit(int q) { + static constexpr uint32_t MASK = 0x000f000f; + static constexpr uint32_t EX = 0x43004300; + + // Guarantee that the `(a & b) | c` operations are LOP3s. + + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + q >>= 4; + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + + typename ScalarType::FragB frag_b; + static constexpr uint32_t MUL = 0x3F803F80; + static constexpr uint32_t ADD = 0xC308C308; + + frag_b[0] = __hfma2(*reinterpret_cast(&lo), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or +// bf16 Reference: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175 +template +__device__ inline typename ScalarType::FragB dequant_8bit(int q) { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); +} + +template <> +__device__ inline typename ScalarType::FragB dequant_8bit(int q) { + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + uint32_t lo = prmt(q); + uint32_t hi = prmt(q); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + + typename ScalarType::FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + return frag_b; +} + +template <> +__device__ inline typename ScalarType::FragB +dequant_8bit(int q) { + typename ScalarType::FragB frag_b; + + float fp32_intermediates[4]; + uint32_t* fp32_intermediates_casted = + reinterpret_cast(fp32_intermediates); + + static constexpr uint32_t fp32_base = 0x4B000000; + fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653); + + fp32_intermediates[0] -= 8388736.f; + fp32_intermediates[1] -= 8388736.f; + fp32_intermediates[2] -= 8388736.f; + fp32_intermediates[3] -= 8388736.f; + + uint32_t* bf16_result_ptr = reinterpret_cast(&frag_b); + bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], + fp32_intermediates_casted[1], 0x7632); + bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], + fp32_intermediates_casted[3], 0x7632); + + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. +template +__device__ inline void scale(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s = + ScalarType::num2num2(reinterpret_cast(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +// Same as above, but for act_order (each K is multiplied individually) +template +__device__ inline void scale4(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s_1, + typename ScalarType::FragS& frag_s_2, + typename ScalarType::FragS& frag_s_3, + typename ScalarType::FragS& frag_s_4, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s_val_1_2; + s_val_1_2.x = reinterpret_cast(&frag_s_1)[i]; + s_val_1_2.y = reinterpret_cast(&frag_s_2)[i]; + + scalar_t2 s_val_3_4; + s_val_3_4.x = reinterpret_cast(&frag_s_3)[i]; + s_val_3_4.y = reinterpret_cast(&frag_s_4)[i]; + + frag_b[0] = __hmul2(frag_b[0], s_val_1_2); + frag_b[1] = __hmul2(frag_b[1], s_val_3_4); +} + +// Given 2 floats multiply by 2 scales (halves) +template +__device__ inline void scale_float(float* c, + typename ScalarType::FragS& s) { + scalar_t* s_ptr = reinterpret_cast(&s); + c[0] = __fmul_rn(c[0], ScalarType::num2float(s_ptr[0])); + c[1] = __fmul_rn(c[1], ScalarType::num2float(s_ptr[1])); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int* lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int* lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +// For a given "a" of size [M,K] performs a permutation of the K columns based +// on the given "perm" indices. +__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, int size_m, + int size_k, int block_rows) { + int start_row = block_rows * blockIdx.x; + int finish_row = start_row + block_rows; + if (finish_row > size_m) { + finish_row = size_m; + } + int cur_block_rows = finish_row - start_row; + + int row_stride = size_k * sizeof(half) / 16; + + auto permute_row = [&](int row) { + int iters = size_k / default_threads; + int rest = size_k % default_threads; + + int offset = row * row_stride; + + half const* a_row_half = reinterpret_cast(a_int4_ptr + offset); + half* out_half = reinterpret_cast(out_int4_ptr + offset); + + int base_k = 0; + + for (int i = 0; i < iters; i++) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + + base_k += default_threads; + } + + if (rest) { + if (threadIdx.x < rest) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + } + } + }; + + for (int i = 0; i < cur_block_rows; i++) { + int cur_row = start_row + i; + if (cur_row < size_m) { + permute_row(cur_row); + } + } +} + +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int* __restrict__ g_idx, // int32 group indices of shape k + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. + using Dtype = ScalarType; + using scalar_t2 = typename ScalarType::scalar_t2; + using FragA = typename ScalarType::FragA; + using FragB = typename ScalarType::FragB; + using FragC = typename ScalarType::FragC; + using FragS = typename ScalarType::FragS; + + constexpr int pack_factor = 32 / num_bits; + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > 16 * thread_m_blocks) { + parallel = prob_m / (16 * thread_m_blocks); + prob_m = 16 * thread_m_blocks; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x); + + if constexpr (!has_act_order && group_blocks != -1) { + if (group_blocks >= thread_k_blocks) { + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts + // in the middle of group. + iters = (group_blocks / thread_k_blocks) * + div_ceil(iters, (group_blocks / thread_k_blocks)); + } + } + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; + C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; + locks += (slice_col_par / n_tiles) * n_tiles; + slice_col = slice_col_par % n_tiles; + } + + // Compute all information about the current slice which is required for + // synchronization. + auto init_slice = [&]() { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * div_ceil(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = div_ceil(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (slice_col == n_tiles) { + A += 16 * thread_m_blocks * prob_k / 8; + C += 16 * thread_m_blocks * prob_n / 8; + locks += n_tiles; + slice_col = 0; + } + }; + init_slice(); + + // A sizes/strides + + // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; + // stride of an A matrix tile in shared memory + constexpr int a_sh_stride = 16 * thread_k_blocks / 8; + // delta between subsequent A tiles in global memory + constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; + // between subsequent accesses within a tile + int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); + // between shared memory writes + constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); + // between shared memory tile reads + constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); + // within a shared memory tile + constexpr int a_sh_rd_delta_i = a_sh_stride * 16; + // overall size of a tile + constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); + // number of shared write iterations for a tile + constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta); + + // B sizes/strides + int b_gl_stride = 16 * prob_n / (pack_factor * 4); + constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4; + constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2; + constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs; + + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads); + constexpr int b_sh_wr_delta = threads * b_thread_vecs; + constexpr int b_sh_rd_delta = threads * b_thread_vecs; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + // Scale sizes/strides without act_order + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_tb_groups = + !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks + ? thread_k_blocks / group_blocks + : 1; + constexpr int s_sh_stage = s_tb_groups * s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Scale size/strides with act_order + constexpr int tb_k = 16 * thread_k_blocks; + constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0; + // constexpr int act_s_row_stride = 1; + // int act_s_col_stride = act_s_row_stride * num_groups; + int act_s_col_stride = 1; + int act_s_col_warp_stride = act_s_col_stride * 8; + int tb_n_warps = thread_n_blocks / 4; + int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. + int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads) * b_thread_vecs; + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x * b_thread_vecs; + int b_sh_rd = threadIdx.x * b_thread_vecs; + + // For act_order + constexpr int k_iter_size = tb_k / b_sh_wr_iters; + int slice_k_start = tb_k * slice_row; + int slice_k_finish = slice_k_start + tb_k * slice_iters; + int slice_k_start_shared_fetch = slice_k_start; + int slice_n_offset = act_s_col_tb_stride * slice_col; + + // No act_order + int s_gl_rd; + if constexpr (!has_act_order) { + if constexpr (group_blocks == -1) { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } else { + s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + } + } + int s_sh_wr = threadIdx.x; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + int s_sh_rd; + if constexpr (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. + int a_sh_wr_trans[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. + int4* sh_a = sh; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_g_idx = sh_b + (stages * b_sh_stage); + int4* sh_s = sh_g_idx + (stages * g_idx_stage); + + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2][b_thread_vecs]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order + + // Zero accumulators. + auto zero_accums = [&]() { + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + int sh_first_group_id = -1; + int sh_num_groups = -1; + constexpr int sh_max_num_groups = 32; + + auto fetch_scales_to_shared = [&](bool is_async, int first_group_id, + int last_group_id) { + sh_first_group_id = first_group_id; + sh_num_groups = last_group_id - first_group_id + 1; + + if (sh_num_groups < sh_max_num_groups) { + sh_num_groups = sh_max_num_groups; + } + + if (sh_first_group_id + sh_num_groups > num_groups) { + sh_num_groups = num_groups - sh_first_group_id; + } + + int row_offset = first_group_id * s_gl_stride; + + if (is_async) { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], + &scales_ptr[row_offset + (i * s_gl_stride) + + slice_n_offset + threadIdx.x]); + } + } + } else { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + sh_s[(i * s_sh_stride) + threadIdx.x] = + scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + + threadIdx.x]; + } + } + } + }; + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + cp_async4_pred( + &sh_a_stage[a_sh_wr_trans[i]], + &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], + a_sh_wr_pred[i]); + } + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < b_thread_vecs; j++) { + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j); + } + + B_ptr[i] += b_gl_rd_delta_o; + } + + if constexpr (has_act_order) { + // Fetch g_idx thread-block portion + int full_pipe = a_off; + int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; + if (cur_k < prob_k && cur_k < slice_k_finish) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + + int4 const* cur_g_idx_stage_ptr = + reinterpret_cast(&g_idx[cur_k]); + + if (threadIdx.x < g_idx_stage) { + cp_async4_pred(&sh_g_idx_stage[threadIdx.x], + &cur_g_idx_stage_ptr[threadIdx.x]); + } + } + } else { + if constexpr (group_blocks != -1) { + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch scales if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } else { + for (int i = 0; i < s_tb_groups; i++) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], + &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } + } + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. + auto fetch_to_registers = [&](int k, int pipe) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm4(frag_a[k % 2][i], + &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + + #pragma unroll + for (int i = 0; i < b_thread_vecs; i++) { + frag_b_quant[k % 2][i] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); + } + }; + + bool is_same_group[stages]; + int same_group_id[stages]; + + auto init_same_group = [&](int pipe) { + if constexpr (!has_act_order) { + is_same_group[pipe] = false; + same_group_id[pipe] = 0; + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + int group_id_1 = sh_g_idx_int_ptr[0]; + int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; + + is_same_group[pipe] = group_id_1 == group_id_2; + same_group_id[pipe] = group_id_1; + }; + + auto fetch_scales_to_registers = [&](int k, int full_pipe) { + int pipe = full_pipe % stages; + + if constexpr (!has_act_order) { + // No act-order case + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = k_blocks / group_blocks; + + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + reinterpret_cast(&frag_s[k % 2])[0] = + sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; + } + } + + return; + } + + // Act-order case + + // Determine K of the "current" thread-block + int cur_k = slice_k_start + tb_k * full_pipe; + if (cur_k >= prob_k || cur_k >= slice_k_finish) { + return; + } + + // Reset (to current thread-block) since we read g_idx portion from the + // shared memory + cur_k = 0; + + // Progress to current iteration + cur_k += k_iter_size * (k % b_sh_wr_iters); + + // Determine "position" inside the thread-block (based on warp and + // thread-id) + int warp_id = threadIdx.x / 32; + int n_warps = + thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N + + int warp_row = warp_id / n_warps; + int warp_col = warp_id % n_warps; + + cur_k += warp_row * 16; + + int th_id = threadIdx.x % 32; + cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix + + int s_col_shift = + /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + + (th_id / 4) * act_s_col_stride; + + if (is_same_group[pipe]) { + if (k % 2 == 0) { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + + s_col_shift]; + } else { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); + } + + for (int i = 1; i < 4; i++) { + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); + } + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + constexpr int k_frag_offsets[4] = {0, 1, 8, + 9}; // Tensor core offsets per thread + + #pragma unroll + for (int i = 0; i < 4; i++) { + int actual_k = cur_k + k_frag_offsets[i]; + + int group_id = sh_g_idx_int_ptr[actual_k]; + int rel_group_id = group_id - sh_first_group_id; + + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + sh_s[rel_group_id * s_sh_stride + s_col_shift]; + } + }; + + // Execute the actual tensor core matmul of a sub-tile. + auto matmul = [&](int k) { + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. + #pragma unroll + for (int j = 0; j < 4; j++) { + FragB frag_b0; + FragB frag_b1; + if constexpr (num_bits == 4) { + int b_quant = frag_b_quant[k % 2][0][j]; + int b_quant_shift = b_quant >> 8; + + frag_b0 = dequant_4bit(b_quant); + frag_b1 = dequant_4bit(b_quant_shift); + + } else { + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k % 2]); + int b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; + int b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; + + frag_b0 = dequant_8bit(b_quant_0); + frag_b1 = dequant_8bit(b_quant_1); + } + + // Apply scale to frag_b0 + if constexpr (has_act_order) { + scale4(frag_b0, act_frag_s[k % 2][0][j], + act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j], + act_frag_s[k % 2][3][j], 0); + } else { + if constexpr (group_blocks != -1) { + scale(frag_b0, frag_s[k % 2][j], 0); + } + } + + // Apply scale to frag_b1 + if constexpr (has_act_order) { + scale4(frag_b1, act_frag_s[k % 2][0][j], + act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j], + act_frag_s[k % 2][3][j], 1); + + } else { + if constexpr (group_blocks != -1) { + scale(frag_b1, frag_s[k % 2][j], 1); + } + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride_threads / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride_threads; + constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2; + constexpr int red_sh_delta = b_sh_stride_threads; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + + #pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { + #pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { + #pragma unroll + for (int j = 0; j < 4 * 2; j++) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + #pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { + #pragma unroll + for (int i = 0; i < 4 * 2; i++) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + if (threadIdx.x < active_threads) { + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + int row = (threadIdx.x % 32) / 4; + + if (!first) { + // Interestingly, doing direct global accesses here really seems to mess up + // the compiler and lead to slowdowns, hence we also use async-copies even + // though these fetches are not actually asynchronous. + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + cp_async4_pred( + &sh[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + + c_gl_wr_delta_i * (i % 2)], + i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); + } + cp_async_fence(); + cp_async_wait<0>(); + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { + if (!first) { + int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += + Dtype::num2float(reinterpret_cast(&c_red)[j]); + } + } + if (!last) { + int4 c; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast(&c)[j] = + Dtype::float2num(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); + } + C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = + c; + } + } + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. + auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + int c_gl_wr_end = c_gl_stride * prob_m; + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + scalar_t2 res = + Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); + + // For per-column quantization we finally apply the scale here (only for + // 4-bit) + if constexpr (!has_act_order && group_blocks == -1 && num_bits == 4) { + res = __hmul2(res, s[0]); + } + + ((scalar_t2*)sh)[idx] = res; + }; + + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; + i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + if (c_gl_wr < c_gl_wr_end) { + C[c_gl_wr] = sh[c_sh_rd]; + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + }; + + // Start global fetch and register load pipelines. + auto start_pipes = [&]() { + + #pragma unroll + for (int i = 0; i < stages - 1; i++) { + if (has_act_order && i == 0) { + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); + } + fetch_to_shared(i, i, i < slice_iters); + } + + zero_accums(); + wait_for_stage(); + init_same_group(0); + fetch_to_registers(0, 0); + fetch_scales_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + slice_k_start_shared_fetch += tb_k * (stages - 1); + }; + if (slice_iters) { + start_pipes(); + } + + // Main loop. + while (slice_iters) { + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines + // have even length meaning that the next iteration will always start at + // index 0. + + #pragma unroll + for (int pipe = 0; pipe < stages;) { + #pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + fetch_scales_to_registers(k + 1, pipe); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + init_same_group(pipe % stages); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) { + break; + } + } + + a_gl_rd += a_gl_rd_delta_o * stages; + slice_k_start += tb_k * stages; + slice_k_start_shared_fetch += tb_k * stages; + + if constexpr (has_act_order) { + int first_group_id = g_idx[slice_k_start]; + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + int last_group_id = g_idx[last_g_idx]; + if (last_group_id >= sh_first_group_id + sh_num_groups) { + fetch_scales_to_shared(false, first_group_id, last_group_id); + __syncthreads(); + } + } + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. + if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if constexpr (!has_act_order && group_blocks == -1) { + if constexpr (num_bits == 8) { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + } else { + if (last) { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + } + } + } + + thread_block_reduce(); + if constexpr (!has_act_order && group_blocks == -1) { + if constexpr (num_bits == 8) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + + } else { + if (last) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } + } + } + + // For 8-bit channelwise, we apply the scale before the global reduction + // that converts the fp32 results to fp16 (so that we avoid possible + // overflow in fp16) + if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) { + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + scale_float( + reinterpret_cast(&frag_c[i][j][0][0]), + frag_s[j / 2][2 * (j % 2) + 0]); + scale_float( + reinterpret_cast(&frag_c[i][j][0][2]), + frag_s[j / 2][2 * (j % 2) + 0]); + + scale_float( + reinterpret_cast(&frag_c[i][j][1][0]), + frag_s[j / 2][2 * (j % 2) + 1]); + scale_float( + reinterpret_cast(&frag_c[i][j][1][2]), + frag_s[j / 2][2 * (j % 2) + 1]); + } + } + } + } + + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice + barrier_acquire(&locks[slice_col], slice_idx); + global_reduce(slice_idx == 0, last); + barrier_release(&locks[slice_col], last); + } + if (last) // only the last block in a slice actually writes the result + write_result(); + slice_row = 0; + slice_col_par++; + slice_col++; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + + // Update slice k/n for scales loading + if constexpr (has_act_order) { + slice_k_start = tb_k * slice_row; + slice_k_finish = slice_k_start + tb_k * slice_iters; + slice_k_start_shared_fetch = slice_k_start; + slice_n_offset = act_s_col_tb_stride * slice_col; + + } else { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } + + start_pipes(); + } + } + } +} + + #define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \ + THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \ + else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \ + num_threads == NUM_THREADS) { \ + cudaFuncSetAttribute( \ + Marlin, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + Marlin<<>>( \ + A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n, \ + prob_k, locks); \ + } + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +typedef struct { + int max_m_blocks; + thread_config_t tb_cfg; +} exec_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, + {64, 128, 128}, + {128, 64, 128}, +}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, + {64, 128, 128}, + {128, 64, 128}, + +}; + +int get_scales_cache_size(thread_config_t const& th_config, int prob_m, + int prob_n, int prob_k, int num_bits, int group_size, + bool has_act_order, bool is_k_full) { + bool cache_scales_chunk = has_act_order && !is_k_full; + + int tb_n = th_config.thread_n; + int tb_k = th_config.thread_k; + + // Get max scale groups per thread-block + int tb_groups; + if (group_size == -1) { + tb_groups = 1; + } else if (group_size == 0) { + tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size + } else { + tb_groups = div_ceil(tb_k, group_size); + } + + if (cache_scales_chunk) { + int load_groups = + tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K + load_groups = max(load_groups, 32); // We load at least 32 scale groups + return load_groups * tb_n * 2; + + } else { + int tb_scales = tb_groups * tb_n * 2; + + return tb_scales * pipe_stages; + } +} + +bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks, + int prob_m, int prob_n, int prob_k, int num_bits, + int scales_cache_size, int max_shared_mem) { + int pack_factor = 32 / num_bits; + + // Get B size + int tb_k = th_config.thread_k; + int tb_n = th_config.thread_n; + + int b_size = (tb_k * tb_n / pack_factor) * 4; + + // Get A size + int m_blocks = div_ceil(prob_m, 16); + int tb_max_m = 16; + + while (true) { + if (m_blocks >= max_m_blocks) { + tb_max_m *= max_m_blocks; + break; + } + + max_m_blocks--; + if (max_m_blocks == 0) { + TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks); + } + } + + int a_size = (tb_max_m * tb_k) * 2; + + float pipe_size = (a_size + b_size) * pipe_stages; + + TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity + + return pipe_size < 0.95f * (max_shared_mem - scales_cache_size); +} + +bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, + int prob_m, int prob_n, int prob_k, int num_bits, + int group_size, bool has_act_order, bool is_k_full, + int max_shared_mem) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + // Determine cache for scales + int scales_cache_size = + get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, + group_size, has_act_order, is_k_full); + + // Check that pipeline fits into cache + if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k, + num_bits, scales_cache_size, max_shared_mem)) { + return false; + } + + return true; +} + +exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, + int num_bits, int group_size, + bool has_act_order, bool is_k_full, + int max_shared_mem) { + int max_m_blocks = 4; + while (max_m_blocks > 0) { + if (prob_m <= 16) { + for (auto th_config : small_batch_thread_configs) { + if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k, + num_bits, group_size, has_act_order, is_k_full, + max_shared_mem)) { + return exec_config_t{max_m_blocks, th_config}; + } + } + } else { + for (auto th_config : large_batch_thread_configs) { + if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k, + num_bits, group_size, has_act_order, is_k_full, + max_shared_mem)) { + return exec_config_t{max_m_blocks, th_config}; + } + } + } + + max_m_blocks--; // Process less M blocks per invocation to reduce cache + // usage + } + + return exec_config_t{0, {-1, -1, -1}}; +} + + #define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) + +template +void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, + void* g_idx, void* perm, void* a_tmp, int prob_m, + int prob_n, int prob_k, void* workspace, int num_bits, + bool has_act_order, bool is_k_full, int num_groups, + int group_size, int dev, cudaStream_t stream, int thread_k, + int thread_n, int sms, int max_par) { + TORCH_CHECK(num_bits == 4 || num_bits == 8, + "num_bits must be 4 or 8. Got = ", num_bits); + TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, + ", ", prob_n, ", ", prob_k, "]"); + + int tot_m = prob_m; + int tot_m_blocks = div_ceil(tot_m, 16); + int pad = 16 * tot_m_blocks - tot_m; + + if (sms == -1) { + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); + } + + int max_shared_mem = 0; + cudaDeviceGetAttribute(&max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + TORCH_CHECK(max_shared_mem > 0); + + // Set thread config + exec_config_t exec_cfg; + if (thread_k != -1 && thread_n != -1) { + // User-defined config + exec_cfg = + exec_config_t{4, thread_config_t{thread_k, thread_n, default_threads}}; + } else { + // Auto config + exec_cfg = + determine_thread_config(prob_m, prob_n, prob_k, num_bits, group_size, + has_act_order, is_k_full, max_shared_mem); + } + + TORCH_CHECK(exec_cfg.max_m_blocks > 0 && + is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks, + prob_m, prob_n, prob_k, num_bits, group_size, + has_act_order, is_k_full, max_shared_mem), + "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks, + ", thread_k = ", exec_cfg.tb_cfg.thread_k, + ", thread_n = ", exec_cfg.tb_cfg.thread_n, + ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [", + prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits, + ", group_size = ", group_size, + ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full, + ", max_shared_mem = ", max_shared_mem); + + int num_threads = exec_cfg.tb_cfg.num_threads; + thread_k = exec_cfg.tb_cfg.thread_k; + thread_n = exec_cfg.tb_cfg.thread_n; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + + int blocks = sms; + + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + + int group_blocks = 0; + if (has_act_order) { + if (is_k_full) { + TORCH_CHECK(group_size != -1); + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } else { + TORCH_CHECK(group_size == 0); + group_blocks = 0; + } + + } else { + if (group_size == -1) { + group_blocks = -1; + } else { + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + } + + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + const int4* s_ptr = (const int4*)s; + const int* g_idx_ptr = (const int*)g_idx; + const int* perm_ptr = (const int*)perm; + int4* a_tmp_ptr = (int4*)a_tmp; + + int* locks = (int*)workspace; + + if (has_act_order) { + // Permute A columns + int block_rows = div_ceil(prob_m, blocks); + permute_cols_kernel<<>>( + A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows); + A_ptr = a_tmp_ptr; + } + + // If we have a full K, then we can run the non-act-order version of Marlin + // (since the weight rows are reordered by increasing group ids, and by having + // a full K, we have full original groups) + if (is_k_full) { + has_act_order = false; + } + + // Main loop + for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) { + int thread_m_blocks = tot_m_blocks - i; + prob_m = tot_m - 16 * i; + int par = 1; + if (thread_m_blocks > exec_cfg.max_m_blocks) { + // Note that parallel > 1 currently only works for inputs without any + // padding + par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks); + if (par > max_par) par = max_par; + prob_m = (16 * exec_cfg.max_m_blocks) * par; + i += exec_cfg.max_m_blocks * (par - 1); + thread_m_blocks = exec_cfg.max_m_blocks; + } + + // Define kernel configurations + if (false) { + } + CALL_IF(4, 32, 2, 256) + CALL_IF(4, 16, 4, 256) + CALL_IF(4, 8, 8, 256) + CALL_IF(4, 8, 4, 128) + CALL_IF(4, 4, 8, 128) + CALL_IF(8, 32, 2, 256) + CALL_IF(8, 16, 4, 256) + CALL_IF(8, 8, 8, 256) + CALL_IF(8, 8, 4, 128) + CALL_IF(8, 4, 8, 128) + else { + TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + + str(prob_n) + ", " + str(prob_k) + "]" + + ", has_act_order = " + str(has_act_order) + + ", num_groups = " + str(num_groups) + + ", group_size = " + str(group_size) + + ", thread_m_blocks = " + str(thread_m_blocks) + + ", thread_n_blocks = " + str(thread_n_blocks) + + ", thread_k_blocks = " + str(thread_k_blocks)); + } + + A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; + C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; + } +} + +} // namespace gptq_marlin + +torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& g_idx, + torch::Tensor& perm, torch::Tensor& workspace, + int64_t num_bits, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full) { + // Verify num_bits + TORCH_CHECK(num_bits == 4 || num_bits == 8, + "num_bits must be 4 or 8. Got = ", num_bits); + int pack_factor = 32 / num_bits; + + // Verify A + TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), + ", size_m = ", size_m); + TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1), + ", size_k = ", size_k); + + // Verify B + TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k, + " is not divisible by tile_size = ", gptq_marlin::tile_size); + TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0), + "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0), + ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size); + TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0, + "b_q_weight.size(1) = ", b_q_weight.size(1), + " is not divisible by tile_size = ", gptq_marlin::tile_size); + int actual_size_n = + (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor; + TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n, + ", actual_size_n = ", actual_size_n); + + // Verify device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU"); + TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous"); + + TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU"); + TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous"); + + // Alloc buffers + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c = torch::empty({size_m, size_n}, options); + torch::Tensor a_tmp = torch::empty({size_m, size_k}, options); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel (can usually be left as auto -1) + int sms = -1; + + // Verify g_idx and perm + TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) || + (g_idx.size(0) == size_k && perm.size(0) == size_k), + "Unexpected g_idx.size(0) = ", g_idx.size(0), + " and perm.size(0) = ", perm.size(0), + ", where size_k = ", size_k); + + // Detect groupsize and act_order + int num_groups = -1; + int group_size = -1; + bool has_act_order = g_idx.size(0) != 0; + + int b_rank = b_scales.sizes().size(); + TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2"); + TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1), + " is not size_n = ", size_n); + num_groups = b_scales.size(0); + + if (has_act_order) { + if (is_k_full) { + TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1"); + TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by num_groups = ", num_groups); + group_size = size_k / num_groups; + } else { + group_size = 0; + } + + } else { + if (num_groups > 1) { + TORCH_CHECK( + size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by b_scales.size(0) = ", b_scales.size(0)); + group_size = size_k / num_groups; + } else { + group_size = -1; + } + } + + // Verify workspace size + TORCH_CHECK( + size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n, + ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n); + int min_workspace_size = + (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par; + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = ", workspace.numel(), + " is below min_workspace_size = ", min_workspace_size); + + int dev = a.get_device(); + if (a.scalar_type() == at::ScalarType::Half) { + gptq_marlin::marlin_mm_f16i4( + a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + b_scales.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), size_m, size_n, size_k, + workspace.data_ptr(), num_bits, has_act_order, is_k_full, num_groups, + group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, + thread_n, sms, gptq_marlin::max_par); + } else if (a.scalar_type() == at::ScalarType::BFloat16) { + gptq_marlin::marlin_mm_f16i4( + a.data_ptr(), b_q_weight.data_ptr(), + c.data_ptr(), b_scales.data_ptr(), + g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), + size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, + is_k_full, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + gptq_marlin::max_par); + } else { + TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16"); + } + + return c; +} + +#endif diff --git a/server/marlin/marlin_kernels/gptq_marlin.cuh b/server/marlin/marlin_kernels/gptq_marlin.cuh new file mode 100644 index 00000000..42af4495 --- /dev/null +++ b/server/marlin/marlin_kernels/gptq_marlin.cuh @@ -0,0 +1,76 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace gptq_marlin { + +// 8 warps are a good choice since every SM has 4 schedulers and having more +// than 1 warp per schedule allows some more latency hiding. At the same time, +// we want relatively few warps to have many registers per warp and small tiles. +static constexpr int default_threads = 256; + +static constexpr int pipe_stages = + 4; // 4 pipeline stages fit into shared memory + +static constexpr int min_thread_n = 64; +static constexpr int min_thread_k = 64; + +static constexpr int tile_size = 16; +static constexpr int max_par = 16; + +template +struct Vec { + T elems[n]; + __device__ T& operator[](int i) { return elems[i]; } +}; + +using I4 = Vec; + +constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 +// No support for async +#else + +__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, + bool pred = true) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); +} + +__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); +} + +__device__ inline void cp_async_fence() { + asm volatile("cp.async.commit_group;\n" ::); +} + +template +__device__ inline void cp_async_wait() { + asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); +} + +#endif + +} // namespace gptq_marlin diff --git a/server/marlin/marlin_kernels/gptq_marlin_dtypes.cuh b/server/marlin/marlin_kernels/gptq_marlin_dtypes.cuh new file mode 100644 index 00000000..ca1b7099 --- /dev/null +++ b/server/marlin/marlin_kernels/gptq_marlin_dtypes.cuh @@ -0,0 +1,77 @@ + +#ifndef _data_types_cuh +#define _data_types_cuh +#include "gptq_marlin.cuh" +#include +#include + +namespace gptq_marlin { + +template +class ScalarType {}; + +template <> +class ScalarType { + public: + using scalar_t = half; + using scalar_t2 = half2; + + // Matrix fragments for tensor core instructions; their precise layout is + // documented here: + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type + using FragA = Vec; + using FragB = Vec; + using FragC = Vec; + using FragS = Vec; + + static __device__ float inline num2float(const half x) { + return __half2float(x); + } + + static __device__ half2 inline num2num2(const half x) { + return __half2half2(x); + } + + static __device__ half2 inline nums2num2(const half x1, const half x2) { + return __halves2half2(x1, x2); + } + + static __host__ __device__ half inline float2num(const float x) { + return __float2half(x); + } +}; + +template <> +class ScalarType { + public: + using scalar_t = nv_bfloat16; + using scalar_t2 = nv_bfloat162; + + using FragA = Vec; + using FragB = Vec; + using FragC = Vec; + using FragS = Vec; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + static __device__ float inline num2float(const nv_bfloat16 x) { + return __bfloat162float(x); + } + + static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) { + return __bfloat162bfloat162(x); + } + + static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1, + const nv_bfloat16 x2) { + return __halves2bfloat162(x1, x2); + } + + static __host__ __device__ nv_bfloat16 inline float2num(const float x) { + return __float2bfloat16(x); + } +#endif +}; + +} // namespace gptq_marlin + +#endif diff --git a/server/marlin/marlin_kernels/gptq_marlin_repack.cu b/server/marlin/marlin_kernels/gptq_marlin_repack.cu new file mode 100644 index 00000000..4adc158e --- /dev/null +++ b/server/marlin/marlin_kernels/gptq_marlin_repack.cu @@ -0,0 +1,350 @@ +#include "gptq_marlin.cuh" + +namespace gptq_marlin { + +static constexpr int repack_stages = 8; + +static constexpr int repack_threads = 256; + +static constexpr int tile_k_size = tile_size; +static constexpr int tile_n_size = tile_k_size * 4; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +template +__global__ void marlin_repack_kernel( + uint32_t const* __restrict__ b_q_weight_ptr, + uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr, + int size_k, int size_n) {} + +} // namespace gptq_marlin + +torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, + int64_t size_k, int64_t size_n, + int64_t num_bits) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0"); + return torch::empty({1, 1}); +} + +#else + +template +__global__ void marlin_repack_kernel( + uint32_t const* __restrict__ b_q_weight_ptr, + uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr, + int size_k, int size_n) { + constexpr int pack_factor = 32 / num_bits; + + int k_tiles = size_k / tile_k_size; + int n_tiles = size_n / tile_n_size; + int block_k_tiles = div_ceil(k_tiles, gridDim.x); + + int start_k_tile = blockIdx.x * block_k_tiles; + if (start_k_tile >= k_tiles) { + return; + } + + int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles); + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + extern __shared__ int4 sh[]; + + constexpr int perm_size = tile_k_size / 4; + + int4* sh_perm_ptr = sh; + int4* sh_pipe_ptr = sh_perm_ptr; + if constexpr (has_perm) { + sh_pipe_ptr += perm_size; + } + + constexpr int tile_ints = tile_k_size / pack_factor; + + constexpr int stage_n_threads = tile_n_size / 4; + constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints; + constexpr int stage_size = stage_k_threads * stage_n_threads; + + auto load_perm_to_shared = [&](int k_tile_id) { + int first_k_int4 = (k_tile_id * tile_k_size) / 4; + + int4 const* perm_int4_ptr = reinterpret_cast(perm_ptr); + + if (threadIdx.x < perm_size) { + sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x]; + } + __syncthreads(); + }; + + auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) { + if (n_tile_id >= n_tiles) { + cp_async_fence(); + return; + } + + int first_n = n_tile_id * tile_n_size; + + int4* sh_ptr = sh_pipe_ptr + stage_size * pipe; + + if constexpr (has_perm) { + if (threadIdx.x < stage_size) { + int k_id = threadIdx.x / stage_n_threads; + int n_id = threadIdx.x % stage_n_threads; + + uint32_t const* sh_perm_int_ptr = + reinterpret_cast(sh_perm_ptr); + + int src_k = sh_perm_int_ptr[k_id]; + int src_k_packed = src_k / pack_factor; + + cp_async4( + &sh_ptr[k_id * stage_n_threads + n_id], + reinterpret_cast(&( + b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)]))); + } + + } else { + if (threadIdx.x < stage_size) { + int k_id = threadIdx.x / stage_n_threads; + int n_id = threadIdx.x % stage_n_threads; + + int first_k = k_tile_id * tile_k_size; + int first_k_packed = first_k / pack_factor; + + cp_async4(&sh_ptr[k_id * stage_n_threads + n_id], + reinterpret_cast( + &(b_q_weight_ptr[(first_k_packed + k_id) * size_n + + first_n + (n_id * 4)]))); + } + } + + cp_async_fence(); + }; + + auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) { + if (n_tile_id >= n_tiles) { + return; + } + + int warp_id = threadIdx.x / 32; + int th_id = threadIdx.x % 32; + + if (warp_id >= 4) { + return; + } + + int tc_col = th_id / 4; + int tc_row = (th_id % 4) * 2; + + constexpr int tc_offsets[4] = {0, 1, 8, 9}; + + int cur_n = warp_id * 16 + tc_col; + + constexpr int sh_stride = 64; + constexpr uint32_t mask = (1 << num_bits) - 1; + + int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe; + uint32_t* sh_stage_int_ptr = reinterpret_cast(sh_stage_ptr); + + uint32_t* sh_perm_int_ptr = reinterpret_cast(sh_perm_ptr); + + uint32_t vals[8]; + + if constexpr (has_perm) { + for (int i = 0; i < 4; i++) { + int k_idx = tc_row + tc_offsets[i]; + + uint32_t src_k = sh_perm_int_ptr[k_idx]; + uint32_t src_k_pos = src_k % pack_factor; + + uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n]; + uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask; + + uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8]; + uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask; + + vals[i] = b1_cur_val; + vals[4 + i] = b2_cur_val; + } + + } else { + uint32_t b1_vals[tile_ints]; + uint32_t b2_vals[tile_ints]; + + #pragma unroll + for (int i = 0; i < tile_ints; i++) { + b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i]; + b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i]; + } + + #pragma unroll + for (int i = 0; i < 4; i++) { + int cur_elem = tc_row + tc_offsets[i]; + int cur_int = cur_elem / pack_factor; + int cur_pos = cur_elem % pack_factor; + + vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask; + vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask; + } + } + + constexpr int tile_size = tile_k_size * tile_n_size / pack_factor; + int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size; + + // Result of: + // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h + if constexpr (num_bits == 4) { + constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + + uint32_t res = 0; + #pragma unroll + for (int i = 0; i < 8; i++) { + res |= vals[pack_idx[i]] << (i * 4); + } + + out_ptr[out_offset + th_id * 4 + warp_id] = res; + + } else { + constexpr int pack_idx[4] = {0, 2, 1, 3}; + + uint32_t res1 = 0; + uint32_t res2 = 0; + #pragma unroll + for (int i = 0; i < 4; i++) { + res1 |= vals[pack_idx[i]] << (i * 8); + res2 |= vals[4 + pack_idx[i]] << (i * 8); + } + + out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1; + out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2; + } + }; + + auto start_pipes = [&](int k_tile_id, int n_tile_id) { + #pragma unroll + for (int pipe = 0; pipe < repack_stages - 1; pipe++) { + fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe); + } + + wait_for_stage(); + }; + #pragma unroll + for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) { + int n_tile_id = 0; + + if constexpr (has_perm) { + load_perm_to_shared(k_tile_id); + } + + start_pipes(k_tile_id, n_tile_id); + + while (n_tile_id < n_tiles) { + #pragma unroll + for (int pipe = 0; pipe < repack_stages; pipe++) { + fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id, + n_tile_id + pipe + repack_stages - 1); + repack_tile(pipe, k_tile_id, n_tile_id + pipe); + wait_for_stage(); + } + n_tile_id += repack_stages; + } + } +} + +} // namespace gptq_marlin + + #define CALL_IF(NUM_BITS, HAS_PERM) \ + else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \ + cudaFuncSetAttribute( \ + gptq_marlin::marlin_repack_kernel, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + gptq_marlin::marlin_repack_kernel \ + <<>>( \ + b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \ + } + +torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, + int64_t size_k, int64_t size_n, + int64_t num_bits) { + // Verify compatibility with marlin tile of 16x64 + TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k, + " is not divisible by tile_k_size = ", gptq_marlin::tile_k_size); + TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n, + " is not divisible by tile_n_size = ", gptq_marlin::tile_n_size); + + TORCH_CHECK(num_bits == 4 || num_bits == 8, + "num_bits must be 4 or 8. Got = ", num_bits); + int const pack_factor = 32 / num_bits; + + // Verify B + TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0), + "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0), + ", size_k = ", size_k, ", pack_factor = ", pack_factor); + TORCH_CHECK(b_q_weight.size(1) == size_n, + "b_q_weight.size(1) = ", b_q_weight.size(1), + " is not size_n = ", size_n); + + // Verify device and strides + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt"); + + TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU"); + TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous"); + TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt"); + + // Alloc buffers + const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight)); + auto options = torch::TensorOptions() + .dtype(b_q_weight.dtype()) + .device(b_q_weight.device()); + torch::Tensor out = + torch::empty({size_k / gptq_marlin::tile_size, + size_n * gptq_marlin::tile_size / pack_factor}, + options); + + // Detect if there is act_order + bool has_perm = perm.size(0) != 0; + + // Get ptrs + uint32_t const* b_q_weight_ptr = + reinterpret_cast(b_q_weight.data_ptr()); + uint32_t const* perm_ptr = reinterpret_cast(perm.data_ptr()); + uint32_t* out_ptr = reinterpret_cast(out.data_ptr()); + + // Get dev info + int dev = b_q_weight.get_device(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); + int blocks; + cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); + + int max_shared_mem = 0; + cudaDeviceGetAttribute(&max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + TORCH_CHECK(max_shared_mem > 0); + + if (false) { + } + CALL_IF(4, false) + CALL_IF(4, true) + CALL_IF(8, false) + CALL_IF(8, true) + else { + TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits, + ", has_perm = ", has_perm); + } + + return out; +} + +#endif diff --git a/server/marlin/marlin_kernels/marlin_cuda_kernel.cu b/server/marlin/marlin_kernels/marlin_cuda_kernel.cu new file mode 100644 index 00000000..d124c014 --- /dev/null +++ b/server/marlin/marlin_kernels/marlin_cuda_kernel.cu @@ -0,0 +1,1136 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +template +inline std::string str(T x) { + return std::to_string(x); +} + +namespace marlin { + +constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + +// Instances of `Vec` are used to organize groups of >>registers<<, as needed +// for instance as inputs to tensor core operations. Consequently, all +// corresponding index accesses must be compile-time constants, which is why we +// extensively use `#pragma unroll` throughout the kernel code to guarantee +// this. +template +struct Vec { + T elems[n]; + __device__ T& operator[](int i) { return elems[i]; } +}; + +using I4 = Vec; + +// Matrix fragments for tensor core instructions; their precise layout is +// documented here: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type +using FragA = Vec; +using FragB = Vec; +using FragC = Vec; +using FragS = Vec; // quantization scales + +// Predicated asynchronous global->shared copy; used for inputs A where we apply +// predication to handle batchsizes that are not multiples of 16. +__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, + bool pred = true) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); +} + +// Asynchronous global->shared copy +__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); +} + +// Async copy fence. +__device__ inline void cp_async_fence() { + asm volatile("cp.async.commit_group;\n" ::); +} + +// Wait until at most `n` async copy stages are still pending. +template +__device__ inline void cp_async_wait() { + asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); +} + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. +__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, + FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); +} + +// Lookup-table based 3-input logical operation; explicitly used for +// dequantization as the compiler does not seem to automatically recognize it in +// all cases. +template +__device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 +// values. We mostly follow the strategy in the link below, with some small +// changes: +// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h +__device__ inline FragB dequant(int q) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. + const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. +__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { + half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int* lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int* lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ s, // fp16 quantization scales of shape + // (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > 16 * thread_m_blocks) { + parallel = prob_m / (16 * thread_m_blocks); + prob_m = 16 * thread_m_blocks; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts in + // the middle of group. + if (group_blocks != -1) + iters = (group_blocks / thread_k_blocks) * + ceildiv(iters, (group_blocks / thread_k_blocks)); + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; + C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; + locks += (slice_col_par / n_tiles) * n_tiles; + slice_col = slice_col_par % n_tiles; + } + + // Compute all information about the current slice which is required for + // synchronization. + auto init_slice = [&]() { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = ceildiv(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (slice_col == n_tiles) { + A += 16 * thread_m_blocks * prob_k / 8; + C += 16 * thread_m_blocks * prob_n / 8; + locks += n_tiles; + slice_col = 0; + } + }; + init_slice(); + + int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory + // We typically use `constexpr` to indicate that this value is a compile-time + // constant + constexpr int a_sh_stride = + 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory + constexpr int a_gl_rd_delta_o = + 16 * thread_k_blocks / + 8; // delta between subsequent A tiles in global memory + int a_gl_rd_delta_i = + a_gl_stride * + (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile + constexpr int a_sh_wr_delta = + a_sh_stride * + (threads / a_gl_rd_delta_o); // between shared memory writes + constexpr int a_sh_rd_delta_o = + 2 * ((threads / 32) / + (thread_n_blocks / 4)); // between shared memory tile reads + constexpr int a_sh_rd_delta_i = + a_sh_stride * 16; // within a shared memory tile + constexpr int a_sh_stage = + a_sh_stride * (16 * thread_m_blocks); // overall size of a tile + constexpr int a_sh_wr_iters = + ceildiv(a_sh_stage, + a_sh_wr_delta); // number of shared write iterations for a tile + + int b_gl_stride = 16 * prob_n / 32; + constexpr int b_sh_stride = 32 * thread_n_blocks / 4; + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); + constexpr int b_sh_wr_delta = threads; + constexpr int b_sh_rd_delta = threads; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_sh_stage = s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. + int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = + b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x; + int b_sh_rd = threadIdx.x; + + int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + int s_sh_wr = threadIdx.x; + int s_sh_rd; + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + if (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. + int a_sh_wr_trans[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. + int4* sh_a = sh; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_s = sh_b + (stages * b_sh_stage); + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; + + // Zero accumulators. + auto zero_accums = [&]() { + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + cp_async4_pred( + &sh_a_stage[a_sh_wr_trans[i]], + &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], + a_sh_wr_pred[i]); + } + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); + B_ptr[i] += b_gl_rd_delta_o; + } + // Only fetch scales if this tile starts a new group + if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) { + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); + s_gl_rd += s_gl_rd_delta; + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. + auto fetch_to_registers = [&](int k, int pipe) { + // It may seem inefficient that we reload the groups for every sub-tile; + // however, this does not seem to be a significant bottleneck, while some + // theoretically better attempts have lead to bad instruction ordering by + // the compiler and correspondingly a noticeable drop in performance. + if (group_blocks != -1) { + int4* sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + frag_b_quant[k % 2] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); + }; + + // Execute the actual tensor core matmul of a sub-tile. + auto matmul = [&](int k) { + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. + #pragma unroll + for (int j = 0; j < 4; j++) { + int b_quant = frag_b_quant[k % 2][j]; + int b_quant_shift = b_quant >> 8; + FragB frag_b0 = dequant(b_quant); + // If there are no groups, we can just scale the final output once and can + // avoid doing so for each weight. + if (group_blocks != -1) scale(frag_b0, frag_s[k % 2][j], 0); + FragB frag_b1 = dequant(b_quant_shift); + if (group_blocks != -1) scale(frag_b1, frag_s[k % 2][j], 1); + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride; + constexpr int red_sh_stride = b_sh_stride * 4 * 2; + constexpr int red_sh_delta = b_sh_stride; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + + (threadIdx.x % b_sh_stride); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + + #pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { + #pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { + #pragma unroll + for (int j = 0; j < 4 * 2; j++) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + #pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { + #pragma unroll + for (int i = 0; i < 4 * 2; i++) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + if (threadIdx.x < active_threads) { + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + int row = (threadIdx.x % 32) / 4; + + if (!first) { + // Interestingly, doing direct global accesses here really seems to mess up + // the compiler and lead to slowdowns, hence we also use async-copies even + // though these fetches are not actually asynchronous. + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + cp_async4_pred( + &sh[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + + c_gl_wr_delta_i * (i % 2)], + i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); + } + cp_async_fence(); + cp_async_wait<0>(); + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { + if (!first) { + int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += + __half2float(reinterpret_cast<__half*>(&c_red)[j]); + } + } + if (!last) { + int4 c; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast<__half*>(&c)[j] = + __float2half(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); + } + C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = + c; + } + } + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. + auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + int c_gl_wr_end = c_gl_stride * prob_m; + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + half2 res = __halves2half2(__float2half(c0), __float2half(c1)); + if (group_blocks == + -1) // for per-column quantization we finally apply the scale here + res = __hmul2(res, s[0]); + ((half2*)sh)[idx] = res; + }; + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; + i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + if (c_gl_wr < c_gl_wr_end) { + C[c_gl_wr] = sh[c_sh_rd]; + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + }; + + // Start global fetch and register load pipelines. + auto start_pipes = [&]() { + #pragma unroll + for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); + zero_accums(); + wait_for_stage(); + fetch_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + }; + start_pipes(); + + // Main loop. + while (slice_iters) { + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines have + // even length meaning that the next iteration will always start at index 0. + #pragma unroll + for (int pipe = 0; pipe < stages;) { + #pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) break; + } + a_gl_rd += a_gl_rd_delta_o * stages; + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. + if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if (group_blocks == -1 && last) { + if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); + cp_async_fence(); + } + thread_block_reduce(); + if (group_blocks == -1 && last) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice + barrier_acquire(&locks[slice_col], slice_idx); + global_reduce(slice_idx == 0, last); + barrier_release(&locks[slice_col], last); + } + if (last) // only the last block in a slice actually writes the result + write_result(); + slice_row = 0; + slice_col_par++; + slice_col++; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + start_pipes(); + } + } + } +} + +#else + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ s, // fp16 quantization scales of shape + // (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization +) { + // Marlin is not implemented yet for SM < 8.0 + assert(false); + return; +} + +#endif + +// 8 warps are a good choice since every SM has 4 schedulers and having more +// than 1 warp per schedule allows some more latency hiding. At the same time, +// we want relatively few warps to have many registers per warp and small tiles. +const int USER_THREADS = + 256; // Note: This is only used with user-provided thread_k/n +const int STAGES = 4; // 4 pipeline stages fit into shared memory +const int SHARED_MEM = + 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) + +static constexpr int min_thread_n = 64; +static constexpr int min_thread_k = 64; + +static constexpr int tile_size = 16; +static constexpr int max_par = 16; + +static constexpr int pack_factor_4bit = + 8; // We have 8 4-bit vals inside a 32 bit + +#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + GROUP_BLOCKS, NUM_THREADS) \ + else if (thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ + cudaFuncSetAttribute(Marlin, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + SHARED_MEM); \ + Marlin<<>>( \ + A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks); \ + } + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, // Default + {128, 64, 128}, // Reduce N 2X, same K + {64, 256, 256}, // Reduce K 2X, increase N 2X + {64, 128, 128}, // Reduce K 2X, same N +}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, // Default + {128, 128, 256}, // Reduce N 2X, increase K 2X + {64, 128, 128}, // Reduce N 2X, same K + {128, 64, 128}, // Reduce N 4X, increase K 2X +}; + +bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, + int prob_k) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // thread_k can be only 128 or 64 (because it must be less than groupsize + // which is 128) + if (th_config.thread_k != 128 && th_config.thread_k != 64) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + return true; +} + +thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { + if (prob_m <= 16) { + for (auto th_config : small_batch_thread_configs) { + if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { + return th_config; + } + } + + } else { + for (auto th_config : large_batch_thread_configs) { + if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { + return th_config; + } + } + } + + return thread_config_t{-1, -1, -1}; +} + +#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) + +void marlin_cuda(const void* A, const void* B, void* C, void* s, int prob_m, + int prob_n, int prob_k, void* workspace, int groupsize = -1, + int dev = 0, cudaStream_t stream = 0, int thread_k = -1, + int thread_n = -1, int sms = -1, int max_par = 16) { + int tot_m = prob_m; + int tot_m_blocks = ceildiv(tot_m, 16); + int pad = 16 * tot_m_blocks - tot_m; + + if (sms == -1) + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); + + // Set thread config + thread_config_t th_config; + if (thread_k != -1 && thread_n != -1) { + // User-defined config + th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; + } else { + // Auto config + th_config = determine_thread_config(prob_m, prob_n, prob_k); + } + + if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { + throw std::runtime_error( + "Invalid thread config: thread_k = " + str(th_config.thread_k) + + ", thread_n = " + str(th_config.thread_n) + + ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + + str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); + } + + // Uncomment for debug + // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) + + // ", thread_n = " + str(th_config.thread_n) + + // ", num_threads = " + str(th_config.num_threads) + " for + // MKN = [" + str(prob_m) + + // ", " + str(prob_k) + ", " + str(prob_n) + "]\n"; + + int num_threads = th_config.num_threads; + thread_k = th_config.thread_k; + thread_n = th_config.thread_n; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; + int blocks = sms; + + if (prob_m == 0 || prob_n == 0 || prob_k == 0) { + return; + } + + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + if (group_blocks != -1) { + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + const int4* s_ptr = (const int4*)s; + + int* locks = (int*)workspace; + + for (int i = 0; i < tot_m_blocks; i += 4) { + int thread_m_blocks = tot_m_blocks - i; + prob_m = tot_m - 16 * i; + int par = 1; + if (thread_m_blocks > 4) { + // Note that parallel > 1 currently only works for inputs without any + // padding + par = (16 * thread_m_blocks - pad) / 64; + if (par > max_par) par = max_par; + prob_m = 64 * par; + i += 4 * (par - 1); + thread_m_blocks = 4; + } + + // For compilation speed, we only define the kernel configurations that have + // seemed useful (in terms of performance) in our testing, however many more + // are, in principle, possible. + if (false) { + } + CALL_IF(8, 8, 256) + CALL_IF(16, 4, 256) + CALL_IF(8, 4, 128) + CALL_IF(4, 8, 128) + else { + throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + + ", " + str(prob_k) + ", " + str(prob_n) + "]" + + ", groupsize = " + str(groupsize) + + ", thread_m_blocks = " + str(thread_m_blocks) + + ", thread_n_blocks = " + str(thread_n_blocks) + + ", thread_k_blocks = " + str(thread_k_blocks)); + } + + A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; + C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; + } +} + +} // namespace marlin + +torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& workspace, + int64_t size_m, int64_t size_n, int64_t size_k) { + // Verify M + TORCH_CHECK(size_m == a.size(0), + "Shape mismatch: a.size(0) = " + str(a.size(0)) + + ", size_m = " + str(size_m)); + + // Verify K + TORCH_CHECK(size_k == a.size(1), + "Shape mismatch: a.size(1) = " + str(a.size(1)) + + ", size_k = " + str(size_k)); + TORCH_CHECK(size_k % marlin::tile_size == 0, + "size_k = " + str(size_k) + + " is not divisible by tile_size = " + str(marlin::tile_size)); + TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0), + "Shape mismatch: b_q_weight.size(0) = " + + str(b_q_weight.size(0)) + ", size_k = " + str(size_k) + + ", tile_size = " + str(marlin::tile_size)); + + // Verify N + TORCH_CHECK(b_scales.size(1) == size_n, + "b_scales.size(1) = " + str(b_scales.size(1)) + + ", size_n = " + str(size_n)); + TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0, + "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + + " is not divisible by tile_size = " + str(marlin::tile_size)); + + int actual_size_n = + (b_q_weight.size(1) / marlin::tile_size) * marlin::pack_factor_4bit; + TORCH_CHECK( + size_n == actual_size_n, + "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n)); + + // Verify A device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + // Verify B device and strides + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + // Verify scales device and strides + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + // Alloc C matrix + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c = torch::empty({size_m, size_n}, options); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel (can usually be left as auto -1) + int sms = -1; + + // Detect groupsize + if (b_scales.size(0) != 1) { + TORCH_CHECK(size_k % b_scales.size(0) == 0, + "size_k = " + str(size_k) + + ", is not divisible by b_scales.size(0) = " + + str(b_scales.size(0))); + } + int groupsize = b_scales.size(0) == 1 ? -1 : size_k / b_scales.size(0); + + // Verify groupsize + TORCH_CHECK(groupsize == -1 || groupsize == 128, + "Unexpected groupsize = " + str(groupsize)); + + // Verify workspace size + TORCH_CHECK( + size_n % marlin::min_thread_n == 0, + "size_n = " + str(size_n) + + ", is not divisible by min_thread_n = " + str(marlin::min_thread_n)); + int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par; + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = " + str(workspace.numel()) + + " is below min_workspace_size = " + str(min_workspace_size)); + + int dev = a.get_device(); + marlin::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + b_scales.data_ptr(), size_m, size_n, size_k, + workspace.data_ptr(), groupsize, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, + sms, marlin::max_par); + + return c; +} diff --git a/server/marlin/marlin_kernels/py.typed b/server/marlin/marlin_kernels/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/server/marlin/setup.py b/server/marlin/setup.py new file mode 100644 index 00000000..844e1139 --- /dev/null +++ b/server/marlin/setup.py @@ -0,0 +1,21 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +extra_compile_args = [] + +setup( + name="marlin_kernels", + ext_modules=[ + CUDAExtension( + name="marlin_kernels", + sources=[ + "marlin_kernels/gptq_marlin.cu", + "marlin_kernels/gptq_marlin_repack.cu", + "marlin_kernels/marlin_cuda_kernel.cu", + "marlin_kernels/ext.cpp", + ], + extra_compile_args=extra_compile_args, + ), + ], + cmdclass={"build_ext": BuildExtension}, +) diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py index 3537b62d..d40b192f 100644 --- a/server/text_generation_server/layers/linear.py +++ b/server/text_generation_server/layers/linear.py @@ -1,6 +1,7 @@ from typing import Optional import torch from torch.nn import functional as F +from text_generation_server.layers.marlin import GPTQMarlinLinear from text_generation_server.utils.import_utils import SYSTEM if SYSTEM == "rocm": @@ -223,13 +224,23 @@ def get_linear(weight, bias, quantize): "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" ) elif quantize == "marlin": - from text_generation_server.layers.marlin import MarlinLinear, MarlinWeight + from text_generation_server.layers.marlin import ( + GPTQMarlinWeight, + MarlinLinear, + MarlinWeight, + ) - if not isinstance(weight, MarlinWeight): + if isinstance(weight, GPTQMarlinWeight): + linear = GPTQMarlinLinear( + weight=weight, + bias=bias, + ) + elif isinstance(weight, MarlinWeight): + linear = MarlinLinear(weight=weight, bias=bias) + else: raise NotImplementedError( f"The passed weight is not `marlin` compatible, loader needs to be updated." ) - linear = MarlinLinear(B=weight.B, s=weight.s, bias=bias) else: raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.") return linear diff --git a/server/text_generation_server/layers/marlin.py b/server/text_generation_server/layers/marlin.py index a860d84b..4d4c635e 100644 --- a/server/text_generation_server/layers/marlin.py +++ b/server/text_generation_server/layers/marlin.py @@ -1,13 +1,15 @@ from dataclasses import dataclass -from typing import Optional +from typing import Optional, Tuple, List import torch import torch.nn as nn +from text_generation_server.utils.import_utils import SYSTEM + try: - import marlin + import marlin_kernels except ImportError: - marlin = None + marlin_kernels = None try: major, _minor = torch.cuda.get_device_capability() @@ -15,9 +17,204 @@ try: except Exception: has_sm_8_0 = False + +GPTQ_MARLIN_BITS = [4, 8] +GPTQ_MARLIN_GROUP_SIZES = [-1, 32, 64, 128] MARLIN_TILE_SIZE = 16 +def _check_marlin_kernels(): + if not (SYSTEM == "cuda" and has_sm_8_0): + raise NotImplementedError( + "Using quantized Marlin models requires a GPU with CUDA capability 8.0 or later." + ) + + if marlin_kernels is None: + raise NotImplementedError( + "marlin is not installed, install it with: pip install server/marlin" + ) + + +def _check_valid_shape(in_features: int, out_features: int): + if (in_features % 128 != 0 or out_features % 64 != 0) and ( + in_features % 64 != 0 or out_features % 128 != 0 + ): + raise ValueError( + f"The GPTQ Marlin kernel does not have a valid thread configuration for weight matrix with shape ({out_features}, {in_features})." + " The shape elements must be divisible by (128, 64) or (64, 128)." + ) + + +# https://github.com/IST-DASLab/marlin/blob/2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c/marlin/__init__.py#L40C1-L68C54 +def _get_perms() -> Tuple[List[int], List[int]]: + scale_perm = [] + for i in range(8): + scale_perm.extend([i + 8 * j for j in range(8)]) + scale_perm_single = [] + for i in range(4): + scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) + return scale_perm, scale_perm_single + + +_scale_perm, _scale_perm_single = _get_perms() + + +def permute_scales(scales: torch.Tensor): + out_features = scales.shape[1] + if scales.shape[0] == 1: + scales = scales.reshape((-1, len(_scale_perm_single)))[:, _scale_perm_single] + else: + scales = scales.reshape((-1, len(_scale_perm)))[:, _scale_perm] + return scales.reshape((-1, out_features)).contiguous() + + +@dataclass +class GPTQMarlinWeight: + """ + Repacked GPTQ Marlin weights. + """ + + qweight: torch.Tensor + scales: torch.Tensor + g_idx: torch.Tensor + perm: torch.Tensor + bits: int + is_full_k: bool + + def __post_init__(self): + assert self.qweight.dtype == torch.int32 + assert self.scales.dtype == torch.float16 + assert self.g_idx.dtype == torch.int32 + assert self.perm.dtype == torch.int32 + + +def repack_gptq_for_marlin( + *, + qweight: torch.Tensor, + scales: torch.Tensor, + g_idx: torch.Tensor, + bits: int, + desc_act: bool, + groupsize: int, + sym: bool, + sharded_infeatures: bool, +) -> GPTQMarlinWeight: + """Convert GPTQ weights to a layout that's compatible with GPTQ-Marlin kernels.""" + _check_marlin_kernels() + assert marlin_kernels is not None + + if bits not in GPTQ_MARLIN_BITS: + supported_bits = ", ".join(str(b) for b in GPTQ_MARLIN_BITS) + raise RuntimeError( + f"Repacking {bits}-bit GPTQ weights as Marlin is not supported, must be one of: {supported_bits}" + ) + + if groupsize not in GPTQ_MARLIN_GROUP_SIZES: + supported_sizes = ", ".join(str(b) for b in GPTQ_MARLIN_GROUP_SIZES) + raise RuntimeError( + f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}" + ) + if not sym: + raise RuntimeError( + "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported." + ) + + weights_per_int = 32 // bits + in_features = qweight.shape[0] * weights_per_int + out_features = qweight.shape[1] + + if in_features % groupsize != 0: + raise ValueError( + f"Number of input features ({in_features}) not divisible by group size ({groupsize})" + ) + + if desc_act and groupsize != -1: + perm = torch.argsort(g_idx).to(torch.int) + g_idx = g_idx[perm] + else: + perm = torch.empty(0, dtype=torch.int, device=qweight.device) + g_idx = torch.empty(0, dtype=torch.int, device=qweight.device) + + repacked = marlin_kernels.gptq_marlin_repack( + qweight, perm, in_features, out_features, bits + ) + + scales = permute_scales(scales) + + is_full_k = not (desc_act and sharded_infeatures) + + return GPTQMarlinWeight( + qweight=repacked, + scales=scales, + g_idx=g_idx, + perm=perm, + bits=bits, + is_full_k=is_full_k, + ) + + +class GPTQMarlinLinear(nn.Module): + """ + Linear layer for GPTQ weights that were converted for the GPTQ-Marlin + kernels. + """ + + def __init__( + self, + *, + weight: GPTQMarlinWeight, + bias: Optional[torch.Tensor], + ): + super().__init__() + + _check_marlin_kernels() + assert marlin_kernels is not None + + in_features = weight.qweight.shape[0] * MARLIN_TILE_SIZE + out_features = weight.scales.shape[1] + _check_valid_shape(in_features=in_features, out_features=out_features) + + self.bits = weight.bits + self.is_full_k = weight.is_full_k + + self.register_buffer("qweight", weight.qweight) + self.register_buffer("scales", weight.scales) + self.register_buffer("g_idx", weight.g_idx) + self.register_buffer("perm", weight.perm) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + + self.workspace = torch.zeros( + out_features // 64 * 16, dtype=torch.int, device=weight.qweight.device + ) + + def forward(self, A: torch.Tensor) -> torch.Tensor: + assert marlin_kernels is not None + + A_flat = A.view(-1, A.shape[-1]) + C = marlin_kernels.gptq_marlin_gemm( + A_flat, + self.qweight, + self.scales, + self.g_idx, + self.perm, + self.workspace, + self.bits, + A_flat.shape[0], + self.scales.shape[1], + A_flat.shape[1], + self.is_full_k, + ) + C = C.reshape(A.shape[:-1] + (self.scales.shape[1],)) + + if self.bias is not None: + C += self.bias + + return C + + @dataclass class MarlinWeight: """ @@ -31,28 +228,20 @@ class MarlinWeight: B: torch.Tensor s: torch.Tensor + def __post_init__(self): + assert self.B.dtype == torch.int32 + assert self.s.dtype == torch.float16 + class MarlinLinear(nn.Module): - def __init__( - self, *, B: torch.Tensor, s: torch.Tensor, bias: Optional[torch.Tensor] - ): + def __init__(self, *, weight: MarlinWeight, bias: Optional[torch.Tensor]): super().__init__() - if not has_sm_8_0: - raise NotImplementedError( - "Using quantized marlin models requires CUDA capability 8.0 or later" - ) + _check_marlin_kernels() + assert marlin_kernels is not None - if marlin is None: - raise NotImplementedError( - "You do not seem to have marlin installed, either install it (cd server && make install-marlin)" - ) - - assert B.dtype == torch.int32 - assert s.dtype == torch.float16 - - in_features = B.shape[0] * MARLIN_TILE_SIZE - out_features = s.shape[1] + in_features = weight.B.shape[0] * MARLIN_TILE_SIZE + out_features = weight.s.shape[1] assert ( in_features % 128 == 0 ), f"Number of input features ({in_features}) not divisable by 128" @@ -60,35 +249,36 @@ class MarlinLinear(nn.Module): out_features % 256 == 0 ), f"Number of output features ({out_features}) not divisable by 256" - group_size = -1 if s.shape[0] == 1 else in_features // s.shape[0] - assert group_size in { + groupsize = -1 if weight.s.shape[0] == 1 else in_features // weight.s.shape[0] + assert groupsize in { -1, 128, - }, f"Group size must be -1 or 128, was {group_size}" + }, f"Group size must be -1 or 128, was {groupsize}" - self.register_buffer("B", B) - self.register_buffer("s", s) + self.register_buffer("B", weight.B) + self.register_buffer("s", weight.s) if bias is not None: self.register_buffer("bias", bias) else: self.bias = None self.workspace = torch.zeros( - out_features // 128 * 16, dtype=torch.int, device=B.device + out_features // 64 * 16, dtype=torch.int, device=weight.B.device ) def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin is not None - C = torch.empty( - A.shape[:-1] + (self.s.shape[1],), dtype=A.dtype, device=A.device - ) - marlin.mul( - A.view((-1, A.shape[-1])), + assert marlin_kernels is not None + + C = marlin_kernels.marlin_gemm( + A.view(-1, A.shape[-1]), self.B, - C.view((-1, C.shape[-1])), self.s, self.workspace, + A.shape[0], + self.s.shape[1], + A.shape[1], ) + C = C.reshape(A.shape[:-1] + (self.s.shape[1],)) if self.bias is not None: C += self.bias diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 65c9f317..38006502 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -83,7 +83,7 @@ class BLOOMSharded(CausalLM): process_group=self.process_group, prefix="transformer", ) - if config.quantize == "gptq": + if config.quantize in ["gptq", "marlin"]: weights._set_gptq_params(model_id, revision) model = BloomForCausalLM(config, weights) diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index 764dc6e2..6d315ba5 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -166,7 +166,7 @@ def _load_gqa(config, prefix: str, weights): dim=0, ) - if config.quantize not in ["gptq", "awq"]: + if config.quantize not in ["gptq", "awq", "marlin"]: weight = weight.to(dtype=weights.dtype).to(device=weights.device) head_size = config.hidden_size // config.num_attention_heads diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 4fa6516e..2ae0908c 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -81,16 +81,11 @@ def _load_multi_mqa_gptq( qzeros = torch.cat([q_tensor, kv_tensor], dim=1) qzeros = qzeros.to(device=weights.device) - ( - bits, - groupsize, - _, - quant_method, - ) = weights._get_gptq_params() - if quant_method == "gptq": + gptq_params = weights._get_gptq_params() + if gptq_params.quant_method == "gptq": g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx") g_idx = g_idx.to(device=weights.device) - elif quant_method == "awq": + elif gptq_params.quant_method == "awq": g_idx = None from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, @@ -105,8 +100,8 @@ def _load_multi_mqa_gptq( qzeros=qzeros, scales=scales, g_idx=g_idx, - bits=bits, - groupsize=groupsize, + bits=gptq_params.bits, + groupsize=gptq_params.groupsize, use_exllama=HAS_EXLLAMA, ) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index 37486e9d..c3e2e099 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -130,7 +130,7 @@ def _load_gqa(config, prefix: str, weights): dim=0, ) - if config.quantize not in ["gptq", "awq"]: + if config.quantize not in ["gptq", "awq", "marlin"]: weight = weight.to(dtype=weights.dtype).to(device=weights.device) head_size = config.hidden_size // config.num_attention_heads diff --git a/server/text_generation_server/models/flash_cohere.py b/server/text_generation_server/models/flash_cohere.py index b907ee08..1077d78e 100644 --- a/server/text_generation_server/models/flash_cohere.py +++ b/server/text_generation_server/models/flash_cohere.py @@ -55,7 +55,7 @@ class FlashCohere(FlashCausalLM): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize in ["gptq", "awq"]: + if config.quantize in ["gptq", "awq", "marlin"]: weights._set_gptq_params(model_id, revision) model = FlashCohereForCausalLM(config, weights) diff --git a/server/text_generation_server/models/flash_dbrx.py b/server/text_generation_server/models/flash_dbrx.py index d5eb1a6e..ffb6d5a6 100644 --- a/server/text_generation_server/models/flash_dbrx.py +++ b/server/text_generation_server/models/flash_dbrx.py @@ -80,7 +80,7 @@ class FlashDbrx(FlashCausalLM): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize in ["gptq", "awq"]: + if config.quantize in ["gptq", "awq", "marlin"]: weights._set_gptq_params(model_id, revision) model = FlashDbrxForCausalLM(config, weights) diff --git a/server/text_generation_server/models/flash_gemma.py b/server/text_generation_server/models/flash_gemma.py index 358883e6..1b7b2772 100644 --- a/server/text_generation_server/models/flash_gemma.py +++ b/server/text_generation_server/models/flash_gemma.py @@ -53,7 +53,7 @@ class FlashGemma(FlashCausalLM): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize in ["gptq", "awq"]: + if config.quantize in ["gptq", "awq", "marlin"]: weights._set_gptq_params(model_id, revision) # TODO hardcoded diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index c5cbd2b8..e27f0da2 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -67,7 +67,7 @@ class FlashLlama(FlashCausalLM): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize in ["gptq", "awq", "exl2"]: + if config.quantize in ["awq", "exl2", "gptq", "marlin"]: weights._set_gptq_params(model_id, revision) prefix = "" diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 081c2e2c..0fdda6d2 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -68,7 +68,7 @@ class BaseFlashMistral(FlashCausalLM): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize in ["gptq", "awq"]: + if config.quantize in ["gptq", "awq", "marlin"]: weights._set_gptq_params(model_id, revision) prefix = "" diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index adefaeb2..d3871c2f 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -58,7 +58,7 @@ class FlashNeoXSharded(FlashCausalLM): weights = Weights( filenames, device=device, dtype=dtype, process_group=self.process_group ) - if config.quantize == "gptq": + if config.quantize in ["gptq", "marlin"]: weights._set_gptq_params(model_id, revision) model = FlashGPTNeoXForCausalLM(config, weights) diff --git a/server/text_generation_server/models/flash_phi.py b/server/text_generation_server/models/flash_phi.py index 6a52c1d7..0cc67cec 100644 --- a/server/text_generation_server/models/flash_phi.py +++ b/server/text_generation_server/models/flash_phi.py @@ -53,7 +53,7 @@ class FlashPhi(FlashCausalLM): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize in ["gptq", "awq"]: + if config.quantize in ["gptq", "awq", "marlin"]: weights._set_gptq_params(model_id, revision) model = FlashPhiForCausalLM(config, weights) diff --git a/server/text_generation_server/models/flash_qwen2.py b/server/text_generation_server/models/flash_qwen2.py index 75285863..9fcfce9d 100644 --- a/server/text_generation_server/models/flash_qwen2.py +++ b/server/text_generation_server/models/flash_qwen2.py @@ -62,7 +62,7 @@ class FlashQwen2(BaseFlashMistral): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize in ["gptq", "awq"]: + if config.quantize in ["gptq", "awq", "marlin"]: weights._set_gptq_params(model_id, revision) model = Qwen2ForCausalLM(config, weights) diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py index e6350611..187f26a8 100644 --- a/server/text_generation_server/models/flash_rw.py +++ b/server/text_generation_server/models/flash_rw.py @@ -67,7 +67,7 @@ class FlashRWSharded(FlashCausalLM): config.quantize = quantize config.speculator = speculator - if config.quantize == "gptq": + if config.quantize in ["gptq", "marlin"]: weights._set_gptq_params(model_id, revision) model = FlashRWForCausalLM(config, weights) diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index 2ad36b93..a8d84fca 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -69,7 +69,7 @@ class FlashSantacoderSharded(FlashCausalLM): process_group=self.process_group, aliases={"transformer.wte.weight": ["lm_head.weight"]}, ) - if config.quantize == "gptq": + if config.quantize in ["gptq", "marlin"]: weights._set_gptq_params(model_id, revision) model = FlashSantacoderForCausalLM(config, weights) diff --git a/server/text_generation_server/models/flash_starcoder2.py b/server/text_generation_server/models/flash_starcoder2.py index 5533c9d9..1ac731be 100644 --- a/server/text_generation_server/models/flash_starcoder2.py +++ b/server/text_generation_server/models/flash_starcoder2.py @@ -61,7 +61,7 @@ class FlashStarcoder2(BaseFlashMistral): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize in ["gptq", "awq"]: + if config.quantize in ["gptq", "awq", "marlin"]: weights._set_gptq_params(model_id, revision) model = FlashStarcoder2ForCausalLM(config, weights) diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index d0f2b915..f39bd1e9 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -205,7 +205,7 @@ class GalacticaSharded(CausalLM): weights = Weights( filenames, device=device, dtype=dtype, process_group=self.process_group ) - if config.quantize == "gptq": + if config.quantize in ["gptq", "marlin"]: weights._set_gptq_params(model_id, revision) model = OPTForCausalLM(config, weights) diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index d1f8f5be..8d2cb0e1 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -58,7 +58,7 @@ class GPTNeoxSharded(CausalLM): weights = Weights( filenames, device=device, dtype=dtype, process_group=self.process_group ) - if config.quantize == "gptq": + if config.quantize in ["gptq", "marlin"]: weights._set_gptq_params(model_id, revision) model = GPTNeoxForCausalLM(config, weights) diff --git a/server/text_generation_server/models/mpt.py b/server/text_generation_server/models/mpt.py index 8d8b4909..65180e73 100644 --- a/server/text_generation_server/models/mpt.py +++ b/server/text_generation_server/models/mpt.py @@ -82,7 +82,7 @@ class MPTSharded(CausalLM): filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) - if config.quantize == "gptq": + if config.quantize in ["gptq", "marlin"]: weights._set_gptq_params(model_id, revision) config.quantize = quantize diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 87319ef0..1f4fbfcd 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -56,7 +56,7 @@ class OPTSharded(CausalLM): weights = Weights( filenames, device=device, dtype=dtype, process_group=self.process_group ) - if config.quantize == "gptq": + if config.quantize in ["gptq", "marlin"]: weights._set_gptq_params(model_id, revision) model = OPTForCausalLM(config, weights) diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index 4d5fcb25..45cfc073 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -1,4 +1,5 @@ import os +from dataclasses import dataclass from pathlib import Path from typing import Dict, List, Optional, Tuple, Union from safetensors import safe_open, SafetensorError @@ -9,6 +10,15 @@ import json from text_generation_server.utils.log import log_once +@dataclass +class _GPTQParams: + bits: int + groupsize: int + desc_act: bool + quant_method: str + sym: bool + + class Weights: def __init__( self, @@ -181,15 +191,15 @@ class Weights: f"Cannot load `{quantize}` weight, make sure the model is already quantized." ) - bits, groupsize, _, quant_method = self._get_gptq_params() + gptq_params = self._get_gptq_params() qzeros = self._get_qweight(f"{prefix}.qzeros", block_sizes) scales = self._get_qweight(f"{prefix}.scales", block_sizes) scales = scales.to(dtype=self.dtype) - if quantize == "gptq" and quant_method == "gptq": + if quantize == "gptq" and gptq_params.quant_method == "gptq": g_idx = self.get_tensor(f"{prefix}.g_idx") - elif quantize == "gptq" and quant_method == "awq": + elif quantize == "gptq" and gptq_params.quant_method == "awq": log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) @@ -199,8 +209,11 @@ class Weights: qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) g_idx = ( - torch.arange(qweight.shape[0] * (32 // bits), device=qweight.device) - // groupsize + torch.arange( + qweight.shape[0] * (32 // gptq_params.bits), + device=qweight.device, + ) + // gptq_params.groupsize ).to(dtype=torch.int32) else: g_idx = None @@ -210,16 +223,43 @@ class Weights: qzeros=qzeros, scales=scales, g_idx=g_idx, - bits=bits, - groupsize=groupsize, + bits=gptq_params.bits, + groupsize=gptq_params.groupsize, use_exllama=False, ) elif quantize == "marlin": - from text_generation_server.layers.marlin import MarlinWeight + from text_generation_server.layers.marlin import ( + MarlinWeight, + repack_gptq_for_marlin, + ) - B = self._get_qweight(f"{prefix}.B", block_sizes) - s = self._get_qweight(f"{prefix}.s", block_sizes) - weight = MarlinWeight(B=B, s=s) + quant_method = getattr(self, "quant_method", "marlin") + if quant_method == "gptq": + gptq_params = self._get_gptq_params() + try: + qweight = self._get_qweight(f"{prefix}.qweight", block_sizes) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized" + ) + + scales = self._get_qweight(f"{prefix}.scales", block_sizes) + g_idx = self.get_tensor(f"{prefix}.g_idx") + weight = repack_gptq_for_marlin( + qweight=qweight, + scales=scales, + g_idx=g_idx, + bits=gptq_params.bits, + desc_act=gptq_params.desc_act, + groupsize=gptq_params.groupsize, + sym=gptq_params.sym, + sharded_infeatures=False, + ) + + else: + B = self._get_qweight(f"{prefix}.B", block_sizes) + s = self._get_qweight(f"{prefix}.s", block_sizes) + weight = MarlinWeight(B=B, s=s) else: slice_ = self._get_slice(f"{prefix}.weight") total_size = slice_.get_shape()[0] @@ -295,20 +335,23 @@ class Weights: [self.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1 ) - bits, groupsize, desc_act, quant_method = self._get_gptq_params() + gptq_params = self._get_gptq_params() from text_generation_server.layers.gptq import HAS_EXLLAMA use_exllama = ( - bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act + gptq_params.bits == 4 + and HAS_EXLLAMA + and quantize == "gptq" + and not gptq_params.desc_act ) - if quantize == "gptq" and quant_method == "gptq": + if quantize == "gptq" and gptq_params.quant_method == "gptq": w = [self.get_tensor(f"{p}.g_idx") for p in prefixes] for w2 in w[1:]: torch.testing.assert_close(w2, w[0]) g_idx = w[0] - elif quantize == "gptq" and quant_method == "awq": + elif quantize == "gptq" and gptq_params.quant_method == "awq": log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) @@ -322,9 +365,10 @@ class Weights: else: g_idx = ( torch.arange( - qweight.shape[0] * (32 // bits), device=qweight.device + qweight.shape[0] * (32 // gptq_params.bits), + device=qweight.device, ) - // groupsize + // gptq_params.groupsize ).to(dtype=torch.int32) else: g_idx = None @@ -334,24 +378,62 @@ class Weights: qzeros=qzeros, scales=scales, g_idx=g_idx, - bits=bits, - groupsize=groupsize, + bits=gptq_params.bits, + groupsize=gptq_params.groupsize, use_exllama=use_exllama, ) elif quantize == "marlin": - from text_generation_server.layers.marlin import MarlinWeight + from text_generation_server.layers.gptq import GPTQWeight + from text_generation_server.layers.marlin import ( + MarlinWeight, + repack_gptq_for_marlin, + ) - try: - B = torch.cat( - [self.get_sharded(f"{p}.B", dim=1) for p in prefixes], dim=1 - ) - except RuntimeError: - raise RuntimeError( - f"Cannot load `{quantize}` weight, make sure the model is already quantized" - ) - s = torch.cat([self.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1) + quant_method = getattr(self, "quant_method", "marlin") + if quant_method == "gptq": + gptq_params = self._get_gptq_params() + try: + qweight = torch.cat( + [self.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], + dim=1, + ) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized" + ) - weight = MarlinWeight(B=B, s=s) + scales = torch.cat( + [self.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1 + ) + w = [self.get_tensor(f"{p}.g_idx") for p in prefixes] + for w2 in w[1:]: + torch.testing.assert_close(w2, w[0]) + g_idx = w[0] + + weight = repack_gptq_for_marlin( + qweight=qweight, + scales=scales, + g_idx=g_idx, + bits=gptq_params.bits, + desc_act=gptq_params.desc_act, + groupsize=gptq_params.groupsize, + sym=gptq_params.sym, + sharded_infeatures=False, + ) + else: + try: + B = torch.cat( + [self.get_sharded(f"{p}.B", dim=1) for p in prefixes], dim=1 + ) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{quantize}` weight, make sure the model is already quantized" + ) + s = torch.cat( + [self.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1 + ) + + weight = MarlinWeight(B=B, s=s) else: w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes] @@ -401,12 +483,12 @@ class Weights: elif quantize == "gptq": use_exllama = True - bits, groupsize, desc_act, quant_method = self._get_gptq_params() + gptq_params = self._get_gptq_params() - if bits != 4: + if gptq_params.bits != 4: use_exllama = False - if desc_act: + if gptq_params.desc_act: log_once(logger.warning, "Disabling exllama because desc_act=True") use_exllama = False @@ -417,9 +499,9 @@ class Weights: "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`" ) - if quant_method == "gptq": + if gptq_params.quant_method == "gptq": g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0) - elif quant_method == "awq": + elif gptq_params.quant_method == "awq": g_idx = None if self.process_group.size() > 1: @@ -428,7 +510,10 @@ class Weights: not torch.equal( g_idx.cpu(), torch.tensor( - [i // groupsize for i in range(g_idx.shape[0])], + [ + i // gptq_params.groupsize + for i in range(g_idx.shape[0]) + ], dtype=torch.int32, ), ) @@ -455,7 +540,7 @@ class Weights: else: log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}") - if use_exllama and groupsize != -1: + if use_exllama and gptq_params.groupsize != -1: qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0) scales = self.get_sharded(f"{prefix}.scales", dim=0) else: @@ -465,7 +550,7 @@ class Weights: if use_exllama and g_idx is not None: g_idx = g_idx - g_idx[0] - if quant_method == "awq": + if gptq_params.quant_method == "awq": log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) @@ -479,9 +564,10 @@ class Weights: else: g_idx = ( torch.arange( - qweight.shape[0] * (32 // bits), device=qweight.device + qweight.shape[0] * (32 // gptq_params.bits), + device=qweight.device, ) - // groupsize + // gptq_params.groupsize ).to(dtype=torch.int32) weight = GPTQWeight( @@ -489,14 +575,14 @@ class Weights: qzeros=qzeros, scales=scales, g_idx=g_idx, - bits=bits, - groupsize=groupsize, + bits=gptq_params.bits, + groupsize=gptq_params.groupsize, use_exllama=use_exllama, ) elif quantize == "awq": from text_generation_server.layers.gptq import GPTQWeight - bits, groupsize, _, _ = self._get_gptq_params() + gptq_params = self._get_gptq_params() try: qweight = self.get_sharded(f"{prefix}.qweight", dim=0) @@ -515,38 +601,74 @@ class Weights: qzeros=qzeros, scales=scales, g_idx=g_idx, - bits=bits, - groupsize=groupsize, + bits=gptq_params.bits, + groupsize=gptq_params.groupsize, use_exllama=use_exllama, ) elif quantize == "marlin": - from text_generation_server.layers.marlin import MarlinWeight + from text_generation_server.layers.gptq import GPTQWeight + from text_generation_server.layers.marlin import ( + MarlinWeight, + repack_gptq_for_marlin, + ) - try: - B = self.get_sharded(f"{prefix}.B", dim=0) - except RuntimeError: - raise RuntimeError( - "Cannot load `marlin` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`" + quant_method = getattr(self, "quant_method", "marlin") + if quant_method == "gptq": + log_once(logger.info, "Converting GPTQ model to Marlin packing format.") + gptq_params = self._get_gptq_params() + + try: + qweight = self.get_sharded(f"{prefix}.qweight", dim=0) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized" + ) + + g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0) + if gptq_params.desc_act or gptq_params.groupsize == -1: + scales = self.get_tensor(f"{prefix}.scales") + else: + scales = self.get_sharded(f"{prefix}.scales", dim=0) + + sharded_in_features = self.process_group.size() > 1 + + weight = repack_gptq_for_marlin( + qweight=qweight, + scales=scales, + g_idx=g_idx, + bits=gptq_params.bits, + desc_act=gptq_params.desc_act, + groupsize=gptq_params.groupsize, + sym=gptq_params.sym, + sharded_infeatures=sharded_in_features, ) - - num_groups = self._get_slice(f"{prefix}.s").get_shape()[0] - if num_groups == 1: - # The number of groups is 1 when group_size == -1. share - # scales between all shards in this case. - s = self.get_tensor(f"{prefix}.s") else: - s = self.get_sharded(f"{prefix}.s", dim=0) - weight = MarlinWeight(B=B, s=s) + try: + B = self.get_sharded(f"{prefix}.B", dim=0) + except RuntimeError: + raise RuntimeError( + "Cannot load `marlin` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`" + ) + + num_groups = self._get_slice(f"{prefix}.s").get_shape()[0] + if num_groups == 1: + # The number of groups is 1 when groupsize == -1. share + # scales between all shards in this case. + s = self.get_tensor(f"{prefix}.s") + else: + s = self.get_sharded(f"{prefix}.s", dim=0) + weight = MarlinWeight(B=B, s=s) else: weight = self.get_sharded(f"{prefix}.weight", dim=1) return weight - def _get_gptq_params(self) -> Tuple[int, int, int, str]: + def _get_gptq_params(self) -> _GPTQParams: try: bits = self.get_tensor("gptq_bits").item() groupsize = self.get_tensor("gptq_groupsize").item() desc_act = False + sym = True quant_method = "gptq" except (SafetensorError, RuntimeError) as e: try: @@ -554,10 +676,17 @@ class Weights: groupsize = self.gptq_groupsize desc_act = getattr(self, "gptq_desc_act", False) quant_method = getattr(self, "quant_method", "gptq") + sym = getattr(self, "sym", True) except Exception: raise e - return bits, groupsize, desc_act, quant_method + return _GPTQParams( + bits=bits, + desc_act=desc_act, + groupsize=groupsize, + quant_method=quant_method, + sym=sym, + ) def _set_gptq_params(self, model_id, revision): filename = "config.json" @@ -574,6 +703,7 @@ class Weights: self.gptq_groupsize = data["quantization_config"]["group_size"] # Order is important here, desc_act is missing on some real models self.quant_method = data["quantization_config"]["quant_method"] + self.gptq_sym = data["quantization_config"]["sym"] self.gptq_desc_act = data["quantization_config"]["desc_act"] except Exception: filename = "quantize_config.json" @@ -588,6 +718,7 @@ class Weights: data = json.load(f) self.gptq_bits = data["bits"] self.gptq_groupsize = data["group_size"] + self.gptq_sym = data["sym"] self.gptq_desc_act = data["desc_act"] if "version" in data and data["version"] == "GEMM": self.quant_method = "awq" From 96b7b40ca3e39f7ca5b875bff9a4665c1b175289 Mon Sep 17 00:00:00 2001 From: Tiezhen WANG <38108242+xianbaoqian@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:59:33 +0800 Subject: [PATCH 08/15] Update the link for qwen2 (#2068) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update the link for qwen2 * Fix Qwen2 model URL in model table * Fix too eager staging --------- Co-authored-by: Daniël de Kok --- docs/source/supported_models.md | 2 +- server/text_generation_server/models/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 4b6cf731..3468e988 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -20,7 +20,7 @@ Text Generation Inference enables serving optimized models on specific hardware - [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) - [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct) - [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1) -- [Qwen 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1) +- [Qwen 2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f) - [Opt](https://huggingface.co/facebook/opt-6.7b) - [T5](https://huggingface.co/google/flan-t5-xxl) - [Galactica](https://huggingface.co/facebook/galactica-120b) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index a61cb83b..76dca3dc 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -196,7 +196,7 @@ class ModelType(enum.Enum): QWEN2 = { "type": "qwen2", "name": "Qwen 2", - "url": "https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1", + "url": "https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f", } OPT = { "type": "opt", From 445f3135048a618586082af4ab0a1ce7874d85e2 Mon Sep 17 00:00:00 2001 From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com> Date: Fri, 14 Jun 2024 15:28:34 +0200 Subject: [PATCH 09/15] Adding architecture document (#2044) * doc: adding architecture document * doc: add architecture to toctree * fix: avoid cargo lock changes * fix: avoid cargo lock tweak --------- Co-authored-by: drbh --- docs/source/_toctree.yml | 2 + docs/source/architecture.md | 227 ++++++++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 docs/source/architecture.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index a7351a33..7599562a 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,6 +17,8 @@ title: Supported Models and Hardware - local: messages_api title: Messages API + - local: architecture + title: Internal Architecture title: Getting started - sections: - local: basic_tutorials/consuming_tgi diff --git a/docs/source/architecture.md b/docs/source/architecture.md new file mode 100644 index 00000000..b7885879 --- /dev/null +++ b/docs/source/architecture.md @@ -0,0 +1,227 @@ +# Text Generation Inference Architecture + +This document aims at describing the architecture of Text Generation Inference (TGI), by describing the call flow between the separate components. + +A high-level architecture diagram can be seen here: + +![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png) + +This diagram shows well there are these separate components: + +- **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server. +- **The model server**, responsible of receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent. +- **The launcher** is a helper thar will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments. + +The router and the model server can be two different machines, they do not need to be deployed together. + +## The Router + +This component is a rust web server binary that accepts HTTP requests using the custom [HTTP API](https://huggingface.github.io/text-generation-inference/), as well as OpenAI's [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api). +The router receives the API calls and handles the "baches" logic (and introduction to batching can be found [here](https://github.com/huggingface/text-generation-inference/blob/main/router/README.md)). +It uses different strategies to reduce latency between requests and responses, especially oriented to decoding latency. It will use queues, schedulers, and block allocators to achieve that and produce batched requests that it will then be sent to the model server. + +### Router's command line + +The router command line will be the way to pass parameters to it (it does not rely on configuration file): + +``` +Text Generation Webserver + +Usage: text-generation-router [OPTIONS] + +Options: + --max-concurrent-requests + [env: MAX_CONCURRENT_REQUESTS=] [default: 128] + --max-best-of + [env: MAX_BEST_OF=] [default: 2] + --max-stop-sequences + [env: MAX_STOP_SEQUENCES=] [default: 4] + --max-top-n-tokens + [env: MAX_TOP_N_TOKENS=] [default: 5] + --max-input-tokens + [env: MAX_INPUT_TOKENS=] [default: 1024] + --max-total-tokens + [env: MAX_TOTAL_TOKENS=] [default: 2048] + --waiting-served-ratio + [env: WAITING_SERVED_RATIO=] [default: 1.2] + --max-batch-prefill-tokens + [env: MAX_BATCH_PREFILL_TOKENS=] [default: 4096] + --max-batch-total-tokens + [env: MAX_BATCH_TOTAL_TOKENS=] + --max-waiting-tokens + [env: MAX_WAITING_TOKENS=] [default: 20] + --max-batch-size + [env: MAX_BATCH_SIZE=] + --hostname + [env: HOSTNAME=] [default: 0.0.0.0] + -p, --port + [env: PORT=] [default: 3000] + --master-shard-uds-path + [env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0] + --tokenizer-name + [env: TOKENIZER_NAME=] [default: bigscience/bloom] + --tokenizer-config-path + [env: TOKENIZER_CONFIG_PATH=] + --revision + [env: REVISION=] + --validation-workers + [env: VALIDATION_WORKERS=] [default: 2] + --json-output + [env: JSON_OUTPUT=] + --otlp-endpoint + [env: OTLP_ENDPOINT=] + --cors-allow-origin + [env: CORS_ALLOW_ORIGIN=] + --ngrok + [env: NGROK=] + --ngrok-authtoken + [env: NGROK_AUTHTOKEN=] + --ngrok-edge + [env: NGROK_EDGE=] + --messages-api-enabled + [env: MESSAGES_API_ENABLED=] + --disable-grammar-support + [env: DISABLE_GRAMMAR_SUPPORT=] + --max-client-batch-size + [env: MAX_CLIENT_BATCH_SIZE=] [default: 4] + -h, --help + Print help + -V, --version + Print version +``` + +## The Model Server + +The model server is a python server, capable of starting a server waiting for gRPC requests, loads a given model, perform sharding to provide [tensor parallelism](https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism), and stays alive while waiting for new requests. +The model server supports models instantiated using Pytorch and optimized for inference mainly on CUDA/ROCM. + +### Model Server Variants + +Several variants of the model server exist that are actively supported by Hugging Face: + +- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference). +- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ. +- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi). +- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference). +- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference). + +Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations. + +### Command Line Interface + +The official command line interface (CLI) for the server supports three subcommands, `download-weights`, `quantize` and `serve`: + +- `download-weights` will download weights from the hub and, in some variants it will convert weights to a format that is adapted to the given implementation; +- `quantize` will allow to quantize a model using the `qptq` package. This feature is not available nor supported on all variants; +- `serve` will start the server that load a model (or a model shard), receives gRPC calls from the router, performs an inference and provides a formatted response to the given request. + +Serve's command line parameters on the TGI repository are these: + +``` + Usage: cli.py serve [OPTIONS] MODEL_ID + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * model_id TEXT [default: None] [required] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --revision TEXT [default: None] │ +│ --sharded --no-sharded [default: no-sharded] │ +│ --quantize [bitsandbytes|bitsandbytes [default: None] │ +│ -nf4|bitsandbytes-fp4|gptq │ +│ |awq|eetq|exl2|fp8] │ +│ --speculate INTEGER [default: None] │ +│ --dtype [float16|bfloat16] [default: None] │ +│ --trust-remote-code --no-trust-remote-code [default: │ +│ no-trust-remote-code] │ +│ --uds-path PATH [default: │ +│ /tmp/text-generation-serve… │ +│ --logger-level TEXT [default: INFO] │ +│ --json-output --no-json-output [default: no-json-output] │ +│ --otlp-endpoint TEXT [default: None] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +Note that some variants might support different parameters, and they could possibly accept more options that can be passed on using environment variables. + +## Call Flow + +Once both components are initialized, weights downloaded and model server is up and running, router and model server exchange data and info through the gRPC call. There are currently two supported schemas, [v2](https://github.com/huggingface/text-generation-inference/blob/main/proto/generate.proto) and [v3](https://github.com/huggingface/text-generation-inference/blob/main/proto/v3/generate.proto). These two versions are almost identical, except for: + +- input chunks support, for text and image data, +- paged attention support + +Here's a diagram that displays the exchanges that follow the router and model server startup. + +```mermaid +sequenceDiagram + + Router->>Model Server: service discovery + Model Server-->>Router: urls for other shards + + Router->>Model Server: get model info + Model Server-->>Router: shard info + + Router->>Model Server: health check + Model Server-->>Router: health OK + + Router->>Model Server: warmup(max_input_tokens, max_batch_prefill_tokens, max_total_tokens, max_batch_size) + Model Server-->>Router: warmup result +``` + +After these are done, the router is ready to receive generate calls from multiple clients. Here's an example. + +```mermaid +sequenceDiagram + participant Client 1 + participant Client 2 + participant Client 3 + participant Router + participant Model Server + + Client 1->>Router: generate_stream + Router->>Model Server: prefill(batch1) + Model Server-->>Router: generations, cached_batch1, timings + Router-->>Client 1: token 1 + + Router->>Model Server: decode(cached_batch1) + Model Server-->>Router: generations, cached_batch1, timings + Router-->>Client 1: token 2 + + Router->>Model Server: decode(cached_batch1) + Model Server-->>Router: generations, cached_batch1, timings + Router-->>Client 1: token 3 + + Client 2->>Router: generate_stream + Router->>Model Server: prefill(batch2) + Note right of Model Server: This stops previous batch, that is restarted + Model Server-->>Router: generations, cached_batch2, timings + Router-->>Client 2: token 1' + + Router->>Model Server: decode(cached_batch1, cached_batch2) + Model Server-->>Router: generations, cached_batch1, timings + Router-->>Client 1: token 4 + Router-->>Client 2: token 2' + + Note left of Client 1: Client 1 leaves + Router->>Model Server: filter_batch(cached_batch1, request_ids_to_keep=batch2) + Model Server-->>Router: filtered batch + + Router->>Model Server: decode(cached_batch2) + Model Server-->>Router: generations, cached_batch2, timings + Router-->>Client 2: token 3' + + Client 3->>Router: generate_stream + Note right of Model Server: This stops previous batch, that is restarted + Router->>Model Server: prefill(batch3) + Note left of Client 1: Client 3 leaves without receiving any batch + Router->>Model Server: clear_cache(batch3) + Note right of Model Server: This stops previous batch, that is restarted + + Router->>Model Server: decode(cached_batch3) + Note right of Model Server: Last token (stopping criteria) + Model Server-->>Router: generations, cached_batch3, timings + Router-->>Client 2: token 4' + + +``` From e903770897ae80f9b9ea02ba02eac4c680fd6202 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 17 Jun 2024 10:49:41 +0200 Subject: [PATCH 10/15] Support different image sizes in prefill in VLMs (#2065) When a batch contained images if different sizes during prefill, the server would fail (see e.g. #2056). Images were processed separately and then concatenated. However, this can fail for images with different sizes. Fix this by preprocessing all images in the batch together, so that the image processor can ensure that all image tensors have compatible sizes. --- .../test_flash_pali_gemma_two_images.json | 61 ++++++++ .../test_idefics/test_idefics_two_images.json | 85 +++++++++++ .../test_flash_idefics2_two_images.json | 133 ++++++++++++++++++ .../models/test_flash_pali_gemma.py | 23 +++ integration-tests/models/test_idefics.py | 21 +++ integration-tests/models/test_idefics2.py | 23 +++ .../models/vlm_causal_lm.py | 57 ++++---- 7 files changed, 376 insertions(+), 27 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json create mode 100644 integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json create mode 100644 integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json diff --git a/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json new file mode 100644 index 00000000..ab4f3015 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json @@ -0,0 +1,61 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 8, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 2502, + "logprob": -1.734375, + "special": false, + "text": "image" + }, + { + "id": 2196, + "logprob": -0.5756836, + "special": false, + "text": " result" + }, + { + "id": 604, + "logprob": -0.007843018, + "special": false, + "text": " for" + }, + { + "id": 12254, + "logprob": -1.7167969, + "special": false, + "text": " chicken" + }, + { + "id": 611, + "logprob": -0.17053223, + "special": false, + "text": " on" + }, + { + "id": 573, + "logprob": -0.7626953, + "special": false, + "text": " the" + }, + { + "id": 8318, + "logprob": -0.02709961, + "special": false, + "text": " beach" + }, + { + "id": 1, + "logprob": -0.20739746, + "special": true, + "text": "" + } + ], + "top_tokens": null + }, + "generated_text": "image result for chicken on the beach" +} diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json new file mode 100644 index 00000000..a4727707 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json @@ -0,0 +1,85 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 12, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 450, + "logprob": -0.26342773, + "special": false, + "text": " The" + }, + { + "id": 21282, + "logprob": -0.01838684, + "special": false, + "text": " cow" + }, + { + "id": 322, + "logprob": -0.18041992, + "special": false, + "text": " and" + }, + { + "id": 521, + "logprob": -0.62841797, + "special": false, + "text": " ch" + }, + { + "id": 21475, + "logprob": -0.0037956238, + "special": false, + "text": "icken" + }, + { + "id": 526, + "logprob": -0.018737793, + "special": false, + "text": " are" + }, + { + "id": 373, + "logprob": -1.0820312, + "special": false, + "text": " on" + }, + { + "id": 263, + "logprob": -0.5083008, + "special": false, + "text": " a" + }, + { + "id": 25695, + "logprob": -0.07128906, + "special": false, + "text": " beach" + }, + { + "id": 29889, + "logprob": -0.12573242, + "special": false, + "text": "." + }, + { + "id": 32002, + "logprob": -0.0029792786, + "special": true, + "text": "" + }, + { + "id": 2, + "logprob": -0.00024962425, + "special": true, + "text": "" + } + ], + "top_tokens": null + }, + "generated_text": " The cow and chicken are on a beach." +} diff --git a/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json new file mode 100644 index 00000000..86c95b29 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json @@ -0,0 +1,133 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 20, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 415, + "logprob": -0.04421997, + "special": false, + "text": " The" + }, + { + "id": 12072, + "logprob": -0.13500977, + "special": false, + "text": " cow" + }, + { + "id": 349, + "logprob": -0.06750488, + "special": false, + "text": " is" + }, + { + "id": 6328, + "logprob": -0.6352539, + "special": false, + "text": " standing" + }, + { + "id": 356, + "logprob": -0.16186523, + "special": false, + "text": " on" + }, + { + "id": 272, + "logprob": -0.5078125, + "special": false, + "text": " the" + }, + { + "id": 10305, + "logprob": -0.017913818, + "special": false, + "text": " beach" + }, + { + "id": 304, + "logprob": -1.5205078, + "special": false, + "text": " and" + }, + { + "id": 272, + "logprob": -0.029174805, + "special": false, + "text": " the" + }, + { + "id": 13088, + "logprob": -0.003479004, + "special": false, + "text": " chicken" + }, + { + "id": 349, + "logprob": -0.0035095215, + "special": false, + "text": " is" + }, + { + "id": 6398, + "logprob": -0.3088379, + "special": false, + "text": " sitting" + }, + { + "id": 356, + "logprob": -0.027755737, + "special": false, + "text": " on" + }, + { + "id": 264, + "logprob": -0.31884766, + "special": false, + "text": " a" + }, + { + "id": 17972, + "logprob": -0.047943115, + "special": false, + "text": " pile" + }, + { + "id": 302, + "logprob": -0.0002925396, + "special": false, + "text": " of" + }, + { + "id": 2445, + "logprob": -0.02935791, + "special": false, + "text": " money" + }, + { + "id": 28723, + "logprob": -0.031219482, + "special": false, + "text": "." + }, + { + "id": 32002, + "logprob": -0.00034475327, + "special": true, + "text": "" + }, + { + "id": 2, + "logprob": -1.1920929e-07, + "special": true, + "text": "" + } + ], + "top_tokens": null + }, + "generated_text": " The cow is standing on the beach and the chicken is sitting on a pile of money." +} diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py index d4e83c9f..6be1750c 100644 --- a/integration-tests/models/test_flash_pali_gemma.py +++ b/integration-tests/models/test_flash_pali_gemma.py @@ -22,6 +22,12 @@ async def flash_pali_gemma(flash_pali_gemma_handle): return flash_pali_gemma_handle.client +def get_chicken(): + with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" + + def get_cow_beach(): with open("integration-tests/images/cow_beach.png", "rb") as image_file: encoded_string = base64.b64encode(image_file.read()) @@ -37,3 +43,20 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): assert response.generated_text == "beach" assert response == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot): + chicken = get_chicken() + cow_beach = get_cow_beach() + response = await flash_pali_gemma.generate( + f"caption![]({chicken})![]({cow_beach})\n", + max_new_tokens=20, + ) + # Is PaliGemma not able to handle two separate images? At least we + # get output showing that both images are used. + assert ( + response.generated_text == "image result for chicken on the beach" + ), f"{repr(response.generated_text)}" + assert response == response_snapshot diff --git a/integration-tests/models/test_idefics.py b/integration-tests/models/test_idefics.py index aeeaffa1..ac807b76 100644 --- a/integration-tests/models/test_idefics.py +++ b/integration-tests/models/test_idefics.py @@ -23,6 +23,12 @@ def get_chicken(): return f"data:image/png;base64,{encoded_string.decode('utf-8')}" +def get_cow_beach(): + with open("integration-tests/images/cow_beach.png", "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" + + @pytest.mark.asyncio async def test_idefics(idefics, response_snapshot): chicken = get_chicken() @@ -39,6 +45,21 @@ async def test_idefics(idefics, response_snapshot): assert response == response_snapshot +@pytest.mark.asyncio +@pytest.mark.private +async def test_idefics_two_images(idefics, response_snapshot): + chicken = get_chicken() + cow_beach = get_cow_beach() + response = await idefics.generate( + f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken? \nAssistant:", + max_new_tokens=20, + ) + assert ( + response.generated_text == " The cow and chicken are on a beach." + ), f"{repr(response.generated_text)}" + assert response == response_snapshot + + @pytest.mark.asyncio async def test_idefics_load(idefics, generate_load, response_snapshot): chicken = get_chicken() diff --git a/integration-tests/models/test_idefics2.py b/integration-tests/models/test_idefics2.py index d34cce34..9aaf6d8a 100644 --- a/integration-tests/models/test_idefics2.py +++ b/integration-tests/models/test_idefics2.py @@ -9,6 +9,12 @@ def get_chicken(): return f"data:image/png;base64,{encoded_string.decode('utf-8')}" +def get_cow_beach(): + with open("integration-tests/images/cow_beach.png", "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" + + @pytest.fixture(scope="module") def flash_idefics2_next_handle(launcher): with launcher( @@ -38,6 +44,23 @@ async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot assert response == response_snapshot +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot): + chicken = get_chicken() + cow_beach = get_cow_beach() + response = await flash_idefics2_next.generate( + f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken? \nAssistant:", + max_new_tokens=20, + ) + assert ( + response.generated_text + == " The cow is standing on the beach and the chicken is sitting on a pile of money." + ), f"{repr(response.generated_text)}" + assert response.details.generated_tokens == 20 + assert response == response_snapshot + + @pytest.mark.asyncio @pytest.mark.private async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snapshot): diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 59a6fab1..8b5819d1 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -53,7 +53,9 @@ def image_text_replacement(image_input, config, image_id) -> str: num_features = get_number_of_features(height, width, config) from loguru import logger - logger.info(f"Found {num_features} in image of resolution {height}x{width}") + logger.info( + f"Found {num_features} features in image of resolution {height}x{width}" + ) return "" * num_features elif config.model_type == "paligemma": @@ -133,23 +135,41 @@ class VlmCausalLMBatch(FlashCausalLMBatch): def batch_tokenized_inputs( cls, requests: Iterable[generate_pb2.Request], tokenizer, processor, config ): + # Process images first. We need all of them so that the processor + # can make the image splits the same size. And we need the final + # sizes to insert correct number of image tokens. + images = [] + for r in requests: + for chunk in r.input_chunks.chunks: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + pass + elif chunk_type == "image": + image = Image.open(BytesIO(chunk.image.data)) + if config.model_type == "llava_next": + images.append(image) + else: + images.append([image]) + else: + raise RuntimeError(f"Invalid chunk type {chunk_type}") + + if images: + image_inputs = processor.image_processor(images, return_tensors="pt") + else: + image_inputs = None + batch_inputs = [] - image_inputs = [] max_truncation = 0 + image_id = 0 for r in requests: full_text = "" - image_id = 0 for chunk in r.input_chunks.chunks: chunk_type = chunk.WhichOneof("chunk") if chunk_type == "text": full_text += chunk.text elif chunk_type == "image": - image = Image.open(BytesIO(chunk.image.data)) - image_input = processor.image_processor(image, return_tensors="pt") - full_text += image_text_replacement(image_input, config, image_id) - image_inputs.append(image_input) - else: - raise RuntimeError(f"Invalid chunk type {chunk_type}") + full_text += image_text_replacement(image_inputs, config, image_id) + image_id += 1 batch_inputs.append(full_text) max_truncation = max(max_truncation, r.truncate) @@ -160,24 +180,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch): max_length=max_truncation, add_special_tokens=not config.model_type == "paligemma", )["input_ids"] - if image_inputs: - image_input = image_inputs[0] - new_image_inputs = { - "pixel_values": torch.cat( - [img["pixel_values"] for img in image_inputs], dim=0 - ), - } - if "pixel_attention_mask" in image_input: - new_image_inputs["pixel_attention_mask"] = torch.cat( - [img["pixel_attention_mask"] for img in image_inputs], dim=0 - ) - if "image_sizes" in image_input: - new_image_inputs["image_sizes"] = torch.cat( - [img["image_sizes"] for img in image_inputs], dim=0 - ) - image_inputs = new_image_inputs - else: - image_inputs = None + return batch_tokenized_inputs, image_inputs @classmethod From 131838919e680f4ed0519786e8fc2a9baf182802 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 17 Jun 2024 12:09:31 +0200 Subject: [PATCH 11/15] Contributing guide & Code of Conduct (#2074) * Contributing guide & Code of Conduct * Redirect to GitHub's tutorial on PRs --- CODE_OF_CONDUCT.md | 133 +++++++++++++++++++++++++++++++++++++++++++++ CONTRIBUTING.md | 120 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+) create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..ef09fa13 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,133 @@ + +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +feedback@huggingface.co. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..39b57c19 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,120 @@ + + +# Contribute to text-generation-inference + +Everyone is welcome to contribute, and we value everybody's contribution. Code +contributions are not the only way to help the community. Answering questions, helping +others, and improving the documentation are also immensely valuable. + +It also helps us if you spread the word! Reference the library in blog posts +about the awesome projects it made possible, shout out on Twitter every time it has +helped you, or simply ⭐️ the repository to say thank you. + +However you choose to contribute, please be mindful and respect our +[code of conduct](https://github.com/huggingface/text-generation-inference/blob/main/CODE_OF_CONDUCT.md). + +**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).** + +## Ways to contribute + +There are several ways you can contribute to text-generation-inference. + +* Fix outstanding issues with the existing code. +* Submit issues related to bugs or desired new features. +* Contribute to the examples or to the documentation. + +> All contributions are equally valuable to the community. 🥰 + +## Fixing outstanding issues + +If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) and open +a Pull Request! + +## Submitting a bug-related issue or feature request + +Do your best to follow these guidelines when submitting a bug-related issue or a feature +request. It will make it easier for us to come back to you quickly and with good +feedback. + +### Did you find a bug? + +The text-generation-inference library is robust and reliable thanks to users who report the problems they encounter. + +Before you report an issue, we would really appreciate it if you could **make sure the bug was not +already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the +library itself, and not your code. + +Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so +we can quickly resolve it: + +* Your **OS type and version**, as well as your environment versions (versions of rust, python, and dependencies). +* A short, self-contained, code snippet that allows us to reproduce the bug. +* The *full* traceback if an exception is raised. +* Attach any other additional information, like screenshots, you think may help. + +To get the OS and software versions automatically, you can re-run the launcher with the `--env` flag: + +```bash +text-generation-launcher --env +``` + +This will precede the launch of the model with the information relative to your environment. We recommend pasting +that in your issue report. + +### Do you want a new feature? + +If there is a new feature you'd like to see in text-generation-inference, please open an issue and describe: + +1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it + a feature related to something you need for a project? Is it something you worked on and think it could benefit + the community? + + Whatever it is, we'd love to hear about it! + +2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better + we'll be able to help you. +3. Provide a *code snippet* that demonstrates the feature's usage. +4. If the feature is related to a paper, please include a link. + +If your issue is well written we're already 80% of the way there by the time you create it. + +We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE) +to help you get started with your issue. + +## Do you want to implement a new model? + +New models are constantly released and if you want to implement a new model, please provide the following information: + +* A short description of the model and a link to the paper. +* Link to the implementation if it is open-sourced. +* Link to the model weights if they are available. + +If you are willing to contribute the model yourself, let us know so we can help you add it to text-generation-inference! + +## Do you want to add documentation? + +We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know +how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be +happy to make the changes or help you make a contribution if you're interested! + +## I want to become a maintainer of the project. How do I get there? + +TGI is a project led and managed by Hugging Face as it powers our internal services. However, we are happy to have +motivated individuals from other organizations join us as maintainers with the goal of making TGI the best inference +service. + +If you are such an individual (or organization), please reach out to us and let's collaborate. \ No newline at end of file From 0f7d38e774aff78d41bd63c4baaa7a96f9320c0b Mon Sep 17 00:00:00 2001 From: Ziru Niu Date: Mon, 17 Jun 2024 18:10:01 +0800 Subject: [PATCH 12/15] fix build.rs watch files (#2072) --- router/client/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router/client/build.rs b/router/client/build.rs index a7ade9b0..210cd603 100644 --- a/router/client/build.rs +++ b/router/client/build.rs @@ -1,7 +1,7 @@ use std::fs; fn main() -> Result<(), Box> { - println!("cargo:rerun-if-changed=../../proto/**"); + println!("cargo:rerun-if-changed=../../proto/"); fs::create_dir_all("src/v2/pb").unwrap_or(()); let mut config = prost_build::Config::new(); From c8c7ccd31e1e760d216c9d2f2b17b0d984ed033b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 17 Jun 2024 16:40:44 +0200 Subject: [PATCH 13/15] Set maximum grpc message receive size to 2GiB (#2075) * Set maximum grpc message receive size to 2GiB The previous default was 4MiB, which doesn't really work well for multi-modal models. * Update to Rust 1.79.0 * Fixup formatting to make PR pass --- .github/workflows/tests.yaml | 6 +++--- CODE_OF_CONDUCT.md | 2 +- CONTRIBUTING.md | 22 +++++++++++----------- Dockerfile | 2 +- Dockerfile_amd | 2 +- Dockerfile_intel | 2 +- benchmark/src/app.rs | 12 ++++++------ benchmark/src/table.rs | 6 +++--- benchmark/src/utils.rs | 2 +- rust-toolchain.toml | 6 +++--- server/text_generation_server/server.py | 6 +++++- 11 files changed, 36 insertions(+), 32 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 74479cc6..83fff196 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -33,9 +33,9 @@ jobs: - name: Install Rust uses: actions-rs/toolchain@v1 with: - # Released on: 02 May, 2024 - # https://releases.rs/docs/1.78.0/ - toolchain: 1.78.0 + # Released on: June 13, 2024 + # https://releases.rs/docs/1.79.0/ + toolchain: 1.79.0 override: true components: rustfmt, clippy - name: Install Protoc diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index ef09fa13..b23f3150 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -130,4 +130,4 @@ For answers to common questions about this code of conduct, see the FAQ at [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html [Mozilla CoC]: https://github.com/mozilla/diversity [FAQ]: https://www.contributor-covenant.org/faq -[translations]: https://www.contributor-covenant.org/translations \ No newline at end of file +[translations]: https://www.contributor-covenant.org/translations diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 39b57c19..d541e47f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -55,10 +55,10 @@ feedback. The text-generation-inference library is robust and reliable thanks to users who report the problems they encounter. Before you report an issue, we would really appreciate it if you could **make sure the bug was not -already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the -library itself, and not your code. +already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the +library itself, and not your code. -Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so +Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it: * Your **OS type and version**, as well as your environment versions (versions of rust, python, and dependencies). @@ -79,20 +79,20 @@ that in your issue report. If there is a new feature you'd like to see in text-generation-inference, please open an issue and describe: -1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it - a feature related to something you need for a project? Is it something you worked on and think it could benefit +1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it + a feature related to something you need for a project? Is it something you worked on and think it could benefit the community? Whatever it is, we'd love to hear about it! -2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better +2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better we'll be able to help you. 3. Provide a *code snippet* that demonstrates the feature's usage. 4. If the feature is related to a paper, please include a link. If your issue is well written we're already 80% of the way there by the time you create it. -We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE) +We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE) to help you get started with your issue. ## Do you want to implement a new model? @@ -107,14 +107,14 @@ If you are willing to contribute the model yourself, let us know so we can help ## Do you want to add documentation? -We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know -how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be +We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know +how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be happy to make the changes or help you make a contribution if you're interested! ## I want to become a maintainer of the project. How do I get there? TGI is a project led and managed by Hugging Face as it powers our internal services. However, we are happy to have motivated individuals from other organizations join us as maintainers with the goal of making TGI the best inference -service. +service. -If you are such an individual (or organization), please reach out to us and let's collaborate. \ No newline at end of file +If you are such an individual (or organization), please reach out to us and let's collaborate. diff --git a/Dockerfile b/Dockerfile index 14628339..c93372a2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_amd b/Dockerfile_amd index c79bc03c..55da9204 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_intel b/Dockerfile_intel index cb0e84bb..35362fc9 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,4 +1,4 @@ -FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/benchmark/src/app.rs b/benchmark/src/app.rs index 48ac976a..a0a9313a 100644 --- a/benchmark/src/app.rs +++ b/benchmark/src/app.rs @@ -497,7 +497,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec> { "Lowest: {:.2} {unit}", data.iter() .min_by(|a, b| a.total_cmp(b)) - .unwrap_or(&std::f64::NAN) + .unwrap_or(&f64::NAN) ), Style::default().fg(Color::Reset), )]), @@ -506,7 +506,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec> { "Highest: {:.2} {unit}", data.iter() .max_by(|a, b| a.total_cmp(b)) - .unwrap_or(&std::f64::NAN) + .unwrap_or(&f64::NAN) ), Style::default().fg(Color::Reset), )]), @@ -555,17 +555,17 @@ fn latency_throughput_chart<'a>( let min_latency: f64 = *latency_iter .clone() .min_by(|a, b| a.total_cmp(b)) - .unwrap_or(&std::f64::NAN); + .unwrap_or(&f64::NAN); let max_latency: f64 = *latency_iter .max_by(|a, b| a.total_cmp(b)) - .unwrap_or(&std::f64::NAN); + .unwrap_or(&f64::NAN); let min_throughput: f64 = *throughput_iter .clone() .min_by(|a, b| a.total_cmp(b)) - .unwrap_or(&std::f64::NAN); + .unwrap_or(&f64::NAN); let max_throughput: f64 = *throughput_iter .max_by(|a, b| a.total_cmp(b)) - .unwrap_or(&std::f64::NAN); + .unwrap_or(&f64::NAN); // Char min max values let min_x = if zoom { diff --git a/benchmark/src/table.rs b/benchmark/src/table.rs index e18d7310..1585a25f 100644 --- a/benchmark/src/table.rs +++ b/benchmark/src/table.rs @@ -156,17 +156,17 @@ fn avg_min_max(data: &[f64]) -> (f64, f64, f64) { let min = data .iter() .min_by(|a, b| a.total_cmp(b)) - .unwrap_or(&std::f64::NAN); + .unwrap_or(&f64::NAN); let max = data .iter() .max_by(|a, b| a.total_cmp(b)) - .unwrap_or(&std::f64::NAN); + .unwrap_or(&f64::NAN); (average, *min, *max) } fn px(data: &[f64], p: u32) -> f64 { let i = (f64::from(p) / 100.0 * data.len() as f64) as usize; - *data.get(i).unwrap_or(&std::f64::NAN) + *data.get(i).unwrap_or(&f64::NAN) } fn format_value(value: f64, unit: &'static str) -> String { diff --git a/benchmark/src/utils.rs b/benchmark/src/utils.rs index d096d655..20469991 100644 --- a/benchmark/src/utils.rs +++ b/benchmark/src/utils.rs @@ -37,7 +37,7 @@ pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap Date: Tue, 18 Jun 2024 09:13:04 +0200 Subject: [PATCH 14/15] CI: Tailscale improvements (#2079) * test local tailscale * Update build.yaml * Update build.yaml * Update build.yaml * Update build.yaml * wait for ssh * network host * change step order --- .github/workflows/build.yaml | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e80037b1..ad1377a2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -51,16 +51,19 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 - - name: Initialize Docker Buildx - uses: docker/setup-buildx-action@v2.0.0 - with: - install: true + - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4.4.1 - name: Tailscale uses: huggingface/tailscale-action@main with: authkey: ${{ secrets.TAILSCALE_AUTHKEY }} + slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} + slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + - name: Initialize Docker Buildx + uses: docker/setup-buildx-action@v2.0.0 + with: + install: true - name: Login to GitHub Container Registry if: github.event_name != 'pull_request' uses: docker/login-action@v2 @@ -121,6 +124,7 @@ jobs: DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ matrix.label }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} + network: host cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache${{ matrix.label }},mode=min cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache${{ matrix.label }},mode=min - name: Set up Python @@ -139,3 +143,8 @@ jobs: export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }} export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} pytest -s -vv integration-tests + - name: Tailscale Wait + if: ${{ failure() || runner.debug == '1' }} + uses: huggingface/tailscale-action@main + with: + waitForSSH: true From 11ea9ce002e796cc59714950b557b4021cbebc58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 18 Jun 2024 09:38:21 +0200 Subject: [PATCH 15/15] CI: pass pre-commit hooks again (#2084) --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ad1377a2..8c407e81 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -51,7 +51,7 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 - + - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4.4.1 - name: Tailscale