Make GPTQ test less flaky (#1295)
# What does this PR do?

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR: @OlivierDehaene OR @Narsil.
Parent: ba552e1a82 · Commit: 624800c4de
`integration-tests/conftest.py`:

```diff
@@ -24,6 +24,7 @@ DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
 
 
 class ResponseComparator(JSONSnapshotExtension):
+    rtol = 0.2
     def serialize(
         self,
         data,
@@ -58,7 +59,7 @@ class ResponseComparator(JSONSnapshotExtension):
             return (
                 token.id == other.id
                 and token.text == other.text
-                and math.isclose(token.logprob, other.logprob, rel_tol=0.2)
+                and math.isclose(token.logprob, other.logprob, rel_tol=self.rtol)
                 and token.special == other.special
             )
 
@@ -68,7 +69,7 @@ class ResponseComparator(JSONSnapshotExtension):
                 prefill_token.id == other.id
                 and prefill_token.text == other.text
                 and (
-                    math.isclose(prefill_token.logprob, other.logprob, rel_tol=0.2)
+                    math.isclose(prefill_token.logprob, other.logprob, rel_tol=self.rtol)
                     if prefill_token.logprob is not None
                     else prefill_token.logprob == other.logprob
                 )
@@ -148,6 +149,10 @@ class ResponseComparator(JSONSnapshotExtension):
         )
 
 
+class GenerousResponseComparator(ResponseComparator):
+    # Needed for GPTQ with exllama which has serious numerical fluctuations.
+    rtol = 0.75
+
 class LauncherHandle:
     def __init__(self, port: int):
         self.client = AsyncClient(f"http://localhost:{port}")
@@ -193,6 +198,10 @@ class ProcessLauncherHandle(LauncherHandle):
 def response_snapshot(snapshot):
     return snapshot.use_extension(ResponseComparator)
 
 
+@pytest.fixture
+def generous_response_snapshot(snapshot):
+    return snapshot.use_extension(GenerousResponseComparator)
+
 @pytest.fixture(scope="module")
 def event_loop():
```
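The key change in `conftest.py` is that the hard-coded `rel_tol=0.2` becomes a class attribute, so `GenerousResponseComparator` can relax the tolerance without duplicating the comparison logic. Below is a minimal standalone sketch of that effect; the class names are simplified and the logprob values are illustrative, not taken from the real snapshots:

```python
import math


class Comparator:
    # Default tolerance used by the base snapshot comparator.
    rtol = 0.2


class GenerousComparator(Comparator):
    # GPTQ with exllama shows large run-to-run numerical fluctuations,
    # so this variant accepts much larger relative drift in logprobs.
    rtol = 0.75


def logprobs_close(comparator: Comparator, a: float, b: float) -> bool:
    # Mirrors the `math.isclose(..., rel_tol=self.rtol)` check in the diff.
    return math.isclose(a, b, rel_tol=comparator.rtol)


# Illustrative values: a ~30% relative difference in logprob fails the
# strict comparator but passes the generous one.
print(logprobs_close(Comparator(), -1.0, -1.3))          # False
print(logprobs_close(GenerousComparator(), -1.0, -1.3))  # True
```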
@ -15,20 +15,20 @@ async def flash_starcoder_gptq(flash_starcoder_gptq_handle):
|
|||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_flash_starcoder_gptq(flash_starcoder_gptq, response_snapshot):
|
||||
async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_response_snapshot):
|
||||
response = await flash_starcoder_gptq.generate(
|
||||
"def geometric_mean(L: List[float]):",
|
||||
max_new_tokens=20,
|
||||
decoder_input_details=True,
|
||||
)
|
||||
assert response.details.generated_tokens == 20
|
||||
assert response == response_snapshot
|
||||
assert response == generous_response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_flash_starcoder_gptq_default_params(
|
||||
flash_starcoder_gptq, response_snapshot
|
||||
flash_starcoder_gptq, generous_response_snapshot
|
||||
):
|
||||
response = await flash_starcoder_gptq.generate(
|
||||
"def geometric_mean(L: List[float]):",
|
||||
|
@ -39,13 +39,13 @@ async def test_flash_starcoder_gptq_default_params(
|
|||
seed=0,
|
||||
)
|
||||
assert response.details.generated_tokens == 20
|
||||
assert response == response_snapshot
|
||||
assert response == generous_response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_flash_starcoder_gptq_load(
|
||||
flash_starcoder_gptq, generate_load, response_snapshot
|
||||
flash_starcoder_gptq, generate_load, generous_response_snapshot
|
||||
):
|
||||
responses = await generate_load(
|
||||
flash_starcoder_gptq,
|
||||
|
@ -57,4 +57,4 @@ async def test_flash_starcoder_gptq_load(
|
|||
assert len(responses) == 4
|
||||
assert all([r.generated_text == responses[0].generated_text for r in responses])
|
||||
|
||||
assert responses == response_snapshot
|
||||
assert responses == generous_response_snapshot
|
||||
|
|
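With the new `generous_response_snapshot` fixture available, any other flaky quantized-model test could opt into the looser tolerance the same way, simply by requesting it instead of `response_snapshot`. A hypothetical sketch (the `flash_llama_gptq` fixture is illustrative only and not part of this PR):

```python
import pytest


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq(flash_llama_gptq, generous_response_snapshot):
    # `flash_llama_gptq` is a hypothetical model fixture used only for illustration.
    response = await flash_llama_gptq.generate(
        "def geometric_mean(L: List[float]):",
        max_new_tokens=20,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 20
    # Compared with rel_tol=0.75 via GenerousResponseComparator.
    assert response == generous_response_snapshot
```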