fix(server): fix init for flash causal lm (#352)

Fixes #347
2023-05-22 15:05:32 +02:00 · 2023-05-22 15:05:32 +02:00 · 91d9beec90
parent e649bf9a55
commit 91d9beec90
12 changed files with 1110 additions and 495 deletions
--- a/integration-tests/models/snapshots/test_flash_neox/test_flash_neox.json
+++ b/integration-tests/models/snapshots/test_flash_neox/test_flash_neox.json
@ -7,157 +7,107 @@
      {
        "id": 50278,
        "logprob": null,
-        "text": "<|prompter|>"
+        "text": "<|USER|>"
      },
      {
        "id": 1276,
-        "logprob": -8.03125,
+        "logprob": -4.5546875,
        "text": "What"
      },
-      {
-        "id": 310,
-        "logprob": -5.421875,
-        "text": " is"
-      },
-      {
-        "id": 247,
-        "logprob": -2.1601562,
-        "text": " a"
-      },
-      {
-        "id": 1167,
-        "logprob": -5.4609375,
-        "text": " mem"
-      },
-      {
-        "id": 70,
-        "logprob": -0.005657196,
-        "text": "e"
-      },
-      {
-        "id": 13,
-        "logprob": -7.28125,
-        "text": ","
-      },
-      {
-        "id": 285,
-        "logprob": -0.2980957,
-        "text": " and"
-      },
-      {
-        "id": 752,
-        "logprob": -2.1679688,
-        "text": " what"
-      },
      {
        "id": 434,
-        "logprob": -5.6210938,
+        "logprob": -4.234375,
        "text": "'s"
      },
      {
-        "id": 253,
-        "logprob": -0.81103516,
-        "text": " the"
+        "id": 634,
+        "logprob": -5.1054688,
+        "text": " your"
      },
      {
-        "id": 2892,
-        "logprob": -6.6640625,
-        "text": " history"
+        "id": 12315,
+        "logprob": -9.953125,
+        "text": " mood"
      },
      {
-        "id": 3212,
-        "logprob": -2.265625,
-        "text": " behind"
-      },
-      {
-        "id": 436,
-        "logprob": -11.5078125,
-        "text": " this"
-      },
-      {
-        "id": 3159,
-        "logprob": -2.1582031,
-        "text": " word"
+        "id": 3063,
+        "logprob": -4.0820312,
+        "text": " today"
      },
      {
        "id": 32,
-        "logprob": -0.008720398,
+        "logprob": -0.15148926,
        "text": "?"
      },
      {
-        "id": 0,
-        "logprob": -2.4726562,
-        "text": "<|endoftext|>"
-      },
-      {
-        "id": 50281,
-        "logprob": -18.265625,
-        "text": "<|assistant|>"
+        "id": 50279,
+        "logprob": -0.27026367,
+        "text": "<|ASSISTANT|>"
      }
    ],
    "seed": null,
    "tokens": [
      {
-        "id": 510,
-        "logprob": -0.63183594,
+        "id": 42,
+        "logprob": -0.88378906,
        "special": false,
-        "text": "The"
+        "text": "I"
      },
      {
-        "id": 3159,
-        "logprob": -0.5390625,
+        "id": 1353,
+        "logprob": -0.94921875,
        "special": false,
-        "text": " word"
+        "text": "'m"
      },
      {
-        "id": 346,
-        "logprob": -0.045684814,
+        "id": 417,
+        "logprob": -2.2402344,
        "special": false,
-        "text": " \""
+        "text": " not"
      },
      {
-        "id": 6441,
-        "logprob": -0.002090454,
+        "id": 2119,
+        "logprob": -0.3725586,
        "special": false,
-        "text": "mem"
+        "text": " sure"
      },
      {
-        "id": 70,
-        "logprob": -1.3589859e-05,
+        "id": 13,
+        "logprob": -1.078125,
        "special": false,
-        "text": "e"
+        "text": ","
      },
      {
-        "id": 3,
-        "logprob": -0.0009455681,
+        "id": 534,
+        "logprob": -0.67822266,
        "special": false,
-        "text": "\""
+        "text": " which"
      },
      {
-        "id": 369,
-        "logprob": -0.088012695,
+        "id": 310,
+        "logprob": -1.3837891,
        "special": false,
-        "text": " was"
+        "text": " is"
      },
      {
-        "id": 806,
-        "logprob": -0.12585449,
+        "id": 253,
+        "logprob": -1.7050781,
        "special": false,
-        "text": " first"
+        "text": " the"
      },
      {
-        "id": 908,
-        "logprob": -0.017196655,
+        "id": 1682,
+        "logprob": -0.052001953,
        "special": false,
-        "text": " used"
+        "text": " best"
      },
      {
-        "id": 275,
-        "logprob": -0.49731445,
+        "id": 1039,
+        "logprob": -2.0390625,
        "special": false,
-        "text": " in"
+        "text": " way"
      }
    ]
  },
-  "generated_text": "The word \"meme\" was first used in"
+  "generated_text": "I'm not sure, which is the best way"
 }
--- a/integration-tests/models/snapshots/test_flash_neox/test_flash_neox_load.json
+++ b/integration-tests/models/snapshots/test_flash_neox/test_flash_neox_load.json
@ -8,159 +8,109 @@
        {
          "id": 50278,
          "logprob": null,
-          "text": "<|prompter|>"
+          "text": "<|USER|>"
        },
        {
          "id": 1276,
-          "logprob": -8.03125,
+          "logprob": -4.5546875,
          "text": "What"
        },
-        {
-          "id": 310,
-          "logprob": -5.421875,
-          "text": " is"
-        },
-        {
-          "id": 247,
-          "logprob": -2.1601562,
-          "text": " a"
-        },
-        {
-          "id": 1167,
-          "logprob": -5.4609375,
-          "text": " mem"
-        },
-        {
-          "id": 70,
-          "logprob": -0.005657196,
-          "text": "e"
-        },
-        {
-          "id": 13,
-          "logprob": -7.28125,
-          "text": ","
-        },
-        {
-          "id": 285,
-          "logprob": -0.2980957,
-          "text": " and"
-        },
-        {
-          "id": 752,
-          "logprob": -2.1679688,
-          "text": " what"
-        },
        {
          "id": 434,
-          "logprob": -5.6210938,
+          "logprob": -4.234375,
          "text": "'s"
        },
        {
-          "id": 253,
-          "logprob": -0.81103516,
-          "text": " the"
+          "id": 634,
+          "logprob": -5.21875,
+          "text": " your"
        },
        {
-          "id": 2892,
-          "logprob": -6.6640625,
-          "text": " history"
+          "id": 12315,
+          "logprob": -9.9375,
+          "text": " mood"
        },
        {
-          "id": 3212,
-          "logprob": -2.265625,
-          "text": " behind"
-        },
-        {
-          "id": 436,
-          "logprob": -11.5078125,
-          "text": " this"
-        },
-        {
-          "id": 3159,
-          "logprob": -2.1582031,
-          "text": " word"
+          "id": 3063,
+          "logprob": -4.1015625,
+          "text": " today"
        },
        {
          "id": 32,
-          "logprob": -0.008720398,
+          "logprob": -0.15319824,
          "text": "?"
        },
        {
-          "id": 0,
-          "logprob": -2.4726562,
-          "text": "<|endoftext|>"
-        },
-        {
-          "id": 50281,
-          "logprob": -18.265625,
-          "text": "<|assistant|>"
+          "id": 50279,
+          "logprob": -0.2614746,
+          "text": "<|ASSISTANT|>"
        }
      ],
      "seed": null,
      "tokens": [
        {
-          "id": 510,
-          "logprob": -0.63183594,
+          "id": 42,
+          "logprob": -0.8886719,
          "special": false,
-          "text": "The"
+          "text": "I"
        },
        {
-          "id": 3159,
-          "logprob": -0.5488281,
+          "id": 1353,
+          "logprob": -0.98046875,
          "special": false,
-          "text": " word"
+          "text": "'m"
        },
        {
-          "id": 346,
-          "logprob": -0.045684814,
+          "id": 417,
+          "logprob": -2.2265625,
          "special": false,
-          "text": " \""
+          "text": " not"
        },
        {
-          "id": 6441,
-          "logprob": -0.00207901,
+          "id": 2119,
+          "logprob": -0.3479004,
          "special": false,
-          "text": "mem"
+          "text": " sure"
        },
        {
-          "id": 70,
-          "logprob": -1.335144e-05,
+          "id": 13,
+          "logprob": -1.0117188,
          "special": false,
-          "text": "e"
+          "text": ","
        },
        {
-          "id": 3,
-          "logprob": -0.00097227097,
+          "id": 534,
+          "logprob": -0.67871094,
          "special": false,
-          "text": "\""
+          "text": " which"
        },
        {
-          "id": 369,
-          "logprob": -0.0892334,
+          "id": 310,
+          "logprob": -1.421875,
          "special": false,
-          "text": " was"
+          "text": " is"
        },
        {
-          "id": 806,
-          "logprob": -0.12463379,
+          "id": 253,
+          "logprob": -1.7382812,
          "special": false,
-          "text": " first"
+          "text": " the"
        },
        {
-          "id": 908,
-          "logprob": -0.01737976,
+          "id": 1682,
+          "logprob": -0.051330566,
          "special": false,
-          "text": " used"
+          "text": " best"
        },
        {
-          "id": 275,
-          "logprob": -0.50341797,
+          "id": 1039,
+          "logprob": -2.0390625,
          "special": false,
-          "text": " in"
+          "text": " way"
        }
      ]
    },
-    "generated_text": "The word \"meme\" was first used in"
+    "generated_text": "I'm not sure, which is the best way"
  },
  {
    "details": {
@ -171,159 +121,109 @@
        {
          "id": 50278,
          "logprob": null,
-          "text": "<|prompter|>"
+          "text": "<|USER|>"
        },
        {
          "id": 1276,
-          "logprob": -8.03125,
+          "logprob": -4.5546875,
          "text": "What"
        },
-        {
-          "id": 310,
-          "logprob": -5.421875,
-          "text": " is"
-        },
-        {
-          "id": 247,
-          "logprob": -2.1601562,
-          "text": " a"
-        },
-        {
-          "id": 1167,
-          "logprob": -5.4609375,
-          "text": " mem"
-        },
-        {
-          "id": 70,
-          "logprob": -0.005657196,
-          "text": "e"
-        },
-        {
-          "id": 13,
-          "logprob": -7.28125,
-          "text": ","
-        },
-        {
-          "id": 285,
-          "logprob": -0.2980957,
-          "text": " and"
-        },
-        {
-          "id": 752,
-          "logprob": -2.1679688,
-          "text": " what"
-        },
        {
          "id": 434,
-          "logprob": -5.6210938,
+          "logprob": -4.234375,
          "text": "'s"
        },
        {
-          "id": 253,
-          "logprob": -0.81103516,
-          "text": " the"
+          "id": 634,
+          "logprob": -5.1054688,
+          "text": " your"
        },
        {
-          "id": 2892,
-          "logprob": -6.6640625,
-          "text": " history"
+          "id": 12315,
+          "logprob": -9.953125,
+          "text": " mood"
        },
        {
-          "id": 3212,
-          "logprob": -2.265625,
-          "text": " behind"
-        },
-        {
-          "id": 436,
-          "logprob": -11.5078125,
-          "text": " this"
-        },
-        {
-          "id": 3159,
-          "logprob": -2.1582031,
-          "text": " word"
+          "id": 3063,
+          "logprob": -4.0820312,
+          "text": " today"
        },
        {
          "id": 32,
-          "logprob": -0.008720398,
+          "logprob": -0.15148926,
          "text": "?"
        },
        {
-          "id": 0,
-          "logprob": -2.4726562,
-          "text": "<|endoftext|>"
-        },
-        {
-          "id": 50281,
-          "logprob": -18.265625,
-          "text": "<|assistant|>"
+          "id": 50279,
+          "logprob": -0.27026367,
+          "text": "<|ASSISTANT|>"
        }
      ],
      "seed": null,
      "tokens": [
        {
-          "id": 510,
-          "logprob": -0.63183594,
+          "id": 42,
+          "logprob": -0.88378906,
          "special": false,
-          "text": "The"
+          "text": "I"
        },
        {
-          "id": 3159,
-          "logprob": -0.5488281,
+          "id": 1353,
+          "logprob": -0.9819336,
          "special": false,
-          "text": " word"
+          "text": "'m"
        },
        {
-          "id": 346,
-          "logprob": -0.045684814,
+          "id": 417,
+          "logprob": -2.2421875,
          "special": false,
-          "text": " \""
+          "text": " not"
        },
        {
-          "id": 6441,
-          "logprob": -0.00207901,
+          "id": 2119,
+          "logprob": -0.3474121,
          "special": false,
-          "text": "mem"
+          "text": " sure"
        },
        {
-          "id": 70,
-          "logprob": -1.335144e-05,
+          "id": 13,
+          "logprob": -1.078125,
          "special": false,
-          "text": "e"
+          "text": ","
        },
        {
-          "id": 3,
-          "logprob": -0.00097227097,
+          "id": 534,
+          "logprob": -0.69140625,
          "special": false,
-          "text": "\""
+          "text": " which"
        },
        {
-          "id": 369,
-          "logprob": -0.0892334,
+          "id": 310,
+          "logprob": -1.4072266,
          "special": false,
-          "text": " was"
+          "text": " is"
        },
        {
-          "id": 806,
-          "logprob": -0.12463379,
+          "id": 253,
+          "logprob": -1.7041016,
          "special": false,
-          "text": " first"
+          "text": " the"
        },
        {
-          "id": 908,
-          "logprob": -0.01737976,
+          "id": 1682,
+          "logprob": -0.053375244,
          "special": false,
-          "text": " used"
+          "text": " best"
        },
        {
-          "id": 275,
-          "logprob": -0.50341797,
+          "id": 1039,
+          "logprob": -2.0351562,
          "special": false,
-          "text": " in"
+          "text": " way"
        }
      ]
    },
-    "generated_text": "The word \"meme\" was first used in"
+    "generated_text": "I'm not sure, which is the best way"
  },
  {
    "details": {
@ -334,159 +234,109 @@
        {
          "id": 50278,
          "logprob": null,
-          "text": "<|prompter|>"
+          "text": "<|USER|>"
        },
        {
          "id": 1276,
-          "logprob": -8.03125,
+          "logprob": -4.5546875,
          "text": "What"
        },
-        {
-          "id": 310,
-          "logprob": -5.421875,
-          "text": " is"
-        },
-        {
-          "id": 247,
-          "logprob": -2.1601562,
-          "text": " a"
-        },
-        {
-          "id": 1167,
-          "logprob": -5.4609375,
-          "text": " mem"
-        },
-        {
-          "id": 70,
-          "logprob": -0.005657196,
-          "text": "e"
-        },
-        {
-          "id": 13,
-          "logprob": -7.28125,
-          "text": ","
-        },
-        {
-          "id": 285,
-          "logprob": -0.2980957,
-          "text": " and"
-        },
-        {
-          "id": 752,
-          "logprob": -2.1679688,
-          "text": " what"
-        },
        {
          "id": 434,
-          "logprob": -5.6210938,
+          "logprob": -4.234375,
          "text": "'s"
        },
        {
-          "id": 253,
-          "logprob": -0.81103516,
-          "text": " the"
+          "id": 634,
+          "logprob": -5.21875,
+          "text": " your"
        },
        {
-          "id": 2892,
-          "logprob": -6.6640625,
-          "text": " history"
+          "id": 12315,
+          "logprob": -9.9375,
+          "text": " mood"
        },
        {
-          "id": 3212,
-          "logprob": -2.265625,
-          "text": " behind"
-        },
-        {
-          "id": 436,
-          "logprob": -11.5078125,
-          "text": " this"
-        },
-        {
-          "id": 3159,
-          "logprob": -2.1582031,
-          "text": " word"
+          "id": 3063,
+          "logprob": -4.1015625,
+          "text": " today"
        },
        {
          "id": 32,
-          "logprob": -0.008720398,
+          "logprob": -0.15319824,
          "text": "?"
        },
        {
-          "id": 0,
-          "logprob": -2.4726562,
-          "text": "<|endoftext|>"
-        },
-        {
-          "id": 50281,
-          "logprob": -18.265625,
-          "text": "<|assistant|>"
+          "id": 50279,
+          "logprob": -0.2614746,
+          "text": "<|ASSISTANT|>"
        }
      ],
      "seed": null,
      "tokens": [
        {
-          "id": 510,
-          "logprob": -0.63183594,
+          "id": 42,
+          "logprob": -0.8886719,
          "special": false,
-          "text": "The"
+          "text": "I"
        },
        {
-          "id": 3159,
-          "logprob": -0.5488281,
+          "id": 1353,
+          "logprob": -0.98046875,
          "special": false,
-          "text": " word"
+          "text": "'m"
        },
        {
-          "id": 346,
-          "logprob": -0.045684814,
+          "id": 417,
+          "logprob": -2.2265625,
          "special": false,
-          "text": " \""
+          "text": " not"
        },
        {
-          "id": 6441,
-          "logprob": -0.00207901,
+          "id": 2119,
+          "logprob": -0.3479004,
          "special": false,
-          "text": "mem"
+          "text": " sure"
        },
        {
-          "id": 70,
-          "logprob": -1.335144e-05,
+          "id": 13,
+          "logprob": -1.0117188,
          "special": false,
-          "text": "e"
+          "text": ","
        },
        {
-          "id": 3,
-          "logprob": -0.00097227097,
+          "id": 534,
+          "logprob": -0.67871094,
          "special": false,
-          "text": "\""
+          "text": " which"
        },
        {
-          "id": 369,
-          "logprob": -0.0892334,
+          "id": 310,
+          "logprob": -1.421875,
          "special": false,
-          "text": " was"
+          "text": " is"
        },
        {
-          "id": 806,
-          "logprob": -0.12463379,
+          "id": 253,
+          "logprob": -1.7382812,
          "special": false,
-          "text": " first"
+          "text": " the"
        },
        {
-          "id": 908,
-          "logprob": -0.01737976,
+          "id": 1682,
+          "logprob": -0.051330566,
          "special": false,
-          "text": " used"
+          "text": " best"
        },
        {
-          "id": 275,
-          "logprob": -0.50341797,
+          "id": 1039,
+          "logprob": -2.0390625,
          "special": false,
-          "text": " in"
+          "text": " way"
        }
      ]
    },
-    "generated_text": "The word \"meme\" was first used in"
+    "generated_text": "I'm not sure, which is the best way"
  },
  {
    "details": {
@ -497,158 +347,108 @@
        {
          "id": 50278,
          "logprob": null,
-          "text": "<|prompter|>"
+          "text": "<|USER|>"
        },
        {
          "id": 1276,
-          "logprob": -8.03125,
+          "logprob": -4.5546875,
          "text": "What"
        },
-        {
-          "id": 310,
-          "logprob": -5.421875,
-          "text": " is"
-        },
-        {
-          "id": 247,
-          "logprob": -2.1601562,
-          "text": " a"
-        },
-        {
-          "id": 1167,
-          "logprob": -5.4609375,
-          "text": " mem"
-        },
-        {
-          "id": 70,
-          "logprob": -0.005657196,
-          "text": "e"
-        },
-        {
-          "id": 13,
-          "logprob": -7.28125,
-          "text": ","
-        },
-        {
-          "id": 285,
-          "logprob": -0.2980957,
-          "text": " and"
-        },
-        {
-          "id": 752,
-          "logprob": -2.1679688,
-          "text": " what"
-        },
        {
          "id": 434,
-          "logprob": -5.6210938,
+          "logprob": -4.234375,
          "text": "'s"
        },
        {
-          "id": 253,
-          "logprob": -0.81103516,
-          "text": " the"
+          "id": 634,
+          "logprob": -5.21875,
+          "text": " your"
        },
        {
-          "id": 2892,
-          "logprob": -6.6640625,
-          "text": " history"
+          "id": 12315,
+          "logprob": -9.9375,
+          "text": " mood"
        },
        {
-          "id": 3212,
-          "logprob": -2.265625,
-          "text": " behind"
-        },
-        {
-          "id": 436,
-          "logprob": -11.5078125,
-          "text": " this"
-        },
-        {
-          "id": 3159,
-          "logprob": -2.1582031,
-          "text": " word"
+          "id": 3063,
+          "logprob": -4.1015625,
+          "text": " today"
        },
        {
          "id": 32,
-          "logprob": -0.008720398,
+          "logprob": -0.15319824,
          "text": "?"
        },
        {
-          "id": 0,
-          "logprob": -2.4726562,
-          "text": "<|endoftext|>"
-        },
-        {
-          "id": 50281,
-          "logprob": -18.265625,
-          "text": "<|assistant|>"
+          "id": 50279,
+          "logprob": -0.2614746,
+          "text": "<|ASSISTANT|>"
        }
      ],
      "seed": null,
      "tokens": [
        {
-          "id": 510,
-          "logprob": -0.63183594,
+          "id": 42,
+          "logprob": -0.8886719,
          "special": false,
-          "text": "The"
+          "text": "I"
        },
        {
-          "id": 3159,
-          "logprob": -0.5488281,
+          "id": 1353,
+          "logprob": -0.98046875,
          "special": false,
-          "text": " word"
+          "text": "'m"
        },
        {
-          "id": 346,
-          "logprob": -0.045684814,
+          "id": 417,
+          "logprob": -2.2265625,
          "special": false,
-          "text": " \""
+          "text": " not"
        },
        {
-          "id": 6441,
-          "logprob": -0.00207901,
+          "id": 2119,
+          "logprob": -0.3479004,
          "special": false,
-          "text": "mem"
+          "text": " sure"
        },
        {
-          "id": 70,
-          "logprob": -1.335144e-05,
+          "id": 13,
+          "logprob": -1.0117188,
          "special": false,
-          "text": "e"
+          "text": ","
        },
        {
-          "id": 3,
-          "logprob": -0.00097227097,
+          "id": 534,
+          "logprob": -0.67871094,
          "special": false,
-          "text": "\""
+          "text": " which"
        },
        {
-          "id": 369,
-          "logprob": -0.0892334,
+          "id": 310,
+          "logprob": -1.421875,
          "special": false,
-          "text": " was"
+          "text": " is"
        },
        {
-          "id": 806,
-          "logprob": -0.12463379,
+          "id": 253,
+          "logprob": -1.7382812,
          "special": false,
-          "text": " first"
+          "text": " the"
        },
        {
-          "id": 908,
-          "logprob": -0.01737976,
+          "id": 1682,
+          "logprob": -0.051330566,
          "special": false,
-          "text": " used"
+          "text": " best"
        },
        {
-          "id": 275,
-          "logprob": -0.50341797,
+          "id": 1039,
+          "logprob": -2.0390625,
          "special": false,
-          "text": " in"
+          "text": " way"
        }
      ]
    },
-    "generated_text": "The word \"meme\" was first used in"
+    "generated_text": "I'm not sure, which is the best way"
  }
 ]
--- a/integration-tests/models/snapshots/test_flash_neox_sharded/test_flash_neox.json
+++ b/integration-tests/models/snapshots/test_flash_neox_sharded/test_flash_neox.json
@ -0,0 +1,163 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 50278,
+        "logprob": null,
+        "text": "<|prompter|>"
+      },
+      {
+        "id": 1276,
+        "logprob": -8.03125,
+        "text": "What"
+      },
+      {
+        "id": 310,
+        "logprob": -5.421875,
+        "text": " is"
+      },
+      {
+        "id": 247,
+        "logprob": -2.1601562,
+        "text": " a"
+      },
+      {
+        "id": 1167,
+        "logprob": -5.4609375,
+        "text": " mem"
+      },
+      {
+        "id": 70,
+        "logprob": -0.005657196,
+        "text": "e"
+      },
+      {
+        "id": 13,
+        "logprob": -7.28125,
+        "text": ","
+      },
+      {
+        "id": 285,
+        "logprob": -0.2980957,
+        "text": " and"
+      },
+      {
+        "id": 752,
+        "logprob": -2.1679688,
+        "text": " what"
+      },
+      {
+        "id": 434,
+        "logprob": -5.6210938,
+        "text": "'s"
+      },
+      {
+        "id": 253,
+        "logprob": -0.81103516,
+        "text": " the"
+      },
+      {
+        "id": 2892,
+        "logprob": -6.6640625,
+        "text": " history"
+      },
+      {
+        "id": 3212,
+        "logprob": -2.265625,
+        "text": " behind"
+      },
+      {
+        "id": 436,
+        "logprob": -11.5078125,
+        "text": " this"
+      },
+      {
+        "id": 3159,
+        "logprob": -2.1582031,
+        "text": " word"
+      },
+      {
+        "id": 32,
+        "logprob": -0.008720398,
+        "text": "?"
+      },
+      {
+        "id": 0,
+        "logprob": -2.4726562,
+        "text": "<|endoftext|>"
+      },
+      {
+        "id": 50281,
+        "logprob": -18.265625,
+        "text": "<|assistant|>"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 510,
+        "logprob": -0.63183594,
+        "special": false,
+        "text": "The"
+      },
+      {
+        "id": 3159,
+        "logprob": -0.5390625,
+        "special": false,
+        "text": " word"
+      },
+      {
+        "id": 346,
+        "logprob": -0.045684814,
+        "special": false,
+        "text": " \""
+      },
+      {
+        "id": 6441,
+        "logprob": -0.002090454,
+        "special": false,
+        "text": "mem"
+      },
+      {
+        "id": 70,
+        "logprob": -1.3589859e-05,
+        "special": false,
+        "text": "e"
+      },
+      {
+        "id": 3,
+        "logprob": -0.0009455681,
+        "special": false,
+        "text": "\""
+      },
+      {
+        "id": 369,
+        "logprob": -0.088012695,
+        "special": false,
+        "text": " was"
+      },
+      {
+        "id": 806,
+        "logprob": -0.12585449,
+        "special": false,
+        "text": " first"
+      },
+      {
+        "id": 908,
+        "logprob": -0.017196655,
+        "special": false,
+        "text": " used"
+      },
+      {
+        "id": 275,
+        "logprob": -0.49731445,
+        "special": false,
+        "text": " in"
+      }
+    ]
+  },
+  "generated_text": "The word \"meme\" was first used in"
+}
--- a/integration-tests/models/snapshots/test_flash_neox_sharded/test_flash_neox_load.json
+++ b/integration-tests/models/snapshots/test_flash_neox_sharded/test_flash_neox_load.json
@ -0,0 +1,654 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 50278,
+          "logprob": null,
+          "text": "<|prompter|>"
+        },
+        {
+          "id": 1276,
+          "logprob": -8.03125,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -5.421875,
+          "text": " is"
+        },
+        {
+          "id": 247,
+          "logprob": -2.1601562,
+          "text": " a"
+        },
+        {
+          "id": 1167,
+          "logprob": -5.4609375,
+          "text": " mem"
+        },
+        {
+          "id": 70,
+          "logprob": -0.005657196,
+          "text": "e"
+        },
+        {
+          "id": 13,
+          "logprob": -7.28125,
+          "text": ","
+        },
+        {
+          "id": 285,
+          "logprob": -0.2980957,
+          "text": " and"
+        },
+        {
+          "id": 752,
+          "logprob": -2.1679688,
+          "text": " what"
+        },
+        {
+          "id": 434,
+          "logprob": -5.6210938,
+          "text": "'s"
+        },
+        {
+          "id": 253,
+          "logprob": -0.81103516,
+          "text": " the"
+        },
+        {
+          "id": 2892,
+          "logprob": -6.6640625,
+          "text": " history"
+        },
+        {
+          "id": 3212,
+          "logprob": -2.265625,
+          "text": " behind"
+        },
+        {
+          "id": 436,
+          "logprob": -11.5078125,
+          "text": " this"
+        },
+        {
+          "id": 3159,
+          "logprob": -2.1582031,
+          "text": " word"
+        },
+        {
+          "id": 32,
+          "logprob": -0.008720398,
+          "text": "?"
+        },
+        {
+          "id": 0,
+          "logprob": -2.4726562,
+          "text": "<|endoftext|>"
+        },
+        {
+          "id": 50281,
+          "logprob": -18.265625,
+          "text": "<|assistant|>"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 510,
+          "logprob": -0.63183594,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 3159,
+          "logprob": -0.5488281,
+          "special": false,
+          "text": " word"
+        },
+        {
+          "id": 346,
+          "logprob": -0.045684814,
+          "special": false,
+          "text": " \""
+        },
+        {
+          "id": 6441,
+          "logprob": -0.00207901,
+          "special": false,
+          "text": "mem"
+        },
+        {
+          "id": 70,
+          "logprob": -1.335144e-05,
+          "special": false,
+          "text": "e"
+        },
+        {
+          "id": 3,
+          "logprob": -0.00097227097,
+          "special": false,
+          "text": "\""
+        },
+        {
+          "id": 369,
+          "logprob": -0.0892334,
+          "special": false,
+          "text": " was"
+        },
+        {
+          "id": 806,
+          "logprob": -0.12463379,
+          "special": false,
+          "text": " first"
+        },
+        {
+          "id": 908,
+          "logprob": -0.01737976,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 275,
+          "logprob": -0.50341797,
+          "special": false,
+          "text": " in"
+        }
+      ]
+    },
+    "generated_text": "The word \"meme\" was first used in"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 50278,
+          "logprob": null,
+          "text": "<|prompter|>"
+        },
+        {
+          "id": 1276,
+          "logprob": -8.03125,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -5.421875,
+          "text": " is"
+        },
+        {
+          "id": 247,
+          "logprob": -2.1601562,
+          "text": " a"
+        },
+        {
+          "id": 1167,
+          "logprob": -5.4609375,
+          "text": " mem"
+        },
+        {
+          "id": 70,
+          "logprob": -0.005657196,
+          "text": "e"
+        },
+        {
+          "id": 13,
+          "logprob": -7.28125,
+          "text": ","
+        },
+        {
+          "id": 285,
+          "logprob": -0.2980957,
+          "text": " and"
+        },
+        {
+          "id": 752,
+          "logprob": -2.1679688,
+          "text": " what"
+        },
+        {
+          "id": 434,
+          "logprob": -5.6210938,
+          "text": "'s"
+        },
+        {
+          "id": 253,
+          "logprob": -0.81103516,
+          "text": " the"
+        },
+        {
+          "id": 2892,
+          "logprob": -6.6640625,
+          "text": " history"
+        },
+        {
+          "id": 3212,
+          "logprob": -2.265625,
+          "text": " behind"
+        },
+        {
+          "id": 436,
+          "logprob": -11.5078125,
+          "text": " this"
+        },
+        {
+          "id": 3159,
+          "logprob": -2.1582031,
+          "text": " word"
+        },
+        {
+          "id": 32,
+          "logprob": -0.008720398,
+          "text": "?"
+        },
+        {
+          "id": 0,
+          "logprob": -2.4726562,
+          "text": "<|endoftext|>"
+        },
+        {
+          "id": 50281,
+          "logprob": -18.265625,
+          "text": "<|assistant|>"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 510,
+          "logprob": -0.63183594,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 3159,
+          "logprob": -0.5488281,
+          "special": false,
+          "text": " word"
+        },
+        {
+          "id": 346,
+          "logprob": -0.045684814,
+          "special": false,
+          "text": " \""
+        },
+        {
+          "id": 6441,
+          "logprob": -0.00207901,
+          "special": false,
+          "text": "mem"
+        },
+        {
+          "id": 70,
+          "logprob": -1.335144e-05,
+          "special": false,
+          "text": "e"
+        },
+        {
+          "id": 3,
+          "logprob": -0.00097227097,
+          "special": false,
+          "text": "\""
+        },
+        {
+          "id": 369,
+          "logprob": -0.0892334,
+          "special": false,
+          "text": " was"
+        },
+        {
+          "id": 806,
+          "logprob": -0.12463379,
+          "special": false,
+          "text": " first"
+        },
+        {
+          "id": 908,
+          "logprob": -0.01737976,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 275,
+          "logprob": -0.50341797,
+          "special": false,
+          "text": " in"
+        }
+      ]
+    },
+    "generated_text": "The word \"meme\" was first used in"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 50278,
+          "logprob": null,
+          "text": "<|prompter|>"
+        },
+        {
+          "id": 1276,
+          "logprob": -8.03125,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -5.421875,
+          "text": " is"
+        },
+        {
+          "id": 247,
+          "logprob": -2.1601562,
+          "text": " a"
+        },
+        {
+          "id": 1167,
+          "logprob": -5.4609375,
+          "text": " mem"
+        },
+        {
+          "id": 70,
+          "logprob": -0.005657196,
+          "text": "e"
+        },
+        {
+          "id": 13,
+          "logprob": -7.28125,
+          "text": ","
+        },
+        {
+          "id": 285,
+          "logprob": -0.2980957,
+          "text": " and"
+        },
+        {
+          "id": 752,
+          "logprob": -2.1679688,
+          "text": " what"
+        },
+        {
+          "id": 434,
+          "logprob": -5.6210938,
+          "text": "'s"
+        },
+        {
+          "id": 253,
+          "logprob": -0.81103516,
+          "text": " the"
+        },
+        {
+          "id": 2892,
+          "logprob": -6.6640625,
+          "text": " history"
+        },
+        {
+          "id": 3212,
+          "logprob": -2.265625,
+          "text": " behind"
+        },
+        {
+          "id": 436,
+          "logprob": -11.5078125,
+          "text": " this"
+        },
+        {
+          "id": 3159,
+          "logprob": -2.1582031,
+          "text": " word"
+        },
+        {
+          "id": 32,
+          "logprob": -0.008720398,
+          "text": "?"
+        },
+        {
+          "id": 0,
+          "logprob": -2.4726562,
+          "text": "<|endoftext|>"
+        },
+        {
+          "id": 50281,
+          "logprob": -18.265625,
+          "text": "<|assistant|>"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 510,
+          "logprob": -0.63183594,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 3159,
+          "logprob": -0.5488281,
+          "special": false,
+          "text": " word"
+        },
+        {
+          "id": 346,
+          "logprob": -0.045684814,
+          "special": false,
+          "text": " \""
+        },
+        {
+          "id": 6441,
+          "logprob": -0.00207901,
+          "special": false,
+          "text": "mem"
+        },
+        {
+          "id": 70,
+          "logprob": -1.335144e-05,
+          "special": false,
+          "text": "e"
+        },
+        {
+          "id": 3,
+          "logprob": -0.00097227097,
+          "special": false,
+          "text": "\""
+        },
+        {
+          "id": 369,
+          "logprob": -0.0892334,
+          "special": false,
+          "text": " was"
+        },
+        {
+          "id": 806,
+          "logprob": -0.12463379,
+          "special": false,
+          "text": " first"
+        },
+        {
+          "id": 908,
+          "logprob": -0.01737976,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 275,
+          "logprob": -0.50341797,
+          "special": false,
+          "text": " in"
+        }
+      ]
+    },
+    "generated_text": "The word \"meme\" was first used in"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 50278,
+          "logprob": null,
+          "text": "<|prompter|>"
+        },
+        {
+          "id": 1276,
+          "logprob": -8.03125,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -5.421875,
+          "text": " is"
+        },
+        {
+          "id": 247,
+          "logprob": -2.1601562,
+          "text": " a"
+        },
+        {
+          "id": 1167,
+          "logprob": -5.4609375,
+          "text": " mem"
+        },
+        {
+          "id": 70,
+          "logprob": -0.005657196,
+          "text": "e"
+        },
+        {
+          "id": 13,
+          "logprob": -7.28125,
+          "text": ","
+        },
+        {
+          "id": 285,
+          "logprob": -0.2980957,
+          "text": " and"
+        },
+        {
+          "id": 752,
+          "logprob": -2.1679688,
+          "text": " what"
+        },
+        {
+          "id": 434,
+          "logprob": -5.6210938,
+          "text": "'s"
+        },
+        {
+          "id": 253,
+          "logprob": -0.81103516,
+          "text": " the"
+        },
+        {
+          "id": 2892,
+          "logprob": -6.6640625,
+          "text": " history"
+        },
+        {
+          "id": 3212,
+          "logprob": -2.265625,
+          "text": " behind"
+        },
+        {
+          "id": 436,
+          "logprob": -11.5078125,
+          "text": " this"
+        },
+        {
+          "id": 3159,
+          "logprob": -2.1582031,
+          "text": " word"
+        },
+        {
+          "id": 32,
+          "logprob": -0.008720398,
+          "text": "?"
+        },
+        {
+          "id": 0,
+          "logprob": -2.4726562,
+          "text": "<|endoftext|>"
+        },
+        {
+          "id": 50281,
+          "logprob": -18.265625,
+          "text": "<|assistant|>"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 510,
+          "logprob": -0.63183594,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 3159,
+          "logprob": -0.5488281,
+          "special": false,
+          "text": " word"
+        },
+        {
+          "id": 346,
+          "logprob": -0.045684814,
+          "special": false,
+          "text": " \""
+        },
+        {
+          "id": 6441,
+          "logprob": -0.00207901,
+          "special": false,
+          "text": "mem"
+        },
+        {
+          "id": 70,
+          "logprob": -1.335144e-05,
+          "special": false,
+          "text": "e"
+        },
+        {
+          "id": 3,
+          "logprob": -0.00097227097,
+          "special": false,
+          "text": "\""
+        },
+        {
+          "id": 369,
+          "logprob": -0.0892334,
+          "special": false,
+          "text": " was"
+        },
+        {
+          "id": 806,
+          "logprob": -0.12463379,
+          "special": false,
+          "text": " first"
+        },
+        {
+          "id": 908,
+          "logprob": -0.01737976,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 275,
+          "logprob": -0.50341797,
+          "special": false,
+          "text": " in"
+        }
+      ]
+    },
+    "generated_text": "The word \"meme\" was first used in"
+  }
+]
--- a/integration-tests/models/test_flash_neox.py
+++ b/integration-tests/models/test_flash_neox.py
@ -3,7 +3,7 @@ import pytest

@pytest.fixture(scope="module")
 def flash_neox_handle(launcher):
-    with launcher("OpenAssistant/oasst-sft-1-pythia-12b", num_shard=2) as handle:
+    with launcher("stabilityai/stablelm-tuned-alpha-3b", num_shard=1) as handle:  # 2-shard 12B case now lives in test_flash_neox_sharded.py
        yield handle


@ -16,7 +16,7 @@ async def flash_neox(flash_neox_handle):
@pytest.mark.asyncio
 async def test_flash_neox(flash_neox, response_snapshot):
    response = await flash_neox.generate(
-        "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>",
+        "<|USER|>What's your mood today?<|ASSISTANT|>",
        max_new_tokens=10,
    )

@ -28,12 +28,14 @@ async def test_flash_neox(flash_neox, response_snapshot):
 async def test_flash_neox_load(flash_neox, generate_load, response_snapshot):
    responses = await generate_load(
        flash_neox,
-        "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>",
+        "<|USER|>What's your mood today?<|ASSISTANT|>",
        max_new_tokens=10,
        n=4,
    )

-    assert len(responses) == 4
-    assert all([r.generated_text == responses[0].generated_text for r in responses])
+    generated_texts = [r.generated_text for r in responses]
+
+    assert len(generated_texts) == 4
+    assert all(text == generated_texts[0] for text in generated_texts)  # every response must yield the same text

    assert responses == response_snapshot
--- a/integration-tests/models/test_flash_neox_sharded.py
+++ b/integration-tests/models/test_flash_neox_sharded.py
@ -0,0 +1,39 @@
+import pytest
+
+
+@pytest.fixture(scope="module")  # module scope: one launcher context is shared by every test in this file
+def flash_neox_sharded_handle(launcher):
+    with launcher("OpenAssistant/oasst-sft-1-pythia-12b", num_shard=2) as handle:  # two-shard launch of the 12B model
+        yield handle  # the launcher context exits once the module's tests are done
+
+
+@pytest.fixture(scope="module")
+async def flash_neox_sharded(flash_neox_sharded_handle):
+    await flash_neox_sharded_handle.health(240)  # presumably a readiness timeout in seconds -- confirm against the launcher fixture
+    return flash_neox_sharded_handle.client  # the tests issue generate() calls through this client
+
+
+@pytest.mark.asyncio
+async def test_flash_neox(flash_neox_sharded, response_snapshot):  # NOTE(review): name appears to key the snapshot file -- confirm before renaming
+    response = await flash_neox_sharded.generate(
+        "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>",
+        max_new_tokens=10,
+    )
+
+    assert response.details.generated_tokens == 10  # generation should run to the max_new_tokens cap
+    assert response == response_snapshot  # full response is compared against the stored snapshot
+
+
+@pytest.mark.asyncio
+async def test_flash_neox_load(flash_neox_sharded, generate_load, response_snapshot):  # NOTE(review): name appears to key the snapshot file -- confirm before renaming
+    responses = await generate_load(
+        flash_neox_sharded,
+        "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>",
+        max_new_tokens=10,
+        n=4,  # presumably the number of requests sent for this prompt; four responses are asserted below
+    )
+
+    assert len(responses) == 4
+    assert all([r.generated_text == responses[0].generated_text for r in responses])  # every response must yield the same text
+
+    assert responses == response_snapshot  # the whole response list is compared against the stored snapshot
--- a/integration-tests/pytest.ini
+++ b/integration-tests/pytest.ini
@ -0,0 +1,4 @@
+[pytest]
+asyncio_mode = auto
+markers =
+    private: marks tests as requiring an admin hf token (deselect with '-m "not private"')
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -35,6 +35,9 @@ bnb = ["bitsandbytes"]
 grpcio-tools = "^1.51.1"
 pytest = "^7.3.0"

+[tool.pytest.ini_options]
+markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
--- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
@ -362,7 +362,7 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
            pretrained_model_name_or_path, load_in_8bit=False, *model_args, **kwargs
        )

-        model.post_load_weights(load_in_8bit)
+        model.post_load_weights("bitsandbytes" if load_in_8bit else None)
        return model

    def forward(
@ -466,7 +466,7 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel):
        model = super(FlashGPTNeoXForCausalLM, cls).from_pretrained(
            pretrained_model_name_or_path, load_in_8bit=False, *model_args, **kwargs
        )
-        model.post_load_weights(load_in_8bit)
+        model.post_load_weights("bitsandbytes" if load_in_8bit else None)
        return model

    def forward(
--- a/server/text_generation_server/models/flash_neox.py
+++ b/server/text_generation_server/models/flash_neox.py
@ -88,7 +88,7 @@ class FlashNeoXSharded(FlashNeoX):
    def load_weights(
        model,
        filenames: List[str],
-        quantize: bool,
+        quantize: Optional[str],
        device: torch.device,
        dtype: torch.dtype,
        rank: int,
--- a/server/text_generation_server/models/flash_santacoder.py
+++ b/server/text_generation_server/models/flash_santacoder.py
@ -80,7 +80,7 @@ class FlashSantacoder(FlashCausalLM):
    def load_weights(
        model: FlashSantacoderForCausalLM,
        filenames: List[Path],
-        quantize: bool,
+        quantize: Optional[str],
        device: torch.device,
        dtype: torch.dtype,
        transpose: bool,
--- a/server/text_generation_server/models/opt.py
+++ b/server/text_generation_server/models/opt.py
@ -101,7 +101,7 @@ class OPTSharded(OPT):
    def load_weights(
        model,
        filenames: List[str],
-        quantize: bool,
+        quantize: Optional[str],
        device: torch.device,
        dtype: torch.dtype,
        rank: int,