diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
index 0d6dca31..cfabe3c6 100644
--- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
+++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
@@ -11,32 +11,32 @@
     },
     {
       "id": 338,
-      "logprob": -0.7133789,
+      "logprob": -0.6201172,
       "text": "is"
     },
     {
       "id": 16030,
-      "logprob": -13.9296875,
+      "logprob": -13.6484375,
       "text": "gradient"
     },
     {
       "id": 26815,
-      "logprob": -0.048919678,
+      "logprob": -0.003894806,
       "text": "descent"
     },
     {
       "id": 29973,
-      "logprob": -3.0078125,
+      "logprob": -2.6386719,
       "text": "?"
     },
     {
       "id": 13,
-      "logprob": -2.8105469,
+      "logprob": -6.46875,
       "text": "\n"
     },
     {
       "id": 13,
-      "logprob": -0.84521484,
+      "logprob": -6.6875,
       "text": "\n"
     }
   ],
@@ -44,66 +44,66 @@
   "tokens": [
     {
       "id": 25584,
-      "logprob": -0.017028809,
+      "logprob": -0.008979797,
       "special": false,
       "text": "Grad"
     },
     {
       "id": 993,
-      "logprob": -0.0027313232,
+      "logprob": -8.34465e-07,
       "special": false,
       "text": "ient"
     },
     {
       "id": 26815,
-      "logprob": -0.023254395,
+      "logprob": -0.0009407997,
       "special": false,
       "text": " descent"
     },
     {
       "id": 338,
-      "logprob": -2.0623207e-05,
+      "logprob": -0.0003838539,
       "special": false,
       "text": " is"
     },
     {
-      "id": 263,
-      "logprob": -0.5361328,
+      "id": 385,
+      "logprob": -0.24499512,
       "special": false,
-      "text": " a"
-    },
-    {
-      "id": 937,
-      "logprob": -0.17578125,
-      "special": false,
-      "text": " first"
-    },
-    {
-      "id": 29899,
-      "logprob": 0.0,
-      "special": false,
-      "text": "-"
-    },
-    {
-      "id": 2098,
-      "logprob": -0.00011539459,
-      "special": false,
-      "text": "order"
+      "text": " an"
     },
     {
       "id": 13883,
-      "logprob": -0.47436523,
+      "logprob": -0.010406494,
       "special": false,
       "text": " optimization"
     },
     {
       "id": 5687,
-      "logprob": -0.00027680397,
+      "logprob": -0.00024354458,
       "special": false,
       "text": " algorithm"
+    },
+    {
+      "id": 15574,
+      "logprob": -0.6582031,
+      "special": false,
+      "text": " commonly"
+    },
+    {
+      "id": 1304,
+      "logprob": -0.00092840195,
+      "special": false,
+      "text": " used"
+    },
+    {
+      "id": 297,
+      "logprob": -0.19470215,
+      "special": false,
+      "text": " in"
     }
   ],
   "top_tokens": null
 },
-  "generated_text": "Gradient descent is a first-order optimization algorithm"
+  "generated_text": "Gradient descent is an optimization algorithm commonly used in"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
index 38b80335..b524859f 100644
--- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
+++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
@@ -5,95 +5,95 @@
   "generated_tokens": 10,
   "prefill": [
     {
-      "id": 16030,
+      "id": 338,
       "logprob": null,
+      "text": "is"
+    },
+    {
+      "id": 16030,
+      "logprob": -13.328125,
       "text": "gradient"
     },
     {
       "id": 26815,
-      "logprob": -6.4960938,
+      "logprob": -0.24023438,
       "text": "descent"
     },
     {
       "id": 29973,
-      "logprob": -5.1484375,
+      "logprob": -3.1386719,
       "text": "?"
     },
     {
       "id": 13,
-      "logprob": -4.0351562,
-      "text": "\n"
-    },
-    {
-      "id": 13,
-      "logprob": -5.2265625,
+      "logprob": -3.0878906,
       "text": "\n"
     }
   ],
   "seed": 0,
   "tokens": [
     {
-      "id": 10994,
-      "logprob": -1.1542969,
-      "special": false,
-      "text": "Hello"
-    },
-    {
-      "id": 29991,
+      "id": 25584,
       "logprob": 0.0,
       "special": false,
-      "text": "!"
+      "text": "Grad"
     },
     {
-      "id": 739,
+      "id": 993,
       "logprob": 0.0,
       "special": false,
-      "text": " It"
+      "text": "ient"
     },
     {
-      "id": 2444,
-      "logprob": -0.42260742,
-      "special": false,
-      "text": " seems"
-    },
-    {
-      "id": 366,
+      "id": 2726,
       "logprob": 0.0,
       "special": false,
-      "text": " you"
+      "text": " Des"
     },
     {
-      "id": 29915,
+      "id": 1760,
       "logprob": 0.0,
       "special": false,
-      "text": "'"
+      "text": "cent"
    },
     {
-      "id": 276,
-      "logprob": -0.9838867,
+      "id": 313,
+      "logprob": -0.12322998,
       "special": false,
-      "text": "re"
+      "text": " ("
     },
     {
-      "id": 3211,
+      "id": 29954,
       "logprob": 0.0,
       "special": false,
-      "text": " address"
+      "text": "G"
     },
     {
-      "id": 292,
+      "id": 29928,
       "logprob": 0.0,
       "special": false,
-      "text": "ing"
+      "text": "D"
     },
     {
-      "id": 263,
-      "logprob": -0.15124512,
+      "id": 29897,
+      "logprob": 0.0,
       "special": false,
-      "text": " a"
+      "text": ")"
+    },
+    {
+      "id": 338,
+      "logprob": -0.6040039,
+      "special": false,
+      "text": " is"
+    },
+    {
+      "id": 385,
+      "logprob": -0.1796875,
+      "special": false,
+      "text": " an"
     }
   ],
   "top_tokens": null
 },
-  "generated_text": "What is gradient descent?\n\nHello! It seems you're addressing a"
+  "generated_text": "What is gradient descent?\nGradient Descent (GD) is an"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json
index f1f81152..2c977d8b 100644
--- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json
+++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json
@@ -12,32 +12,32 @@
       },
       {
         "id": 338,
-        "logprob": -0.7133789,
+        "logprob": -0.6201172,
         "text": "is"
       },
       {
         "id": 16030,
-        "logprob": -13.9296875,
+        "logprob": -13.6484375,
         "text": "gradient"
       },
       {
         "id": 26815,
-        "logprob": -0.048919678,
+        "logprob": -0.003894806,
        "text": "descent"
       },
       {
         "id": 29973,
-        "logprob": -3.0078125,
+        "logprob": -2.6386719,
        "text": "?"
       },
       {
         "id": 13,
-        "logprob": -2.8105469,
+        "logprob": -6.46875,
         "text": "\n"
       },
       {
         "id": 13,
-        "logprob": -0.84521484,
+        "logprob": -6.6875,
         "text": "\n"
       }
     ],
@@ -45,68 +45,68 @@
     "tokens": [
       {
         "id": 25584,
-        "logprob": -0.017028809,
+        "logprob": -0.008979797,
         "special": false,
         "text": "Grad"
       },
       {
         "id": 993,
-        "logprob": -0.0028476715,
+        "logprob": -8.34465e-07,
         "special": false,
         "text": "ient"
       },
       {
         "id": 26815,
-        "logprob": -0.023971558,
+        "logprob": -0.00097084045,
         "special": false,
         "text": " descent"
       },
       {
         "id": 338,
-        "logprob": -2.0384789e-05,
+        "logprob": -0.0003838539,
         "special": false,
         "text": " is"
       },
       {
-        "id": 263,
-        "logprob": -0.5229492,
+        "id": 385,
+        "logprob": -0.23840332,
         "special": false,
-        "text": " a"
-      },
-      {
-        "id": 937,
-        "logprob": -0.17602539,
-        "special": false,
-        "text": " first"
-      },
-      {
-        "id": 29899,
-        "logprob": 0.0,
-        "special": false,
-        "text": "-"
-      },
-      {
-        "id": 2098,
-        "logprob": -0.000116467476,
-        "special": false,
-        "text": "order"
+        "text": " an"
       },
       {
         "id": 13883,
-        "logprob": -0.47436523,
+        "logprob": -0.010406494,
         "special": false,
         "text": " optimization"
       },
       {
         "id": 5687,
-        "logprob": -0.00027871132,
+        "logprob": -0.0002501011,
         "special": false,
         "text": " algorithm"
+      },
+      {
+        "id": 15574,
+        "logprob": -0.6582031,
+        "special": false,
+        "text": " commonly"
+      },
+      {
+        "id": 1304,
+        "logprob": -0.00092840195,
+        "special": false,
+        "text": " used"
+      },
+      {
+        "id": 297,
+        "logprob": -0.18933105,
+        "special": false,
+        "text": " in"
       }
     ],
     "top_tokens": null
   },
-    "generated_text": "Gradient descent is a first-order optimization algorithm"
+    "generated_text": "Gradient descent is an optimization algorithm commonly used in"
   },
   {
     "details": {
@@ -121,32 +121,32 @@
       },
       {
         "id": 338,
-        "logprob": -0.7128906,
+        "logprob": -0.6113281,
         "text": "is"
       },
       {
         "id": 16030,
-        "logprob": -13.9375,
+        "logprob": -13.6640625,
         "text": "gradient"
       },
       {
         "id": 26815,
-        "logprob": -0.05053711,
+        "logprob": -0.003929138,
         "text": "descent"
       },
       {
         "id": 29973,
-        "logprob": -3.0058594,
+        "logprob": -2.625,
         "text": "?"
       },
       {
         "id": 13,
-        "logprob": -2.8242188,
+        "logprob": -6.484375,
         "text": "\n"
       },
       {
         "id": 13,
-        "logprob": -0.84521484,
+        "logprob": -6.6875,
         "text": "\n"
       }
     ],
@@ -154,68 +154,68 @@
     "tokens": [
       {
         "id": 25584,
-        "logprob": -0.018859863,
+        "logprob": -0.009017944,
         "special": false,
         "text": "Grad"
       },
       {
         "id": 993,
-        "logprob": -0.002822876,
+        "logprob": -9.536743e-07,
         "special": false,
         "text": "ient"
       },
       {
         "id": 26815,
-        "logprob": -0.023254395,
+        "logprob": -0.00097084045,
         "special": false,
         "text": " descent"
       },
       {
         "id": 338,
-        "logprob": -2.0384789e-05,
+        "logprob": -0.0003838539,
         "special": false,
         "text": " is"
       },
       {
-        "id": 263,
-        "logprob": -0.5229492,
+        "id": 385,
+        "logprob": -0.24499512,
         "special": false,
-        "text": " a"
-      },
-      {
-        "id": 937,
-        "logprob": -0.17126465,
-        "special": false,
-        "text": " first"
-      },
-      {
-        "id": 29899,
-        "logprob": 0.0,
-        "special": false,
-        "text": "-"
-      },
-      {
-        "id": 2098,
-        "logprob": -0.0001155138,
-        "special": false,
-        "text": "order"
+        "text": " an"
       },
       {
         "id": 13883,
-        "logprob": -0.47436523,
+        "logprob": -0.010406494,
         "special": false,
         "text": " optimization"
       },
       {
         "id": 5687,
-        "logprob": -0.00027036667,
+        "logprob": -0.0002501011,
         "special": false,
         "text": " algorithm"
+      },
+      {
+        "id": 15574,
+        "logprob": -0.6435547,
+        "special": false,
+        "text": " commonly"
+      },
+      {
+        "id": 1304,
+        "logprob": -0.0009279251,
+        "special": false,
+        "text": " used"
+      },
+      {
+        "id": 297,
+        "logprob": -0.18933105,
+        "special": false,
+        "text": " in"
      }
     ],
     "top_tokens": null
   },
-    "generated_text": "Gradient descent is a first-order optimization algorithm"
+    "generated_text": "Gradient descent is an optimization algorithm commonly used in"
   },
   {
     "details": {
@@ -230,32 +230,32 @@
       },
       {
         "id": 338,
-        "logprob": -0.71484375,
+        "logprob": -0.609375,
         "text": "is"
       },
       {
         "id": 16030,
-        "logprob": -13.9375,
+        "logprob": -13.671875,
         "text": "gradient"
       },
       {
         "id": 26815,
-        "logprob": -0.049346924,
+        "logprob": -0.0040016174,
         "text": "descent"
       },
       {
         "id": 29973,
-        "logprob": -3.0078125,
+        "logprob": -2.6230469,
         "text": "?"
       },
       {
         "id": 13,
-        "logprob": -2.8242188,
+        "logprob": -6.453125,
         "text": "\n"
       },
       {
         "id": 13,
-        "logprob": -0.86328125,
+        "logprob": -6.6875,
         "text": "\n"
       }
     ],
@@ -263,68 +263,68 @@
     "tokens": [
       {
         "id": 25584,
-        "logprob": -0.017196655,
+        "logprob": -0.008956909,
         "special": false,
         "text": "Grad"
       },
       {
         "id": 993,
-        "logprob": -0.0028438568,
+        "logprob": -8.34465e-07,
         "special": false,
         "text": "ient"
       },
       {
         "id": 26815,
-        "logprob": -0.023254395,
+        "logprob": -0.0009407997,
         "special": false,
         "text": " descent"
       },
       {
         "id": 338,
-        "logprob": -2.026558e-05,
+        "logprob": -0.0003721714,
         "special": false,
         "text": " is"
       },
       {
-        "id": 263,
-        "logprob": -0.5229492,
+        "id": 385,
+        "logprob": -0.24499512,
         "special": false,
-        "text": " a"
-      },
-      {
-        "id": 937,
-        "logprob": -0.17602539,
-        "special": false,
-        "text": " first"
-      },
-      {
-        "id": 29899,
-        "logprob": 0.0,
-        "special": false,
-        "text": "-"
-      },
-      {
-        "id": 2098,
-        "logprob": -0.00011622906,
-        "special": false,
-        "text": "order"
+        "text": " an"
       },
       {
         "id": 13883,
-        "logprob": -0.48608398,
+        "logprob": -0.010406494,
         "special": false,
         "text": " optimization"
       },
       {
         "id": 5687,
-        "logprob": -0.00027894974,
+        "logprob": -0.0002501011,
         "special": false,
         "text": " algorithm"
+      },
+      {
+        "id": 15574,
+        "logprob": -0.6435547,
+        "special": false,
+        "text": " commonly"
+      },
+      {
+        "id": 1304,
+        "logprob": -0.00092601776,
+        "special": false,
+        "text": " used"
+      },
+      {
+        "id": 297,
+        "logprob": -0.19177246,
+        "special": false,
+        "text": " in"
       }
     ],
     "top_tokens": null
   },
-    "generated_text": "Gradient descent is a first-order optimization algorithm"
+    "generated_text": "Gradient descent is an optimization algorithm commonly used in"
   },
   {
     "details": {
@@ -339,32 +339,32 @@
       },
       {
         "id": 338,
-        "logprob": -0.7192383,
+        "logprob": -0.609375,
         "text": "is"
       },
       {
         "id": 16030,
-        "logprob": -13.9375,
+        "logprob": -13.6640625,
         "text": "gradient"
       },
       {
         "id": 26815,
-        "logprob": -0.050445557,
+        "logprob": -0.0038967133,
         "text": "descent"
       },
       {
         "id": 29973,
-        "logprob": -3.0078125,
+        "logprob": -2.6347656,
         "text": "?"
       },
       {
         "id": 13,
-        "logprob": -2.8242188,
+        "logprob": -6.453125,
         "text": "\n"
       },
       {
         "id": 13,
-        "logprob": -0.8276367,
+        "logprob": -6.6875,
         "text": "\n"
       }
     ],
@@ -372,67 +372,67 @@
     "tokens": [
       {
         "id": 25584,
-        "logprob": -0.01727295,
+        "logprob": -0.008979797,
         "special": false,
         "text": "Grad"
       },
       {
         "id": 993,
-        "logprob": -0.0027542114,
+        "logprob": -9.536743e-07,
         "special": false,
         "text": "ient"
       },
       {
         "id": 26815,
-        "logprob": -0.023254395,
+        "logprob": -0.0009407997,
         "special": false,
         "text": " descent"
       },
       {
         "id": 338,
-        "logprob": -2.0384789e-05,
+        "logprob": -0.00038409233,
         "special": false,
         "text": " is"
       },
       {
-        "id": 263,
-        "logprob": -0.5229492,
+        "id": 385,
+        "logprob": -0.24499512,
         "special": false,
-        "text": " a"
-      },
-      {
-        "id": 937,
-        "logprob": -0.17126465,
-        "special": false,
-        "text": " first"
-      },
-      {
-        "id": 29899,
-        "logprob": 0.0,
-        "special": false,
-        "text": "-"
-      },
-      {
-        "id": 2098,
-        "logprob": -0.00011301041,
-        "special": false,
-        "text": "order"
+        "text": " an"
       },
       {
         "id": 13883,
-        "logprob": -0.48608398,
+        "logprob": -0.010414124,
         "special": false,
         "text": " optimization"
       },
       {
         "id": 5687,
-        "logprob": -0.00027894974,
+        "logprob": -0.00024354458,
         "special": false,
         "text": " algorithm"
+      },
+      {
+        "id": 15574,
+        "logprob": -0.6435547,
+        "special": false,
+        "text": " commonly"
+      },
+      {
+        "id": 1304,
+        "logprob": -0.0009279251,
+        "special": false,
+        "text": " used"
+      },
+      {
+        "id": 297,
+        "logprob": -0.19470215,
+        "special": false,
+        "text": " in"
       }
     ],
     "top_tokens": null
   },
-    "generated_text": "Gradient descent is a first-order optimization algorithm"
+    "generated_text": "Gradient descent is an optimization algorithm commonly used in"
   }
 ]
diff --git a/integration-tests/models/test_flash_phi35_moe.py b/integration-tests/models/test_flash_phi35_moe.py
index 2173740a..d3043b02 100644
--- a/integration-tests/models/test_flash_phi35_moe.py
+++ b/integration-tests/models/test_flash_phi35_moe.py
@@ -25,7 +25,7 @@ async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot):
     assert response.details.generated_tokens == 10
     assert (
         response.generated_text
-        == "Gradient descent is a first-order optimization algorithm"
+        == "Gradient descent is an optimization algorithm commonly used in"
     )
     assert response == response_snapshot
 
@@ -33,7 +33,7 @@
 @pytest.mark.asyncio
 async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot):
     response = await flash_phi35_moe.generate(
-        "What is gradient descent?\n\n",
+        "What is gradient descent?\n",
         max_new_tokens=10,
         repetition_penalty=1.2,
         return_full_text=True,
@@ -51,7 +51,7 @@ async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot):
     assert response.details.generated_tokens == 10
     assert (
         response.generated_text
-        == "What is gradient descent?\n\nHello! It seems you're addressing a"
+        == "What is gradient descent?\nGradient Descent (GD) is an"
     )
     assert response == response_snapshot
 
@@ -66,7 +66,7 @@ async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, response_sna
     assert responses[0].details.generated_tokens == 10
     assert (
         responses[0].generated_text
-        == "Gradient descent is a first-order optimization algorithm"
+        == "Gradient descent is an optimization algorithm commonly used in"
     )
     assert all(
         [r.generated_text == responses[0].generated_text for r in responses]