fix(server): Use clean_up_tokenization_spaces=False for lossless decoding (#13)

Fixes #12 in the easiest way I could think of.
This commit is contained in:
Nicolas Patry 2023-01-03 11:07:05 +01:00 committed by GitHub
parent 60472f9d2b
commit b94f30215f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 2 additions and 1 deletion

View File

@ -354,7 +354,8 @@ class CausalLM(Model):
if stop:
# Decode all tokens
output_text = self.tokenizer.decode(
all_input_ids.squeeze(-1), skip_special_tokens=True
all_input_ids.squeeze(-1), skip_special_tokens=True,
clean_up_tokenization_spaces=False
)
# Slice with input_length to remove padding
token_ids = all_input_ids[-new_input_length:]