Skip to content

Commit

Permalink
Fix padding issue in Tokenizer: default pad_id to 0 when "pad_token_id" is missing from config
Browse files Browse the repository at this point in the history
  • Loading branch information
NirantK committed Feb 15, 2024
1 parent e79c5a6 commit 59e36cd
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions fastembed/common/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from pathlib import Path

import numpy as np
from tokenizers import Tokenizer, AddedToken
from tokenizers import AddedToken, Tokenizer


def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
Expand Down Expand Up @@ -33,8 +33,8 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:

tokenizer = Tokenizer.from_file(str(tokenizer_path))
tokenizer.enable_truncation(max_length=min(tokenizer_config["model_max_length"], max_length))
tokenizer.enable_padding(pad_id=config["pad_token_id"], pad_token=tokenizer_config["pad_token"])

tokenizer.enable_padding(pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"]) # default pad_id is 0
for token in tokens_map.values():
if isinstance(token, str):
tokenizer.add_special_tokens([token])
Expand Down

0 comments on commit 59e36cd

Please sign in to comment.