-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.py
35 lines (28 loc) · 1.16 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from nltk.tokenize import word_tokenize
import os
class SimpleTokenizer:
    """
    A simple tokenizer class that builds a vocabulary from the given text and encodes/decodes text into indices.

    Reserved indices: 0 -> '<pad>', 1 -> '<unk>'. Vocabulary words occupy
    indices 2 and up, assigned in sorted order so the mapping is
    reproducible across runs.
    """
    def __init__(self, text):
        """Initialize the tokenizer with the initial text to build vocabulary."""
        self.vocab = set()
        self.stoi = {}
        self.itos = {}
        self.build_vocab(text)
    def build_vocab(self, text):
        """Build vocabulary from the given text.

        Rebuilds ``vocab``, ``vocab_size``, ``stoi`` and ``itos`` from
        scratch. Words are sorted before index assignment: iterating a raw
        set is subject to string hash randomization, which would make the
        word->index mapping differ between interpreter runs.
        """
        tokens = word_tokenize(text)
        self.vocab = set(tokens)
        # +2 accounts for the reserved '<pad>' and '<unk>' entries.
        self.vocab_size = len(self.vocab) + 2
        self.stoi = {word: i for i, word in enumerate(sorted(self.vocab), start=2)}
        self.stoi['<pad>'] = 0
        self.stoi['<unk>'] = 1
        self.itos = {i: word for word, i in self.stoi.items()}
    def encode(self, text):
        """Encode the text into a list of indices; out-of-vocabulary words map to '<unk>'."""
        tokens = word_tokenize(text)
        return [self.stoi.get(word, self.stoi['<unk>']) for word in tokens]
    def decode(self, indices):
        """Decode the list of indices back into space-joined text; unknown indices become '<unk>'."""
        return ' '.join([self.itos.get(index, '<unk>') for index in indices])