build-test-token-json.py
#
# This script should be executed inside the test folder
#
# You will need to install the "transformers" library to run this script, with either
# pip3 install transformers
# or
# pip install transformers
#
# This script will generate JSON files containing the tokenized versions
# of the `hello-world.txt` and `test-string.txt` files. Each JSON file will
# contain an array of token IDs, which is used as a reference for other
# tokenizer implementations (e.g. JavaScript).
#
# Get the tokenizer from the transformers library
from transformers import PreTrainedTokenizerFast
import json
theTokenizer = PreTrainedTokenizerFast(tokenizer_file="../20B_tokenizer.json")
# Read the test files
with open("hello-world.txt", "r") as f:
helloWorldStr = f.read()
with open("test-string.txt", "r") as f:
testStringStr = f.read()
# Tokenize and encode the strings
helloWorldTokens = theTokenizer.encode(helloWorldStr)
testStringTokens = theTokenizer.encode(testStringStr)
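# encode() on PreTrainedTokenizerFast returns a plain list of integer token
# IDs, which serializes directly to a JSON array below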
# Write the tokens to a JSON file
with open("hello-world.json", "w") as f:
json.dump(helloWorldTokens, f)
with open("test-string.json", "w") as f:
json.dump(testStringTokens, f)
# Let's build a variant of the testStringStr test, where the file is split
# into lines and each line is tokenized and encoded separately
testStringLines = testStringStr.splitlines()
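# Note: splitlines() drops the newline characters, so the per-line token
# arrays built below will not concatenate back into testStringTokens exactly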
# Tokenize and encode the lines separately, building an array of arrays
testStringLinesTokens = []
for line in testStringLines:
    testStringLinesTokens.append(theTokenizer.encode(line))
# Write the tokens to a JSON file
with open("test-string-lines.json", "w") as f:
json.dump(testStringLinesTokens, f)
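# Optional sanity check (a minimal sketch): decode() should round-trip the
# token IDs back to the source text for a byte-level tokenizer like this one,
# assuming encode() inserts no special tokens for this tokenizer config
assert theTokenizer.decode(helloWorldTokens) == helloWorldStr
assert theTokenizer.decode(testStringTokens) == testStringStr
print("Wrote hello-world.json, test-string.json and test-string-lines.json")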