Commit 521aa9f (1 parent: 00d2b04)

Add Tokenizer type and functions, README

Showing 3 changed files with 169 additions and 6 deletions.
README.md
# HuggingFaceTokenizers.jl

[![Build Status](https://github.com/MurrellGroup/HuggingFaceTokenizers.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/MurrellGroup/HuggingFaceTokenizers.jl/actions/workflows/CI.yml?query=branch%3Amain)
Rudimentary Julia bindings for [🤗 Tokenizers](https://github.com/huggingface/tokenizers), providing fast and easy-to-use tokenization through Python interop.

## Installation

From the Julia REPL, enter Pkg mode with `]` and add the package using the URL:

```julia
add https://github.com/MurrellGroup/HuggingFaceTokenizers.jl
```
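The Python `tokenizers` package is also required. If the package manages its Python dependency through PythonCall.jl and CondaPkg.jl (an assumption; the project configuration is not shown in this commit), it should be resolved automatically on first load. Otherwise, a manual setup sketch:

```julia
# Hypothetical manual step, only needed if the Python dependency
# is not resolved automatically by the package:
using CondaPkg
CondaPkg.add("tokenizers")  # install the Python `tokenizers` package
```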
## Usage

### Loading a Tokenizer

You can load a tokenizer either from a pre-trained model or from a saved file:

```julia
using HuggingFaceTokenizers

# Load a pre-trained tokenizer
tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")

# Or load from a file
tokenizer = from_file(Tokenizer, "path/to/tokenizer.json")
```
### Basic Operations

#### Single Text Processing

```julia
# Encode a single text
text = "Hello, how are you?"
result = encode(tokenizer, text)
println("Tokens: ", result.tokens)
println("IDs: ", result.ids)

# Decode back to text
decoded_text = decode(tokenizer, result.ids)
println("Decoded: ", decoded_text)
```
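For orientation, the output with `bert-base-uncased` looks roughly like the following. This is an illustration only; exact tokens and IDs depend on the tokenizer (BERT's post-processor typically wraps the sequence in `[CLS]`/`[SEP]`, and `decode` skips special tokens by default):

```julia
# Tokens: ["[CLS]", "hello", ",", "how", "are", "you", "?", "[SEP]"]
# IDs:    [101, 7592, 1010, 2129, 2024, 2017, 1029, 102]
# Decoded: hello, how are you?
```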
#### Batch Processing

```julia
# Encode multiple texts at once
texts = ["Hello, how are you?", "I'm doing great!"]
batch_results = encode_batch(tokenizer, texts)

# Each result contains tokens and ids
for (i, result) in enumerate(batch_results)
    println("Text $i:")
    println("  Tokens: ", result.tokens)
    println("  IDs: ", result.ids)
end

# Decode multiple sequences at once
ids_batch = [result.ids for result in batch_results]
decoded_texts = decode_batch(tokenizer, ids_batch)
```
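A quick round-trip check (a sketch; normalization, such as lowercasing for uncased models, means the decoded text need not be byte-identical to the input):

```julia
for (original, decoded) in zip(texts, decoded_texts)
    println(repr(original), " -> ", repr(decoded))
end
```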
### Saving a Tokenizer

```julia
# Save the tokenizer to a file
save(tokenizer, "path/to/save/tokenizer.json")
```
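A saved tokenizer can be loaded back with `from_file`, so a save/load round trip is easy to verify (a sketch using a hypothetical local path):

```julia
save(tokenizer, "tokenizer.json")
reloaded = from_file(Tokenizer, "tokenizer.json")
@assert encode(reloaded, "Hello!").ids == encode(tokenizer, "Hello!").ids
```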
New file: Tokenizer type and functions (87 lines)
""" | ||
Tokenizer | ||
A wrapper around a Python tokenizer. | ||
""" | ||
struct Tokenizer | ||
py_tokenizer::Py | ||
end | ||
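`Py` and the `tokenizers` handle used throughout this file are not defined here; presumably they come from PythonCall.jl and a module-level import elsewhere in the package. A typical PythonCall pattern for such an import would be the following sketch (an assumption, not part of this commit):

```julia
using PythonCall

# Module-level handle to the Python `tokenizers` module, initialized at
# load time so that precompilation does not capture live Python state.
const tokenizers = PythonCall.pynew()

function __init__()
    PythonCall.pycopy!(tokenizers, pyimport("tokenizers"))
end
```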
````julia
"""
    from_file(::Type{Tokenizer}, path::String)

Create a tokenizer from a saved tokenizer file.

```julia
tokenizer = from_file(Tokenizer, "path/to/tokenizer.json")
```
"""
function from_file(::Type{Tokenizer}, path::String)
    py_tokenizer = tokenizers.Tokenizer.from_file(path)
    return Tokenizer(py_tokenizer)
end

"""
    from_pretrained(::Type{Tokenizer}, name::String)

Create a tokenizer from a pretrained tokenizer.

```julia
tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")
```
"""
function from_pretrained(::Type{Tokenizer}, name::String)
    py_tokenizer = tokenizers.Tokenizer.from_pretrained(name)
    return Tokenizer(py_tokenizer)
end

"""
    save(tokenizer::Tokenizer, path::String)

Save the tokenizer to a file.
"""
function save(tokenizer::Tokenizer, path::String)
    tokenizer.py_tokenizer.save(path)
    return nothing
end

"""
    encode(tokenizer::Tokenizer, text::String) -> (tokens::Vector{String}, ids::Vector{Int})

Encode a single text string into a named tuple of tokens and their corresponding IDs.
"""
function encode(tokenizer::Tokenizer, text::String)
    output = tokenizer.py_tokenizer.encode(text)
    tokens = pyconvert(Vector{String}, output.tokens)
    ids = pyconvert(Vector{Int}, output.ids)
    return (; tokens, ids)
end

"""
    decode(tokenizer::Tokenizer, ids::Vector{Int}) -> String

Decode a sequence of token IDs back into text.
"""
function decode(tokenizer::Tokenizer, ids::Vector{Int})
    return pyconvert(String, tokenizer.py_tokenizer.decode(ids))
end

"""
    encode_batch(tokenizer::Tokenizer, texts::Vector{String}) -> Vector{<:NamedTuple}

Encode multiple texts in batch. Each element of the result is a named tuple
with `tokens::Vector{String}` and `ids::Vector{Int}` fields.
"""
function encode_batch(tokenizer::Tokenizer, texts::Vector{String})
    return map(tokenizer.py_tokenizer.encode_batch(texts)) do output
        (; tokens = pyconvert(Vector{String}, output.tokens), ids = pyconvert(Vector{Int}, output.ids))
    end
end

"""
    decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}}) -> Vector{String}

Decode multiple sequences of token IDs in batch.
"""
function decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}})
    return pyconvert(Vector{String}, tokenizer.py_tokenizer.decode_batch(batch_ids))
end
````
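Taken together, a minimal smoke test of this API (a sketch; it assumes these functions are exported by the package, as the README usage suggests):

```julia
using HuggingFaceTokenizers

tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")
enc = encode(tokenizer, "Hello, world!")
@assert decode(tokenizer, enc.ids) isa String

batch = encode_batch(tokenizer, ["Hello!", "Goodbye!"])
@assert decode_batch(tokenizer, [e.ids for e in batch]) isa Vector{String}
```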