Add Tokenizer type and functions, README
AntonOresten committed Nov 24, 2024
1 parent 00d2b04 commit 521aa9f
Showing 3 changed files with 169 additions and 6 deletions.
72 changes: 70 additions & 2 deletions README.md
@@ -1,3 +1,71 @@
-# HuggingFaceTokenizers
+# HuggingFaceTokenizers.jl

-[![Build Status](https://github.com/AntonOresten/HuggingFaceTokenizers.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/AntonOresten/HuggingFaceTokenizers.jl/actions/workflows/CI.yml?query=branch%3Amain)
+[![Build Status](https://github.com/MurrellGroup/HuggingFaceTokenizers.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/MurrellGroup/HuggingFaceTokenizers.jl/actions/workflows/CI.yml?query=branch%3Amain)

Rudimentary Julia bindings for [🤗 Tokenizers](https://github.com/huggingface/tokenizers), providing fast and easy-to-use tokenization through Python interop.
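
Under the hood, each call simply delegates to the Python `tokenizers` library via [PythonCall](https://github.com/JuliaPy/PythonCall.jl) and converts the results to native Julia types. A rough sketch of that mechanism (not the package API itself):

```julia
using PythonCall

# Roughly what the bindings do internally: import the Python module,
# call it, and convert the outputs with `pyconvert`.
py_tokenizers = pyimport("tokenizers")
py_tok = py_tokenizers.Tokenizer.from_pretrained("bert-base-uncased")
output = py_tok.encode("Hello, how are you?")
tokens = pyconvert(Vector{String}, output.tokens)
ids    = pyconvert(Vector{Int}, output.ids)
```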

## Installation

From the Julia REPL, enter Pkg mode with `]` and add the package using the URL:

```julia
add https://github.com/MurrellGroup/HuggingFaceTokenizers.jl
```
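
Equivalently, using the Pkg API from ordinary Julia code:

```julia
using Pkg
Pkg.add(url="https://github.com/MurrellGroup/HuggingFaceTokenizers.jl")
```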

## Usage

### Loading a Tokenizer

You can load a tokenizer either from a pre-trained model or from a saved file:

```julia
using HuggingFaceTokenizers

# Load a pre-trained tokenizer
tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")

# Or load from a file
tokenizer = from_file(Tokenizer, "path/to/tokenizer.json")
```

### Basic Operations

#### Single Text Processing

```julia
# Encode a single text
text = "Hello, how are you?"
result = encode(tokenizer, text)
println("Tokens: ", result.tokens)
println("IDs: ", result.ids)

# Decode back to text
decoded_text = decode(tokenizer, result.ids)
println("Decoded: ", decoded_text)
```

#### Batch Processing

```julia
# Encode multiple texts at once
texts = ["Hello, how are you?", "I'm doing great!"]
batch_results = encode_batch(tokenizer, texts)

# Each result contains tokens and ids
for (i, result) in enumerate(batch_results)
    println("Text $i:")
    println("  Tokens: ", result.tokens)
    println("  IDs: ", result.ids)
end

# Decode multiple sequences at once
ids_batch = [result.ids for result in batch_results]
decoded_texts = decode_batch(tokenizer, ids_batch)
```

### Saving a Tokenizer

```julia
# Save the tokenizer to a file
save(tokenizer, "path/to/save/tokenizer.json")
```
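
A saved tokenizer can be reloaded with `from_file`; a quick round-trip check (using the same illustrative path as above):

```julia
# Reload the saved tokenizer and confirm encodings round-trip
reloaded = from_file(Tokenizer, "path/to/save/tokenizer.json")
@assert encode(reloaded, "Hello, how are you?").ids == encode(tokenizer, "Hello, how are you?").ids
```
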
16 changes: 12 additions & 4 deletions src/HuggingFaceTokenizers.jl
Expand Up @@ -2,10 +2,18 @@ module HuggingFaceTokenizers

using PythonCall

-const tokenizers = Ref{Py}()
+const tokenizers = PythonCall.pynew()

-function __init__()
-    tokenizers[] = pyimport("tokenizers")
-end
+__init__() = PythonCall.pycopy!(tokenizers, pyimport("tokenizers"))

include("Tokenizer.jl")
export Tokenizer
export from_file
export from_pretrained
export save
export encode
export decode
export encode_batch
export decode_batch

end
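
The `pynew`/`pycopy!` pair is PythonCall's recommended pattern for module-level Python globals: the `Py` handle is created empty at precompile time and only filled in when the module is loaded, because live Python objects cannot be stored in a precompiled image. A minimal, self-contained sketch of the same pattern (the module name and the imported Python module `math` are just illustrative):

```julia
module PyGlobalExample

using PythonCall

# Created empty at precompile time; holds no Python object yet.
const pymath = PythonCall.pynew()

# Filled in at module load time, once a Python interpreter is available.
__init__() = PythonCall.pycopy!(pymath, pyimport("math"))

end # module
```
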
87 changes: 87 additions & 0 deletions src/Tokenizer.jl
@@ -0,0 +1,87 @@
"""
Tokenizer
A wrapper around a Python tokenizer.
"""
struct Tokenizer
py_tokenizer::Py
end

"""
from_file(::Type{Tokenizer}, path::String)
Create a tokenizer from a saved tokenizer file.
```julia
tokenizer = from_file(Tokenizer, "path/to/tokenizer.json")
```
"""
function from_file(::Type{Tokenizer}, path::String)
py_tokenizer = tokenizers.Tokenizer.from_file(path)
return Tokenizer(py_tokenizer)
end

"""
from_pretrained(::Type{Tokenizer}, name::String)
Create a tokenizer from a pretrained tokenizer.
```julia
tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")
```
"""
function from_pretrained(::Type{Tokenizer}, name::String)
py_tokenizer = tokenizers.Tokenizer.from_pretrained(name)
return Tokenizer(py_tokenizer)
end

"""
save(tokenizer::Tokenizer, path::String)
Save the tokenizer to a file.
"""
function save(tokenizer::Tokenizer, path::String)
tokenizer.py_tokenizer.save(path)
return nothing
end

"""
encode(tokenizer::Tokenizer, text::String) -> (tokens::Vector{String}, ids::Vector{Int})
Encode a single text string into tokens and their corresponding IDs.
"""
function encode(tokenizer::Tokenizer, text::String)
output = tokenizer.py_tokenizer.encode(text)
tokens = pyconvert(Vector{String}, output.tokens)
ids = pyconvert(Vector{Int}, output.ids)
return (; tokens, ids)
end

"""
decode(tokenizer::Tokenizer, ids::Vector{Int}) -> String
Decode a sequence of token IDs back into text.
"""
function decode(tokenizer::Tokenizer, ids::Vector{Int})
return pyconvert(String, tokenizer.py_tokenizer.decode(ids))
end

"""
encode_batch(tokenizer::Tokenizer, texts::Vector{String}) -> Vector{Tuple{Vector{String}, Vector{Int}}}
Encode multiple texts in batch.
"""
function encode_batch(tokenizer::Tokenizer, texts::Vector{String})
return map(tokenizer.py_tokenizer.encode_batch(texts)) do output
(; tokens = pyconvert(Vector{String}, output.tokens), ids = pyconvert(Vector{Int}, output.ids))
end
end

"""
decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}}) -> Vector{String}
Decode multiple sequences of token IDs in batch.
"""
function decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}})
pyconvert(Vector{String}, tokenizer.py_tokenizer.decode_batch(batch_ids))
end
