From 521aa9f73c286aed8cfc8856f851245c6eeeaa74 Mon Sep 17 00:00:00 2001
From: Anton Oresten
Date: Sun, 24 Nov 2024 01:18:41 +0100
Subject: [PATCH] Add Tokenizer type and functions, README

---
 README.md                    | 72 ++++++++++++++++++++++++++++-
 src/HuggingFaceTokenizers.jl | 16 +++++--
 src/Tokenizer.jl             | 87 ++++++++++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+), 6 deletions(-)
 create mode 100644 src/Tokenizer.jl

diff --git a/README.md b/README.md
index 4520a95..6ad4359 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,71 @@
-# HuggingFaceTokenizers
+# HuggingFaceTokenizers.jl
 
-[![Build Status](https://github.com/AntonOresten/HuggingFaceTokenizers.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/AntonOresten/HuggingFaceTokenizers.jl/actions/workflows/CI.yml?query=branch%3Amain)
+[![Build Status](https://github.com/MurrellGroup/HuggingFaceTokenizers.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/MurrellGroup/HuggingFaceTokenizers.jl/actions/workflows/CI.yml?query=branch%3Amain)
+
+Rudimentary Julia bindings for [🤗 Tokenizers](https://github.com/huggingface/tokenizers), providing fast and easy-to-use tokenization through Python interop.
+
+## Installation
+
+From the Julia REPL, enter Pkg mode with `]` and add the package using the URL:
+
+```julia
+add https://github.com/MurrellGroup/HuggingFaceTokenizers.jl
+```
+
+## Usage
+
+### Loading a Tokenizer
+
+You can load a tokenizer either from a pre-trained model or from a saved file:
+
+```julia
+using HuggingFaceTokenizers
+
+# Load a pre-trained tokenizer
+tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")
+
+# Or load from a file
+tokenizer = from_file(Tokenizer, "path/to/tokenizer.json")
+```
+
+### Basic Operations
+
+#### Single Text Processing
+
+```julia
+# Encode a single text
+text = "Hello, how are you?"
+result = encode(tokenizer, text)
+println("Tokens: ", result.tokens)
+println("IDs: ", result.ids)
+
+# Decode back to text
+decoded_text = decode(tokenizer, result.ids)
+println("Decoded: ", decoded_text)
+```
+
+#### Batch Processing
+
+```julia
+# Encode multiple texts at once
+texts = ["Hello, how are you?", "I'm doing great!"]
+batch_results = encode_batch(tokenizer, texts)
+
+# Each result contains tokens and ids
+for (i, result) in enumerate(batch_results)
+    println("Text $i:")
+    println("  Tokens: ", result.tokens)
+    println("  IDs: ", result.ids)
+end
+
+# Decode multiple sequences at once
+ids_batch = [result.ids for result in batch_results]
+decoded_texts = decode_batch(tokenizer, ids_batch)
+```
+
+### Saving a Tokenizer
+
+```julia
+# Save the tokenizer to a file
+save(tokenizer, "path/to/save/tokenizer.json")
+```
diff --git a/src/HuggingFaceTokenizers.jl b/src/HuggingFaceTokenizers.jl
index 087aab5..6d6d097 100644
--- a/src/HuggingFaceTokenizers.jl
+++ b/src/HuggingFaceTokenizers.jl
@@ -2,10 +2,18 @@ module HuggingFaceTokenizers
 
 using PythonCall
 
-const tokenizers = Ref{Py}()
+const tokenizers = PythonCall.pynew()
 
-function __init__()
-    tokenizers[] = pyimport("tokenizers")
-end
+__init__() = PythonCall.pycopy!(tokenizers, pyimport("tokenizers"))
+
+include("Tokenizer.jl")
+export Tokenizer
+export from_file
+export from_pretrained
+export save
+export encode
+export decode
+export encode_batch
+export decode_batch
 
 end
diff --git a/src/Tokenizer.jl b/src/Tokenizer.jl
new file mode 100644
index 0000000..50cb249
--- /dev/null
+++ b/src/Tokenizer.jl
@@ -0,0 +1,87 @@
+"""
+    Tokenizer
+
+A wrapper around a Python tokenizer.
+""" +struct Tokenizer + py_tokenizer::Py +end + +""" + from_file(::Type{Tokenizer}, path::String) + +Create a tokenizer from a saved tokenizer file. + +```julia +tokenizer = from_file(Tokenizer, "path/to/tokenizer.json") +``` +""" +function from_file(::Type{Tokenizer}, path::String) + py_tokenizer = tokenizers.Tokenizer.from_file(path) + return Tokenizer(py_tokenizer) +end + +""" + from_pretrained(::Type{Tokenizer}, name::String) + +Create a tokenizer from a pretrained tokenizer. + +```julia +tokenizer = from_pretrained(Tokenizer, "bert-base-uncased") +``` +""" +function from_pretrained(::Type{Tokenizer}, name::String) + py_tokenizer = tokenizers.Tokenizer.from_pretrained(name) + return Tokenizer(py_tokenizer) +end + +""" + save(tokenizer::Tokenizer, path::String) + +Save the tokenizer to a file. +""" +function save(tokenizer::Tokenizer, path::String) + tokenizer.py_tokenizer.save(path) + return nothing +end + +""" + encode(tokenizer::Tokenizer, text::String) -> (tokens::Vector{String}, ids::Vector{Int}) + +Encode a single text string into tokens and their corresponding IDs. +""" +function encode(tokenizer::Tokenizer, text::String) + output = tokenizer.py_tokenizer.encode(text) + tokens = pyconvert(Vector{String}, output.tokens) + ids = pyconvert(Vector{Int}, output.ids) + return (; tokens, ids) +end + +""" + decode(tokenizer::Tokenizer, ids::Vector{Int}) -> String + +Decode a sequence of token IDs back into text. +""" +function decode(tokenizer::Tokenizer, ids::Vector{Int}) + return pyconvert(String, tokenizer.py_tokenizer.decode(ids)) +end + +""" + encode_batch(tokenizer::Tokenizer, texts::Vector{String}) -> Vector{Tuple{Vector{String}, Vector{Int}}} + +Encode multiple texts in batch. +""" +function encode_batch(tokenizer::Tokenizer, texts::Vector{String}) + return map(tokenizer.py_tokenizer.encode_batch(texts)) do output + (; tokens = pyconvert(Vector{String}, output.tokens), ids = pyconvert(Vector{Int}, output.ids)) + end +end + +""" + decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}}) -> Vector{String} + +Decode multiple sequences of token IDs in batch. +""" +function decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}}) + pyconvert(Vector{String}, tokenizer.py_tokenizer.decode_batch(batch_ids)) +end