[PIP] Tokenizer wheel build #8

Draft
Wants to merge 10 commits into base: tokenizer-fix-decode
@@ -6,27 +6,37 @@ requires-python = ">=3.8"
authors = [
{ name = "OpenVINO Developers", email = "[email protected]" },
]
classifiers = [
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
]

dependencies = [
"openvino",
"openvino>=2023.1",
"numpy"
]

[project.optional-dependencies]
dev = [
"black",
"ruff",
"pytest",
]
transformers = [
"transformers[sentencepiece,tiktoken]"
]
tiktoken = [
"tiktoken"
"transformers[sentencepiece]"
]
all = [
"ov_tokenizer[dev,transformers,tiktoken]"
"ov_tokenizer[dev,transformers]"
]


[tool.black]
line-length = 119
target-version = ["py38", "py39", "py310", "py311", "py312"]


[tool.ruff]
ignore = ["C901", "E501", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
@@ -38,3 +48,11 @@ line-length = 119

[tool.ruff.isort]
lines-after-imports = 2

[build-system]
requires = [
"setuptools>=42",
"scikit-build~=0.17.0",
"cmake>=3.14"
]
build-backend = "setuptools.build_meta"
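With this [build-system] table in place, building the wheel is driven by setuptools plus scikit-build, which in turn runs CMake. As a rough sketch (not part of this change), the wheel can also be produced programmatically with the `build` package, run from the directory that contains this pyproject.toml; the output directory name below is a placeholder:

# Sketch: build the wheel with pypa/build (equivalent to `python -m build --wheel`).
# Assumes the "build" package is installed; scikit-build and cmake are pulled in
# by the requires list above when an isolated build environment is used.
import build

builder = build.ProjectBuilder(".")  # "." = the project root with pyproject.toml
wheel_path = builder.build("wheel", output_directory="dist")  # placeholder output dir
print(wheel_path)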
15 changes: 15 additions & 0 deletions modules/custom_operations/setup.py
@@ -0,0 +1,15 @@
from skbuild import setup
from skbuild import constants

setup(
    packages=["ov_tokenizer"],
    package_dir={"": "user_ie_extensions/src/tokenizer/python"},
    cmake_install_dir="user_ie_extensions/src/tokenizer/python/ov_tokenizer/libs",
    cmake_args=['-DCUSTOM_OPERATIONS:STRING=tokenizer',
                '-DBUILD_FAST_TOKENIZERS=OFF']
)

# When building extension modules `cmake_install_dir` should always be set to the
# location of the package you are building extension modules for.
# Specifying the installation directory in the CMakeLists subtly breaks the relative
# paths in the helloTargets.cmake file to all of the library components.
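As a quick sanity check of what `cmake_install_dir` achieves here, the snippet below (a sketch, not part of this change; it assumes the wheel built from this setup.py has already been installed) lists the contents of the `ov_tokenizer/libs` package directory, which is where the compiled `user_ov_extensions` library is expected to land:

import os

import ov_tokenizer  # importing the package also registers the extension (see __init__.py below)

# cmake_install_dir points CMake's install step at ov_tokenizer/libs inside the
# package, so the user_ov_extensions shared library should appear in this listing.
libs_dir = os.path.join(os.path.dirname(ov_tokenizer.__file__), "libs")
print(sorted(os.listdir(libs_dir)))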
56 changes: 40 additions & 16 deletions modules/custom_operations/user_ie_extensions/CMakeLists.txt
@@ -1,10 +1,8 @@
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

if(POLICY CMP0079)
cmake_policy(SET CMP0079 NEW)
endif()
cmake_policy(SET CMP0057 NEW)
cmake_policy(SET CMP0079 NEW)

if(POLICY CMP0057)
cmake_policy(SET CMP0057 NEW)
@@ -24,14 +22,17 @@ find_package(OpenCV COMPONENTS core)

set(OP_REQ_TBB "complex_mul" "fft")

set(SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src")
set(HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include")

#
# Select specific operations
#

if(NOT CUSTOM_OPERATIONS)
file(GLOB op_src "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
file(GLOB op_dirs LIST_DIRECTORIES true "${CMAKE_CURRENT_SOURCE_DIR}/*")
list(REMOVE_ITEM op_dirs "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
file(GLOB op_src "${SOURCES}/*.cpp")
file(GLOB op_dirs LIST_DIRECTORIES true "${SOURCES}/*")
list(REMOVE_ITEM op_dirs "${SOURCES}/cmake")

foreach(op IN LISTS op_src)
get_filename_component(op_name ${op} NAME_WE)
@@ -49,30 +50,31 @@ if(NOT CUSTOM_OPERATIONS)
list(REMOVE_ITEM CUSTOM_OPERATIONS ov_extension)
endif()

list(APPEND SRC "${CMAKE_CURRENT_SOURCE_DIR}/ov_extension.cpp")
list(APPEND SRC "${SOURCES}/ov_extension.cpp")

# filter out some operations, requiring specific dependencies

if(NOT OpenCV_FOUND)
list(REMOVE_ITEM SRC "${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp")
list(REMOVE_ITEM SRC "${SOURCES}/fft.cpp")
list(REMOVE_ITEM CUSTOM_OPERATIONS fft)
endif()

if(NOT TBB_FOUND)
foreach(op IN LISTS OP_REQ_TBB)
list(REMOVE_ITEM SRC "${CMAKE_CURRENT_SOURCE_DIR}/${op}.cpp")
list(REMOVE_ITEM SRC "${SOURCES}/${op}.cpp")
list(REMOVE_ITEM CUSTOM_OPERATIONS ${op})
endforeach()
endif()

message(" List of custom operations in ${TARGET_NAME} extension: ")
foreach(op IN LISTS CUSTOM_OPERATIONS)
if(IS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/${op}")
file(GLOB op_src "${CMAKE_CURRENT_SOURCE_DIR}/${op}/*.cpp")
if(IS_DIRECTORY "${SOURCES}/${op}")
file(GLOB op_src "${SOURCES}/${op}/*.cpp")
list(APPEND SRC ${op_src})
elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${op}.cpp")
list(APPEND SRC "${CMAKE_CURRENT_SOURCE_DIR}/${op}.cpp")
elseif(EXISTS "${SOURCES}/${op}.cpp")
list(APPEND SRC "${SOURCES}/${op}.cpp")
else()
message("${SOURCES}/${op}")
message(FATAL_ERROR "${TARGET_NAME} does not have operation with name '${op}'")
endif()

@@ -85,6 +87,12 @@ endforeach()

add_library(${TARGET_NAME} SHARED ${SRC})

set_target_properties(${TARGET_NAME} PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} # .exe and .dll
ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib # .lib and .a
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} # .so and .dylib
)

if(OpenCV_FOUND)
target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCV_LIBRARIES})
endif()
@@ -94,11 +102,27 @@ if(TBB_FOUND)
endif()

# sentence_piece is kept for backward compatibility
if(tokenizer IN_LIST CUSTOM_OPERATIONS)
add_subdirectory(tokenizer)
if("tokenizer" IN_LIST CUSTOM_OPERATIONS)
add_subdirectory(${SOURCES}/tokenizer)
if(extra_dlls)
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${extra_dlls} $<TARGET_FILE_DIR:${TARGET_NAME}>)
install(FILES ${extra_dlls} DESTINATION .)
endif()
endif()

target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)

target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS})
target_include_directories(${TARGET_NAME} PUBLIC ./include/)

if(DEFINED SKBUILD)
# Installing the extension module to the root of the package
install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION .)
if(APPLE)
set_target_properties(
${TARGET_NAME} PROPERTIES INSTALL_RPATH "@loader_path")
else()
set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN")
endif()
endif()
@@ -1,16 +1,12 @@
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

if(POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()

option(BUILD_FAST_TOKENIZERS OFF)

# to build only sentencepiece-static target
set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL ON)

#
# Compile flags
#
@@ -112,10 +108,12 @@ else()
FetchContent_MakeAvailable(fast_tokenizer)
include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake")

set(fast_tokenizer_SOURCE_DIR "${fast_tokenizer_SOURCE_DIR}" PARENT_SCOPE)

if(WIN32 AND X86_64)
# we use the re2 library in the regex_normalization operation, so it has to be added here
# because the prebuilt fast_tokenizers package does not provide this library
list(APPEND FAST_TOKENIZER_LIBS re2)
set (RE2_LIBS re2)
endif()
endif()

@@ -134,13 +132,32 @@ target_include_directories(${TARGET_NAME} PRIVATE
# fast_tokenizer
${FAST_TOKENIZER_INCS})

set_property(DIRECTORY ${sentencepiece_SOURCE_DIR}
PROPERTY EXCLUDE_FROM_ALL ON)

set_property(DIRECTORY ${re2_SOURCE_DIR}
PROPERTY EXCLUDE_FROM_ALL ON)

if(CMAKE_CL_64)
target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS)
endif()

target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS} sentencepiece-static)
target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS} ${RE2_LIBS} sentencepiece-static)

# string_view requires C++17
string(REPLACE " " ";" cxx_flags "${cxx_flags}")
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17
COMPILE_OPTIONS "${cxx_flags}")
#
# Post build steps to copy core_tokenizers dependencies
#

if(WIN32 AND X86_64)
if(BUILD_FAST_TOKENIZERS)
# TODO
else()
set(extra_dlls "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll"
"${fast_tokenizer_SOURCE_DIR}/third_party/lib/icudt70.dll"
"${fast_tokenizer_SOURCE_DIR}/third_party/lib/icuuc70.dll" PARENT_SCOPE)
endif()
endif()
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import sys
import openvino
from openvino.runtime.utils.node_factory import NodeFactory

from .convert_tokenizer import convert_tokenizer
from .node_factory import init_extension
from .str_pack import pack_strings, unpack_strings
from .utils import add_greedy_decoding, connect_models

_ext_name = "user_ov_extensions"
_ext_libs_path = os.path.join(os.path.dirname(__file__), "libs")

if sys.platform == "win32":
    _ext_path = os.path.join(_ext_libs_path, f'{_ext_name}.dll')
    if os.path.isdir(_ext_libs_path):
        # On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH.
        os.add_dll_directory(os.path.abspath(_ext_libs_path))
    else:
        sys.exit(f'Error: extension library path {_ext_libs_path} not found')
elif sys.platform == "darwin":
_ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.dylib')
elif sys.platform == "linux":
_ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.so')
else:
sys.exit(f'Error: extention does not support platform {sys.platform}')

old_core_init = openvino.runtime.Core.__init__
def new_core_init(self, *k, **kw):
    old_core_init(self, *k, **kw)
    self.add_extension(_ext_path)
openvino.runtime.Core.__init__ = new_core_init

_factory = NodeFactory()
_factory.add_extension(_ext_path)
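For context, a minimal usage sketch (not part of this diff): because `Core.__init__` is patched above, importing `ov_tokenizer` registers the extension automatically. The checkpoint name and device below are placeholders, the `transformers` extra is assumed to be installed, and the exact output tensor names depend on the converted tokenizer:

from openvino.runtime import Core
from transformers import AutoTokenizer

from ov_tokenizer import convert_tokenizer, pack_strings

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
ov_model = convert_tokenizer(hf_tokenizer)  # build an OpenVINO tokenizer model

compiled = Core().compile_model(ov_model, "CPU")  # extension already registered by the import
packed = pack_strings(["OpenVINO tokenizer wheel test"])  # pack Python strings into a tensor
print(compiled(packed))  # token ids (and attention mask, if the pipeline produces one)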
@@ -25,7 +25,7 @@
TOKENIZER_DECODER_NAME,
TOKENIZER_ENCODER_NAME,
)
from .node_factory import factory
from . import _factory
from .tokenizer_pipeline import (
BPETokenizationStep,
BytesToCharsStep,
@@ -116,7 +116,7 @@ def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1) -> None:
self.original_tokenizer = tokenizer_object
with TemporaryDirectory() as tmpdir:
tokenizer_object.save_pretrained(tmpdir)
with open(Path(tmpdir) / "tokenizer.json") as tj:
with open(Path(tmpdir) / "tokenizer.json", encoding="utf8") as tj:
self.tokenizer_json = json.load(tj)
self.pipeline = TokenizerPipeline()
self.number_of_inputs = number_of_inputs
@@ -313,7 +313,7 @@ def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTrainedTokenizerBase") -> None:
model_pb = import_protobuf()
model = model_pb.ModelProto()
with open(sp_model_path, "rb") as model_file:
with open(sp_model_path, "rb", encoding="utf8") as model_file:
model.ParseFromString(model_file.read())

add_token_dict = hf_tokenizer.tokenizer.index_special_tokens
@@ -322,7 +322,7 @@ def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTra
new_piece.piece = token
model.pieces.append(new_piece)

with open(sp_model_path, "wb") as model_file:
with open(sp_model_path, "wb", encoding="utf8") as model_file:
model_file.write(model.SerializeToString())


@@ -365,7 +365,7 @@ def convert_sentencepiece_model_tokenizer(
)
add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False

tokenizer_node = factory.create(
tokenizer_node = _factory.create(
"SentencepieceTokenizer",
[sp_model_node, input_node],
{
@@ -383,7 +383,7 @@

default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type)
broadcast = opset.broadcast(default_value, dense_shape)
scatternd_input_ids = factory.create(
scatternd_input_ids = _factory.create(
"ScatterNDUpdate",
[broadcast, indices, values], # FIXME: pad left side instead of right
)
@@ -399,7 +399,7 @@ def convert_sentencepiece_model_tokenizer(
outputs = scatternd_input_ids.outputs()

if add_attention_mask:
attention_mask = factory.create(
attention_mask = _factory.create(
"ScatterNDUpdate",
[
broadcast,
@@ -432,15 +432,15 @@ def convert_sentencepiece_model_tokenizer(
def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Model:
token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence)

decoder = factory.create(
decoder = _factory.create(
"SentencepieceStreamDetokenizer" if streaming_decoder else "SentencepieceDetokenizer",
[sp_model_node, token_ids],
).outputs()

if streaming_decoder:
decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder)

string_output = factory.create("StringTensorPack", decoder).outputs()
string_output = _factory.create("StringTensorPack", decoder).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME)
tokenizer_decoder.validate_nodes_and_infer_types()