Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simdutf: simdutf_connector: in_tail: Implement UTF-16LE/UTF-16BE encoder #9468

Open
wants to merge 25 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
534b876
build: lib: Bundle simdutf amalgamation v5.5.0
cosmo0920 Sep 27, 2024
76c203a
simdutf_connector: Add C connector for simdutf library
cosmo0920 Sep 27, 2024
553cd6a
in_tail: Add Unicode encoder support
cosmo0920 Oct 3, 2024
0821f95
in_tail: tests: Add UTF-16LE and UTF-16BE with BOM test cases for uni…
cosmo0920 Oct 4, 2024
61ada39
simdutf_connector: Make buildable on macOS
cosmo0920 Oct 4, 2024
a5d0442
build: Do not install simdutf related header when turning off
cosmo0920 Oct 4, 2024
0fd7481
in_tail: tests: Add testcases for subdivision flags characters
cosmo0920 Oct 7, 2024
e918a14
in_tail: Treat as an error when unsupported encoding is specified
cosmo0920 Oct 7, 2024
522b920
workflows: Use g++ or clang++ for C++ source buildings and linkings
cosmo0920 Oct 7, 2024
e714b50
workflows: Use g++ for C++ sources on system libs task
cosmo0920 Oct 7, 2024
335a73d
build: Add CXX flags for gcov
cosmo0920 Oct 7, 2024
4a02a49
in_tail: Fix indentation style
cosmo0920 Oct 8, 2024
596b935
simdutf_connector: Suppress compiler warnings
cosmo0920 Oct 8, 2024
b270043
in_tail: Accept more variants for specifying UTF-16s
cosmo0920 Oct 9, 2024
7cc521f
simdutf_connector: Handle newlines
cosmo0920 Oct 9, 2024
3aeaf12
in_tail: Convert encodings before splitting lines
cosmo0920 Oct 10, 2024
c15f8b2
in_tail: Align 2-bytes alignments if UTF-16 encodings are enabled
cosmo0920 Oct 10, 2024
02b3587
dockerfiles: centos-7: Disable SIMDUTF module due to gcc-4 series doe…
cosmo0920 Oct 28, 2024
bcc1783
packaging: distros: centos: Disble simdutf stuffs on CentOS 7
cosmo0920 Oct 28, 2024
1503677
packaging: Detect centos/6 or centos/7 to turn off simdutf stuffs
cosmo0920 Oct 28, 2024
a54105d
packaging: Turn off simdutf on centos/7 ARM64bit
cosmo0920 Oct 28, 2024
d0a6267
build: flb_unicode: Add proxy component of calling simdutf stuffs
cosmo0920 Nov 8, 2024
09e95bb
in_tail: tests: Use proxyed function and constants for using simdutf …
cosmo0920 Nov 8, 2024
e809963
in_tail: tests: Follow UTF-8 encoder changes for multi-byte characters
cosmo0920 Dec 13, 2024
a8b0de9
in_tail: tests: Add SIMD enabled case of test cases
cosmo0920 Dec 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions .github/workflows/pr-compile-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@ jobs:
flb_option:
- "-DFLB_PREFER_SYSTEM_LIBS=On"
compiler:
- gcc
- clang
- gcc:
cc: gcc
cxx: g++
- clang:
cc: clang
cxx: clang++
steps:
- name: Setup environment
run: |
Expand All @@ -58,7 +62,7 @@ jobs:
- name: Checkout Fluent Bit code
uses: actions/checkout@v4

- name: ${{ matrix.compiler }} - ${{ matrix.flb_option }}
- name: ${{ matrix.compiler.cc }} & ${{ matrix.compiler.cxx }} - ${{ matrix.flb_option }}
run: |
export nparallel=$(( $(getconf _NPROCESSORS_ONLN) > 8 ? 8 : $(getconf _NPROCESSORS_ONLN) ))
echo "CC = $CC, CXX = $CXX, FLB_OPT = $FLB_OPT"
Expand All @@ -69,8 +73,8 @@ jobs:
make -j $nparallel
working-directory: build
env:
CC: ${{ matrix.compiler }}
CXX: ${{ matrix.compiler }}
CC: ${{ matrix.compiler.cc }}
CXX: ${{ matrix.compiler.cxx }}
FLB_OPT: ${{ matrix.flb_option }}
GLOBAL_OPTS: "-DFLB_JEMALLOC=On -DFLB_SHARED_LIB=Off -DFLB_DEBUG=On -DFLB_ALL=On -DFLB_EXAMPLES=Off"

Expand Down
29 changes: 18 additions & 11 deletions .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,17 @@ jobs:
- "-DFLB_SIMD=On"
- "-DFLB_SIMD=Off"
compiler:
- gcc
- clang
- gcc:
cc: gcc
cxx: g++
- clang:
cc: clang
cxx: clang++
exclude:
- flb_option: "-DFLB_COVERAGE=On"
compiler: clang
compiler:
cc: clang
cxx: clang++
permissions:
contents: read
steps:
Expand All @@ -66,7 +72,7 @@ jobs:
repository: calyptia/fluent-bit-ci
path: ci

- name: ${{ matrix.compiler }} - ${{ matrix.flb_option }}
- name: ${{ matrix.compiler.cc }} & ${{ matrix.compiler.cxx }} - ${{ matrix.flb_option }}
run: |
echo "CC = $CC, CXX = $CXX, FLB_OPT = $FLB_OPT"
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 90
Expand All @@ -75,8 +81,8 @@ jobs:
sudo usermod -a -G systemd-journal $(id -un)
sudo -E su -p $(id -un) -c "PATH=$PATH ci/scripts/run-unit-tests.sh"
env:
CC: ${{ matrix.compiler }}
CXX: ${{ matrix.compiler }}
CC: ${{ matrix.compiler.cc }}
CXX: ${{ matrix.compiler.cxx }}
FLB_OPT: ${{ matrix.flb_option }}

run-macos-unit-tests:
Expand Down Expand Up @@ -130,7 +136,8 @@ jobs:
omit_option: ""
global_option: "-DFLB_BACKTRACE=Off -DFLB_SHARED_LIB=Off -DFLB_DEBUG=On -DFLB_ALL=On -DFLB_EXAMPLES=Off"
unit_test_option: "-DFLB_TESTS_INTERNAL=On"
compiler: gcc
compiler_cc: gcc
compiler_cxx: g++
steps:
- name: Checkout Fluent Bit code
uses: actions/checkout@v4
Expand Down Expand Up @@ -158,15 +165,15 @@ jobs:
export FLB_UNIT_TEST_OPTION="${{ matrix.config.unit_test_option }}"
export FLB_OPT="${FLB_OPTION} ${GLOBAL_OPTION} ${FLB_UNIT_TEST_OPTION} ${FLB_OMIT_OPTION}"

echo "CC = ${{ matrix.config.compiler }}, CXX = ${{ matrix.config.compiler }}, FLB_OPT = $FLB_OPT"
echo "CC = ${{ matrix.config.compiler_cc }}, CXX = ${{ matrix.config.compiler_cxx }}, FLB_OPT = $FLB_OPT"

cmake ${FLB_OPT} ../
make -j $nparallel
ctest -j $nparallel --build-run-dir . --output-on-failure
working-directory: build
env:
CC: ${{ matrix.config.compiler }}
CXX: ${{ matrix.config.compiler }}
CC: ${{ matrix.config.compiler_cc }}
CXX: ${{ matrix.config.compiler_cxx }}

run-qemu-ubuntu-unit-tests:
# We chain this after Linux one as there are CPU time costs for QEMU emulation
Expand Down Expand Up @@ -209,7 +216,7 @@ jobs:
export FLB_UNIT_TEST_OPTION="-DFLB_TESTS_INTERNAL=On"
export FLB_OPT="${FLB_OPTION} ${GLOBAL_OPTION} ${FLB_UNIT_TEST_OPTION} ${FLB_OMIT_OPTION}"
export CC=gcc
export CXX=gcc
export CXX=g++

echo "CC = $CC, CXX = $CXX, FLB_OPT = $FLB_OPT"

Expand Down
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ option(FLB_COVERAGE "Build with code-coverage" No)
option(FLB_JEMALLOC "Build with Jemalloc support" No)
option(FLB_REGEX "Build with Regex support" Yes)
option(FLB_UTF8_ENCODER "Build with UTF8 encoding support" Yes)
option(FLB_UNICODE_ENCODER "Build with Unicode (UTF-16LE, UTF-16BE) encoding support" Yes)
option(FLB_PARSER "Build with Parser support" Yes)
option(FLB_TLS "Build with SSL/TLS support" Yes)
option(FLB_BINARY "Build executable binary" Yes)
Expand Down Expand Up @@ -364,6 +365,9 @@ endif()

# Code-coverage build: disable optimizations and emit gcov instrumentation.
if(FLB_COVERAGE)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 --coverage -fprofile-arcs -ftest-coverage")
  if(FLB_UNICODE_ENCODER)
    # The Unicode encoder pulls in C++ sources (simdutf), so the C++ compiler
    # needs the same instrumentation. Extend the existing C++ flags: seeding
    # from CMAKE_C_FLAGS would drop user-supplied C++ flags, leak C-only flags
    # into C++ compilation and repeat the coverage flags appended just above.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 --coverage -fprofile-arcs -ftest-coverage")
  endif()
  set(CMAKE_BUILD_TYPE "Debug")
endif()

Expand Down Expand Up @@ -506,6 +510,14 @@ if(FLB_UTF8_ENCODER)
add_subdirectory(${FLB_PATH_LIB_TUTF8E} EXCLUDE_FROM_ALL)
endif()

# simdutf
# Optional Unicode (UTF-16LE/UTF-16BE) encoder backed by the bundled simdutf
# sources located at FLB_PATH_LIB_SIMDUTF.
if(FLB_UNICODE_ENCODER)
# The C++ language is enabled here, only when the encoder is requested,
# before the simdutf subdirectory is added.
enable_language(CXX)
set (CMAKE_CXX_STANDARD 11)
add_subdirectory(${FLB_PATH_LIB_SIMDUTF} EXCLUDE_FROM_ALL)
# NOTE(review): presumably registers the FLB_HAVE_UNICODE_ENCODER
# preprocessor definition that flb_unicode.h tests with #ifdef -- confirm
# against the FLB_DEFINITION macro.
FLB_DEFINITION(FLB_HAVE_UNICODE_ENCODER)
endif()

# snappy
add_subdirectory(${FLB_PATH_LIB_SNAPPY} EXCLUDE_FROM_ALL)

Expand Down
1 change: 1 addition & 0 deletions cmake/libraries.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ set(FLB_PATH_LIB_SNAPPY "lib/snappy-fef67ac")
set(FLB_PATH_LIB_RDKAFKA "lib/librdkafka-2.4.0")
set(FLB_PATH_LIB_RING_BUFFER "lib/lwrb")
set(FLB_PATH_LIB_WASM_MICRO_RUNTIME "lib/wasm-micro-runtime-WAMR-1.3.3")
set(FLB_PATH_LIB_SIMDUTF "lib/simdutf-amalgamation-5.5.0")
3 changes: 2 additions & 1 deletion dockerfiles/Dockerfile.centos7
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ RUN cmake3 -DCMAKE_INSTALL_PREFIX=/opt/fluent-bit/ -DCMAKE_INSTALL_SYSCONFDIR=/e
-DFLB_OUT_KAFKA=On \
-DFLB_JEMALLOC=On \
-DFLB_CHUNK_TRACE=On \
-DFLB_OUT_PGSQL=On ../
-DFLB_OUT_PGSQL=On \
-DFLB_UNICODE_ENCODER=Off ../

RUN make -j "$(getconf _NPROCESSORS_ONLN)"
9 changes: 9 additions & 0 deletions include/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ install(FILES ${headers}
COMPONENT headers
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ)

# simdutf
# Install the simdutf connector headers only when the Unicode encoder is
# enabled, so a build with FLB_UNICODE_ENCODER=Off does not ship them.
if(FLB_UNICODE_ENCODER)
# NOTE: this reuses (and therefore overwrites) the `headers` variable that
# the preceding install() call consumed.
file(GLOB headers "fluent-bit/simdutf/*.h")
install(FILES ${headers}
DESTINATION ${FLB_INSTALL_INCLUDEDIR}/fluent-bit/simdutf/
COMPONENT headers
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ)
endif()

install(FILES "../lib/monkey/include/monkey/mk_core.h"
DESTINATION ${FLB_INSTALL_INCLUDEDIR}/monkey/
COMPONENT headers-extra
Expand Down
57 changes: 57 additions & 0 deletions include/fluent-bit/flb_unicode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/* Fluent Bit
* ==========
* Copyright (C) 2015-2024 The Fluent Bit Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef FLB_UNICODE
#define FLB_UNICODE

#include <stddef.h>

#ifdef FLB_HAVE_UNICODE_ENCODER
#include <fluent-bit/simdutf/flb_simdutf_connector.h>

/*
 * Conversion status codes. When the Unicode encoder is compiled in they are
 * plain aliases of the corresponding FLB_SIMDUTF_CONNECTOR_CONVERT_* values.
 */
#define FLB_UNICODE_CONVERT_OK FLB_SIMDUTF_CONNECTOR_CONVERT_OK
#define FLB_UNICODE_CONVERT_NOP FLB_SIMDUTF_CONNECTOR_CONVERT_NOP
#define FLB_UNICODE_CONVERT_UNSUPPORTED FLB_SIMDUTF_CONNECTOR_CONVERT_UNSUPPORTED
#define FLB_UNICODE_CONVERT_ERROR FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR

/*
 * Encoding identifiers exposed through the flb_unicode proxy layer. Every
 * enumerator is defined as the matching FLB_SIMDUTF_ENCODING_TYPE_* value.
 *
 * NOTE(review): the enum tag is spelled "endocing" (sic). It is a public
 * name, so it is left unchanged here; renaming it would break existing users.
 */
enum flb_unicode_endocing_type {
FLB_UNICODE_ENCODING_UTF8 = FLB_SIMDUTF_ENCODING_TYPE_UTF8, /* BOM 0xef 0xbb 0xbf */
FLB_UNICODE_ENCODING_UTF16_LE = FLB_SIMDUTF_ENCODING_TYPE_UTF16_LE, /* BOM 0xff 0xfe */
FLB_UNICODE_ENCODING_UTF16_BE = FLB_SIMDUTF_ENCODING_TYPE_UTF16_BE, /* BOM 0xfe 0xff */
FLB_UNICODE_ENCODING_UTF32_LE = FLB_SIMDUTF_ENCODING_TYPE_UTF32_LE, /* BOM 0xff 0xfe 0x00 0x00 */
FLB_UNICODE_ENCODING_UTF32_BE = FLB_SIMDUTF_ENCODING_TYPE_UTF32_BE, /* BOM 0x00 0x00 0xfe 0xff */
FLB_UNICODE_ENCODING_Latin1 = FLB_SIMDUTF_ENCODING_TYPE_Latin1,

FLB_UNICODE_ENCODING_UNSPECIFIED = FLB_SIMDUTF_ENCODING_TYPE_UNSPECIFIED,
FLB_UNICODE_ENCODING_AUTO = FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO, /* Automatic detection flag */
};

#else

/*
 * Fallback status codes used when the Unicode encoder is not compiled in.
 * All four FLB_UNICODE_CONVERT_* names are defined, with the same numeric
 * values as the FLB_SIMDUTF_CONNECTOR_CONVERT_* codes, so that callers can
 * reference any of them regardless of the build configuration.
 */
#define FLB_UNICODE_CONVERT_OK 0
#define FLB_UNICODE_CONVERT_NOP -1
#define FLB_UNICODE_CONVERT_UNSUPPORTED -2
#define FLB_UNICODE_CONVERT_ERROR -3

#endif

/*
 * Mainly converting from UTF-16LE/BE to UTF-8.
 *
 * flb_unicode_convert() takes `length` bytes starting at `input` and hands
 * the converted buffer and its size back through `output` and `out_size`.
 *
 * NOTE(review): `preferred_encoding` is presumably one of the
 * FLB_UNICODE_ENCODING_* values and the return value one of the
 * FLB_UNICODE_CONVERT_* codes -- confirm against the implementation.
 * NOTE(review): ownership of the buffer stored in *output (who frees it, and
 * with which allocator) is not visible from this header -- confirm.
 */
int flb_unicode_convert(int preferred_encoding, const char *input, size_t length,
char **output, size_t *out_size);
/*
 * flb_unicode_validate() inspects `size` bytes starting at `record`.
 * NOTE(review): which encoding is validated and what the return value means
 * are not visible from this header -- confirm against the implementation.
 */
int flb_unicode_validate(const char *record, size_t size);

#endif
91 changes: 91 additions & 0 deletions include/fluent-bit/simdutf/flb_simdutf_connector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/* Fluent Bit
* ==========
* Copyright (C) 2015-2024 The Fluent Bit Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef FLB_SIMDUTF_CONNECTOR_H
#define FLB_SIMDUTF_CONNECTOR_H

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/*
 * CHAR16_T: the 16-bit code unit type used for UTF-16 buffers.
 *
 * On Apple platforms <uchar.h> is not included (presumably because it is not
 * available there), so the type is spelled out by hand. C11 defines char16_t
 * as the same type as uint_least16_t, hence the unsigned typedef: a signed
 * type would turn code units >= 0x8000 (e.g. the BOM 0xFEFF or surrogates)
 * into negative values when they are compared or promoted.
 */
#ifdef __APPLE__
#include <stdint.h>
#include <stddef.h>
typedef uint_least16_t CHAR16_T;
#else
#include <uchar.h>
typedef char16_t CHAR16_T;
#endif

/*
 * Conversion status codes of the connector: zero for OK, distinct negative
 * values for the other outcomes.
 * NOTE(review): the exact condition under which NOP is reported (presumably
 * "nothing needed converting") is not visible here -- confirm.
 */
#define FLB_SIMDUTF_CONNECTOR_CONVERT_OK 0
#define FLB_SIMDUTF_CONNECTOR_CONVERT_NOP -1
#define FLB_SIMDUTF_CONNECTOR_CONVERT_UNSUPPORTED -2
#define FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR -3

/*
 * Copied from the amalgamated simdutf.h so that the values are usable from C
 * without the C++ namespace. The named encodings are single-bit values;
 * UNSPECIFIED is 0 and UNICODE_AUTO is a separate flag (1 << 10).
 */
enum flb_simdutf_encoding_type {
FLB_SIMDUTF_ENCODING_TYPE_UTF8 = 1, /* BOM 0xef 0xbb 0xbf */
FLB_SIMDUTF_ENCODING_TYPE_UTF16_LE = 2, /* BOM 0xff 0xfe */
FLB_SIMDUTF_ENCODING_TYPE_UTF16_BE = 4, /* BOM 0xfe 0xff */
FLB_SIMDUTF_ENCODING_TYPE_UTF32_LE = 8, /* BOM 0xff 0xfe 0x00 0x00 */
FLB_SIMDUTF_ENCODING_TYPE_UTF32_BE = 16, /* BOM 0x00 0x00 0xfe 0xff */
FLB_SIMDUTF_ENCODING_TYPE_Latin1 = 32,

FLB_SIMDUTF_ENCODING_TYPE_UNSPECIFIED = 0,
FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO = 1 << 10, /* Automatic detection flag */
};

/*
 * Error codes of the connector. SUCCESS equals
 * FLB_SIMDUTF_CONNECTOR_CONVERT_OK (0); the remaining enumerators have no
 * explicit value and therefore number consecutively from 1 in declaration
 * order, so the order must not be changed.
 * NOTE(review): presumably mirrors the order of simdutf's own error_code
 * enum -- confirm against the bundled simdutf.h before editing.
 */
enum flb_simdutf_error_code {
FLB_SIMDUTF_ERROR_CODE_SUCCESS = FLB_SIMDUTF_CONNECTOR_CONVERT_OK,
FLB_SIMDUTF_ERROR_CODE_HEADER_BITS,
FLB_SIMDUTF_ERROR_CODE_TOO_SHORT,
FLB_SIMDUTF_ERROR_CODE_TOO_LONG,
FLB_SIMDUTF_ERROR_CODE_OVERLONG,
FLB_SIMDUTF_ERROR_CODE_TOO_LARGE,
FLB_SIMDUTF_ERROR_CODE_SURROGATE,
FLB_SIMDUTF_ERROR_CODE_INVALID_BASE64_CHARACTER,
FLB_SIMDUTF_ERROR_CODE_BASE64_INPUT_REMAINDER,
FLB_SIMDUTF_ERROR_CODE_OUTPUT_BUFFER_TOO_SMALL,
FLB_SIMDUTF_ERROR_CODE_OTHER,
};

/*
 * C entry points of the simdutf connector. Variants exist for explicit
 * little-endian ("utf16le") and big-endian ("utf16be") input, plus a plain
 * "utf16" variant (presumably native byte order -- confirm).
 */

/*
 * UTF-8 length queries for a UTF-16 buffer.
 * NOTE(review): `len` is size_t while the result is returned as int; confirm
 * the unit of `len` (code units vs. bytes) and that large results cannot
 * overflow the int return value.
 */
int flb_simdutf_connector_utf8_length_from_utf16le(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_utf8_length_from_utf16be(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_utf8_length_from_utf16(const CHAR16_T *buf, size_t len);
/* Validation of UTF-8 / UTF-16 buffers. */
int flb_simdutf_connector_validate_utf8(const char *buf, size_t len);
int flb_simdutf_connector_validate_utf16le(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_validate_utf16be(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_validate_utf16(const CHAR16_T *buf, size_t len);
/*
 * UTF-16 to UTF-8 conversions; the result is handed back through
 * `utf8_output` and `out_size`.
 * NOTE(review): ownership of *utf8_output is not visible here -- confirm who
 * frees it.
 */
int flb_simdutf_connector_convert_utf16le_to_utf8(const CHAR16_T *buf, size_t len,
char **utf8_output, size_t *out_size);
int flb_simdutf_connector_convert_utf16be_to_utf8(const CHAR16_T *buf, size_t len,
char **utf8_output, size_t *out_size);
int flb_simdutf_connector_convert_utf16_to_utf8(const CHAR16_T *buf, size_t len,
char **utf8_output, size_t *out_size);
/* Writes the endianness-changed form of `input` into the caller-provided `output`. */
void flb_simdutf_connector_change_endianness_utf16(const CHAR16_T *input, size_t length, CHAR16_T *output);
/*
 * NOTE(review): the return value is presumably a combination of
 * FLB_SIMDUTF_ENCODING_TYPE_* bits -- confirm.
 */
int flb_simdutf_connector_detect_encodings(const char *input, size_t length);
/*
 * Generic entry point taking a raw byte buffer plus a preferred encoding.
 * NOTE(review): presumably returns one of FLB_SIMDUTF_CONNECTOR_CONVERT_* --
 * confirm.
 */
int flb_simdutf_connector_convert_from_unicode(int preferred_encoding,
const char *input, size_t length,
char **output, size_t *out_size);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif
5 changes: 5 additions & 0 deletions lib/simdutf-amalgamation-5.5.0/AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Wojciech Muła
Daniel Lemire
Nicolas Boyer
John Keiser
Robert Clausecker
5 changes: 5 additions & 0 deletions lib/simdutf-amalgamation-5.5.0/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Bundled simdutf amalgamation: a single C++ translation unit compiled into
# the static library target `simdutf-static`.
set(src src/simdutf/simdutf.cpp)

add_library(simdutf-static
  STATIC
  ${src})
Loading
Loading