From 81864a2bc5637808ff86a5bc2365a0a5e6766e71 Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Sat, 21 Oct 2023 07:11:32 -0500 Subject: [PATCH] Handle UTF-8 code points in `StdString` (#381) * Improve UTF-8 support for StdString * Common support for StdWString/StdString * Fix invalid empty character literal on Julia 1.6 * Debug Windows StdWString issue * Fix StdWString tests on Windows * Support constructing of malformed chars for tests * Update iterating into invalid index * Support null-terminated constructor via Cstring * Comment on custom iterate method * Add docstring for StdString(::Any, ::Integer) * Add tests for StdString(::String, ::Integer) * Add README entry * fixup! Add README entry * fixup! Add tests for StdString(::String, ::Integer) --- README.md | 4 ++ src/StdLib.jl | 88 +++++++++++++++++++++++++++++++++++----- test/stdlib.jl | 107 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 182 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index e4f6028..b7feacf 100644 --- a/README.md +++ b/README.md @@ -902,6 +902,10 @@ mod.method("getSecondaryWorldVector", [](const World* p)->const std::vector ncodeunits(s) && return nothing + return convert(Char, codeunit(s, i)), nextind(s, i) +end + +# Since the Julia base string iteration is `String` specific we need to implement our own. +# This implementation is based around a functioning `nextind` which allows us to convert the +# UTF-8 codeunits into their big-endian encoding. +function Base.iterate(s::StdString, i::Integer=firstindex(s)) + i > ncodeunits(s) && return nothing + j = isvalid(s, i) ? nextind(s, i) : i + 1 + u = UInt32(codeunit(s, i)) << 24 + (i += 1) < j || @goto ret + u |= UInt32(codeunit(s, i)) << 16 + (i += 1) < j || @goto ret + u |= UInt32(codeunit(s, i)) << 8 + (i += 1) < j || @goto ret + u |= UInt32(codeunit(s, i)) + @label ret + return reinterpret(Char, u), j +end + +function Base.getindex(s::CppBasicString, i::Int) + checkbounds(s, i) + isvalid(s, i) || Base.string_index_err(s, i) + c, i = iterate(s, i) + return c end -Base.getindex(s::CppBasicString, i::Int) = Char(cxxgetindex(s,i)) function StdWString(s::String) char_arr = transcode(Cwchar_t, s) @@ -112,10 +136,52 @@ Base.cmp(a::String, b::CppBasicString) = cmp(a,String(b)) # Make sure functions taking a C++ string as argument can also take a Julia string CxxWrapCore.map_julia_arg_type(x::Type{<:StdString}) = AbstractString -StdString(x::String) = StdString(x,ncodeunits(x)) -StdLib.StdStringAllocated(x::String) = StdString(x,ncodeunits(x)) -Base.cconvert(::Type{CxxWrapCore.ConstCxxRef{StdString}}, x::String) = StdString(x,ncodeunits(x)) -Base.cconvert(::Type{StdLib.StdStringDereferenced}, x::String) = StdString(x,ncodeunits(x)) + +""" + StdString(str::String) + +Create a `StdString` from the contents of the string. Any null-characters ('\\0') will be +included in the string such that `ncodeunits(str) == ncodeunits(StdString(str))`. +""" +StdString(x::String) = StdString(x, ncodeunits(x)) + +""" + StdString(str::Union{Cstring, Base.CodeUnits, Vector{UInt8}, Ref{Int8}, Array{Int8}}) + +Create a `StdString` from the null-terminated character sequence. + +If you want to construct a `StdString` that includes the null-character ('\\0') either use +[`StdString(::String)`](@ref) or [`StdString(::Any, ::Int)`](@ref). + +## Examples + +```julia +julia> StdString(b"visible\\0hidden") +"visible" +``` +""" +StdString(::Union{Cstring, Base.CodeUnits, Vector{UInt8}, Ref{Int8}, Array{Int8}}) + +StdString(x::Cstring) = StdString(convert(Ptr{Int8}, x)) +StdString(x::Base.CodeUnits) = StdString(collect(x)) +StdString(x::Vector{UInt8}) = StdString(collect(reinterpret(Int8, x))) + +""" + StdString(str, n::Integer) + +Create a `StdString` from the first `n` code units of `str` (including null-characters). + +## Examples + +```julia +julia> StdString("visible\\0hidden", 10) +"visible\\0hi" +``` +""" +StdString(::Any, ::Integer) + +Base.cconvert(::Type{CxxWrapCore.ConstCxxRef{StdString}}, x::String) = StdString(x, ncodeunits(x)) +Base.cconvert(::Type{StdLib.StdStringDereferenced}, x::String) = StdString(x, ncodeunits(x)) Base.unsafe_convert(::Type{CxxWrapCore.ConstCxxRef{StdString}}, x::StdString) = ConstCxxRef(x) function StdValArray(v::Vector{T}) where {T} diff --git a/test/stdlib.jl b/test/stdlib.jl index c1a465b..01869e9 100644 --- a/test/stdlib.jl +++ b/test/stdlib.jl @@ -1,6 +1,10 @@ using CxxWrap using Test +# Can use invalid character literals (e.g. '\xa8') as of Julia 1.9: +# https://github.com/JuliaLang/julia/pull/44989 +malformed_char(x) = reinterpret(Char, UInt32(x) << 24) + @testset "$(basename(@__FILE__)[1:end-3])" begin let s = StdString("test") @@ -38,12 +42,103 @@ let s = StdString("foo") @test unsafe_string(CxxWrap.StdLib.c_str(s),2) == "fo" end -let s = "\x01\x00\x02" - @test length(StdString(s)) == 3 - @test length(StdString(s, length(s))) == 3 +let str = "\x01\x00\x02" + std_str = StdString(codeunits(str)) + @test length(std_str) == 1 + @test collect(std_str) == ['\x01'] + @test ncodeunits(std_str) == 1 + @test codeunits(std_str) == b"\x01" + + std_str = StdString(str) + @test length(std_str) == 3 + @test collect(std_str) == ['\x01', '\x00', '\x02'] + @test ncodeunits(std_str) == 3 + @test codeunits(std_str) == b"\x01\x00\x02" + + std_str = StdString(str, 2) + @test length(std_str) == 2 + @test collect(std_str) == ['\x01', '\x00'] + @test ncodeunits(std_str) == 2 + @test codeunits(std_str) == b"\x01\x00" + + std_str = convert(StdString, str) + @test length(std_str) == 3 + @test collect(std_str) == ['\x01', '\x00', '\x02'] + @test ncodeunits(std_str) == 3 + @test codeunits(std_str) == b"\x01\x00\x02" + @test convert(String, std_str) == str +end - @test String(StdString(s)) == s - @test String(StdString(s, length(s))) == s +let str = "α\0β" + std_str = StdString(codeunits(str)) + @test length(std_str) == 1 + @test collect(std_str) == ['α'] + @test ncodeunits(std_str) == 2 + @test codeunits(std_str) == b"α" + + std_str = StdString(str) + @test length(std_str) == 3 + @test collect(std_str) == ['α', '\0', 'β'] + @test ncodeunits(std_str) == 5 + @test codeunits(std_str) == b"α\0β" + + std_str = StdString(str, 4) + @test length(std_str) == 3 + @test collect(std_str) == ['α', '\0', malformed_char(0xce)] + @test ncodeunits(std_str) == 4 + @test codeunits(std_str) == b"α\0\xce" + + std_str = convert(StdString, str) + @test length(std_str) == 3 + @test collect(std_str) == ['α', '\0', 'β'] + @test ncodeunits(std_str) == 5 + @test codeunits(std_str) == b"α\0β" + @test convert(String, std_str) == str +end + +@testset "StdString" begin + @testset "null-terminated constructors" begin + c_str = Cstring(Base.unsafe_convert(Ptr{Cchar}, "visible\0hidden")) + @test StdString(c_str) == "visible" + @test StdString(b"visible\0hidden") == "visible" + @test StdString(UInt8[0xff, 0x00, 0xff]) == "\xff" + end + + @testset "iterate" begin + s = StdString("𨉟") + @test iterate(s) == ('𨉟', 5) + @test iterate(s, firstindex(s)) == ('𨉟', 5) + @test iterate(s, 2) == (malformed_char(0xa8), 3) + @test iterate(s, 3) == (malformed_char(0x89), 4) + @test iterate(s, 4) == (malformed_char(0x9f), 5) + @test iterate(s, 5) === nothing + @test iterate(s, typemax(Int)) === nothing + end + + @testset "getindex" begin + s = StdString("α") + @test getindex(s, firstindex(s)) == 'α' + @test_throws StringIndexError getindex(s, 2) + @test_throws BoundsError getindex(s, 3) + end +end + +@testset "StdWString" begin + @testset "iterate" begin + char = codeunit(StdWString()) == UInt32 ? '😄' : 'α' + s = StdWString(string(char)) + @test iterate(s) == (char, 2) + @test iterate(s, firstindex(s)) == (char, 2) + @test iterate(s, 2) === nothing + @test iterate(s, typemax(Int)) === nothing + end + + @testset "getindex" begin + char = codeunit(StdWString()) == UInt32 ? '😄' : 'α' + s = StdWString(string(char)) + @test getindex(s, firstindex(s)) == char + @test_throws BoundsError getindex(s, 2) + end end stvec = StdVector(Int32[1,2,3]) @@ -112,4 +207,4 @@ let @test length(deque2) == 1 end -end \ No newline at end of file +end