Skip to content

Commit

Permalink
Handle UTF-8 code points in StdString (#381)
Browse files Browse the repository at this point in the history
* Improve UTF-8 support for StdString

* Common support for StdWString/StdString

* Fix invalid empty character literal on Julia 1.6

* Debug Windows StdWString issue

* Fix StdWString tests on Windows

* Support constructing of malformed chars for tests

* Update iterating into invalid index

* Support null-terminated constructor via Cstring

* Comment on custom iterate method

* Add docstring for StdString(::Any, ::Integer)

* Add tests for StdString(::String, ::Integer)

* Add README entry

* fixup! Add README entry

* fixup! Add tests for StdString(::String, ::Integer)
  • Loading branch information
omus authored Oct 21, 2023
1 parent a1e26aa commit 81864a2
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 17 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,10 @@ mod.method("getSecondaryWorldVector", [](const World* p)->const std::vector<Worl

Linking wrappers using STL support requires adding `JlCxx::cxxwrap_julia_stl` to the `target_link_libraries` command in `CMakeLists.txt`.

### Working with `StdString`

`StdString` implements the Julia string interface and interprets `std::string` data as UTF-8. Since C++ strings do not require a null character to mark the end of a string, the `StdString` constructors usually rely on `ncodeunits` to determine the size of the string. When constructing a `StdString` from a `Cstring`, `Base.CodeUnits`, or `Vector{UInt8}`, the first null character present denotes the end of the string.

## Release procedure

Often, new releases of `CxxWrap` also require a new release of the C++ component `libcxxwrap-julia`, and a rebuild of its JLL package. To make sure everything is tested properly, the following procedure should be followed for each release that requires changing both the Julia and the C++ component:
Expand Down
88 changes: 77 additions & 11 deletions src/StdLib.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,38 @@ Base.ncodeunits(s::CppBasicString)::Int = cppsize(s)
# Code unit type of a `StdString`: always `UInt8`.
function Base.codeunit(s::StdString)
    return UInt8
end

# Code unit type of a `StdWString`: `UInt32` when `Cwchar_t` is `Int32`, `UInt16` otherwise.
function Base.codeunit(s::StdWString)
    if Cwchar_t == Int32
        return UInt32
    else
        return UInt16
    end
end

# Return the `i`-th code unit: the element obtained from `cxxgetindex` is reinterpreted as
# the code unit type of `s`.
function Base.codeunit(s::CppBasicString, i::Integer)
    unit_type = codeunit(s)
    raw = cxxgetindex(s, i)
    return reinterpret(unit_type, raw)
end
Base.isvalid(s::CppBasicString, i::Integer) = (0 < i <= ncodeunits(s))
function Base.iterate(s::CppBasicString, i::Integer=1)
if !isvalid(s,i)
return nothing
end
return(convert(Char,codeunit(s,i)),i+1)
# An index is valid when it is in bounds and `thisind` maps it onto itself, i.e. it is the
# index at which a character starts.
function Base.isvalid(s::CppBasicString, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end

# Index arithmetic is delegated to the generic string helpers in Base.
# NOTE(review): these helpers appear to classify code units using UTF-8 lead/continuation
# byte masks -- confirm that this is also the intended behaviour for `StdWString`.
function Base.thisind(s::CppBasicString, i::Int)
    return Base._thisind_str(s, i)
end

function Base.nextind(s::CppBasicString, i::Int)
    return Base._nextind_str(s, i)
end

# Generic iteration for C++ strings: the code unit at `i` is converted to a `Char` on its
# own and the following index is obtained from `nextind`.
#
# Returns `(char, next_index)`, or `nothing` when `i` lies outside `1:ncodeunits(s)`.
function Base.iterate(s::CppBasicString, i::Integer=firstindex(s))
    # Guard both ends of the index range (as iteration over `String` does). Checking only
    # the upper bound would pass an index below 1 straight on to `codeunit`.
    1 <= i <= ncodeunits(s) || return nothing
    return convert(Char, codeunit(s, i)), nextind(s, i)
end

# Since the Julia base string iteration is `String` specific we need to implement our own.
# This implementation is based around a functioning `nextind` which allows us to convert the
# UTF-8 codeunits into their big-endian encoding.
#
# Returns `(char, next_index)`, or `nothing` when `i` lies outside `1:ncodeunits(s)`.
function Base.iterate(s::StdString, i::Integer=firstindex(s))
    # Guard both ends of the index range (as iteration over `String` does). Checking only
    # the upper bound would pass an index below 1 straight on to `codeunit`.
    1 <= i <= ncodeunits(s) || return nothing
    # At the start of a character consume up to the next character; at any other index
    # consume exactly one code unit.
    j = isvalid(s, i) ? nextind(s, i) : i + 1
    # Pack at most four code units into a `UInt32`, first code unit in the most significant
    # byte, stopping as soon as index `j` is reached.
    u = UInt32(codeunit(s, i)) << 24
    (i += 1) < j || @goto ret
    u |= UInt32(codeunit(s, i)) << 16
    (i += 1) < j || @goto ret
    u |= UInt32(codeunit(s, i)) << 8
    (i += 1) < j || @goto ret
    u |= UInt32(codeunit(s, i))
    @label ret
    return reinterpret(Char, u), j
end

# Return the character that starts at index `i`. An out-of-bounds index is rejected by
# `checkbounds`; an in-bounds index that is not valid is reported through
# `Base.string_index_err`.
function Base.getindex(s::CppBasicString, i::Int)
    checkbounds(s, i)
    if !isvalid(s, i)
        Base.string_index_err(s, i)
    end
    # Only the character is needed; the follow-up index from `iterate` is discarded.
    char, _ = iterate(s, i)
    return char
end
Base.getindex(s::CppBasicString, i::Int) = Char(cxxgetindex(s,i))

function StdWString(s::String)
char_arr = transcode(Cwchar_t, s)
Expand Down Expand Up @@ -112,10 +136,52 @@ Base.cmp(a::String, b::CppBasicString) = cmp(a,String(b))

# Arguments of type `StdString` are mapped to `AbstractString`, so functions taking a C++
# string as argument can also take a Julia string.
function CxxWrapCore.map_julia_arg_type(x::Type{<:StdString})
    return AbstractString
end
StdString(x::String) = StdString(x,ncodeunits(x))
StdLib.StdStringAllocated(x::String) = StdString(x,ncodeunits(x))
Base.cconvert(::Type{CxxWrapCore.ConstCxxRef{StdString}}, x::String) = StdString(x,ncodeunits(x))
Base.cconvert(::Type{StdLib.StdStringDereferenced}, x::String) = StdString(x,ncodeunits(x))

"""
    StdString(str::String)

Create a `StdString` holding the complete contents of `str`. Null-characters ('\\0') are
kept as part of the string, so `ncodeunits(str) == ncodeunits(StdString(str))`.
"""
function StdString(x::String)
    return StdString(x, ncodeunits(x))
end

"""
    StdString(str::Union{Cstring, Base.CodeUnits, Vector{UInt8}, Ref{Int8}, Array{Int8}})

Create a `StdString` from a null-terminated character sequence: the first null-character
('\\0') present marks the end of the string.

To construct a `StdString` that includes the null-character use either
[`StdString(::String)`](@ref) or [`StdString(::Any, ::Integer)`](@ref).

## Examples

```julia
julia> StdString(b"visible\\0hidden")
"visible"
```
"""
StdString(::Union{Cstring, Base.CodeUnits, Vector{UInt8}, Ref{Int8}, Array{Int8}})

# A `Cstring` is handed on as a pointer to `Int8`.
function StdString(x::Cstring)
    return StdString(convert(Ptr{Int8}, x))
end

# Code units are materialised into a `Vector` and forwarded to the matching method.
function StdString(x::Base.CodeUnits)
    return StdString(collect(x))
end

# Bytes are reinterpreted as `Int8` and copied into a fresh array before forwarding.
function StdString(x::Vector{UInt8})
    signed = reinterpret(Int8, x)
    return StdString(collect(signed))
end

"""
    StdString(str, n::Integer)

Create a `StdString` from the first `n` code units of `str` (including null-characters).

## Examples

```julia
julia> StdString("visible\\0hidden", 10)
"visible\\0hi"
```
"""
StdString(::Any, ::Integer)

# A Julia `String` passed for either of these argument types is converted into a `StdString`
# built from all `ncodeunits(x)` code units of the string.
function Base.cconvert(::Type{CxxWrapCore.ConstCxxRef{StdString}}, x::String)
    return StdString(x, ncodeunits(x))
end

function Base.cconvert(::Type{StdLib.StdStringDereferenced}, x::String)
    return StdString(x, ncodeunits(x))
end

# A `StdString` is wrapped in a `ConstCxxRef` for the const-reference argument type.
function Base.unsafe_convert(::Type{CxxWrapCore.ConstCxxRef{StdString}}, x::StdString)
    return ConstCxxRef(x)
end

function StdValArray(v::Vector{T}) where {T}
Expand Down
107 changes: 101 additions & 6 deletions test/stdlib.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
using CxxWrap
using Test

# Build a `Char` whose most significant byte is `x` and whose remaining bytes are zero.
# Needed because invalid character literals (e.g. '\xa8') can only be used as of Julia 1.9:
# https://github.com/JuliaLang/julia/pull/44989
function malformed_char(x)
    bits = UInt32(x) << 24
    return reinterpret(Char, bits)
end

@testset "$(basename(@__FILE__)[1:end-3])" begin

let s = StdString("test")
Expand Down Expand Up @@ -38,12 +42,103 @@ let s = StdString("foo")
@test unsafe_string(CxxWrap.StdLib.c_str(s),2) == "fo"
end

let s = "\x01\x00\x02"
@test length(StdString(s)) == 3
@test length(StdString(s, length(s))) == 3
# Single-byte characters with an embedded null character.
let str = "\x01\x00\x02"
    # Built from code units: the string ends at the first null character.
    terminated = StdString(codeunits(str))
    @test length(terminated) == 1
    @test collect(terminated) == ['\x01']
    @test ncodeunits(terminated) == 1
    @test codeunits(terminated) == b"\x01"

    # Built from the `String`: every code unit is kept.
    complete = StdString(str)
    @test length(complete) == 3
    @test collect(complete) == ['\x01', '\x00', '\x02']
    @test ncodeunits(complete) == 3
    @test codeunits(complete) == b"\x01\x00\x02"

    # Built with an explicit size: only the first two code units are kept.
    prefix = StdString(str, 2)
    @test length(prefix) == 2
    @test collect(prefix) == ['\x01', '\x00']
    @test ncodeunits(prefix) == 2
    @test codeunits(prefix) == b"\x01\x00"

    # `convert` keeps every code unit and round-trips back to the original `String`.
    converted = convert(StdString, str)
    @test length(converted) == 3
    @test collect(converted) == ['\x01', '\x00', '\x02']
    @test ncodeunits(converted) == 3
    @test codeunits(converted) == b"\x01\x00\x02"
    @test convert(String, converted) == str
end

@test String(StdString(s)) == s
@test String(StdString(s, length(s))) == s
# Multi-byte characters with an embedded null character.
let str = "α\0β"
    # Built from code units: the string ends at the first null character.
    terminated = StdString(codeunits(str))
    @test length(terminated) == 1
    @test collect(terminated) == ['α']
    @test ncodeunits(terminated) == 2
    @test codeunits(terminated) == b"α"

    # Built from the `String`: every code unit is kept.
    complete = StdString(str)
    @test length(complete) == 3
    @test collect(complete) == ['α', '\0', 'β']
    @test ncodeunits(complete) == 5
    @test codeunits(complete) == b"α\0β"

    # An explicit size of 4 cuts 'β' in half, leaving its first byte as a malformed char.
    prefix = StdString(str, 4)
    @test length(prefix) == 3
    @test collect(prefix) == ['α', '\0', malformed_char(0xce)]
    @test ncodeunits(prefix) == 4
    @test codeunits(prefix) == b"α\0\xce"

    # `convert` keeps every code unit and round-trips back to the original `String`.
    converted = convert(StdString, str)
    @test length(converted) == 3
    @test collect(converted) == ['α', '\0', 'β']
    @test ncodeunits(converted) == 5
    @test codeunits(converted) == b"α\0β"
    @test convert(String, converted) == str
end

@testset "StdString" begin
    # These constructors stop at the first null character.
    @testset "null-terminated constructors" begin
        c_str = Cstring(Base.unsafe_convert(Ptr{Cchar}, "visible\0hidden"))
        @test StdString(c_str) == "visible"
        @test StdString(b"visible\0hidden") == "visible"
        @test StdString(UInt8[0xff, 0x00, 0xff]) == "\xff"
    end

    @testset "iterate" begin
        glyph = StdString("𨉟")
        # From the first index the whole four-byte character is produced.
        @test iterate(glyph) == ('𨉟', 5)
        @test iterate(glyph, firstindex(glyph)) == ('𨉟', 5)
        # From inside the character a single malformed char per code unit is produced.
        for (idx, byte) in ((2, 0xa8), (3, 0x89), (4, 0x9f))
            @test iterate(glyph, idx) == (malformed_char(byte), idx + 1)
        end
        # Past the end there is nothing left to iterate.
        @test iterate(glyph, 5) === nothing
        @test iterate(glyph, typemax(Int)) === nothing
    end

    @testset "getindex" begin
        alpha = StdString("α")
        @test alpha[firstindex(alpha)] == 'α'
        # Index 2 is in bounds but inside the character; index 3 is out of bounds.
        @test_throws StringIndexError alpha[2]
        @test_throws BoundsError alpha[3]
    end
end

@testset "StdWString" begin
    @testset "iterate" begin
        # Pick the test character according to the code unit type of `StdWString`.
        is_wide32 = codeunit(StdWString()) == UInt32
        expected = is_wide32 ? '😄' : 'α'
        wstr = StdWString(string(expected))
        @test iterate(wstr) == (expected, 2)
        @test iterate(wstr, firstindex(wstr)) == (expected, 2)
        @test iterate(wstr, 2) === nothing
        @test iterate(wstr, typemax(Int)) === nothing
    end

    @testset "getindex" begin
        # Pick the test character according to the code unit type of `StdWString`.
        is_wide32 = codeunit(StdWString()) == UInt32
        expected = is_wide32 ? '😄' : 'α'
        wstr = StdWString(string(expected))
        @test wstr[firstindex(wstr)] == expected
        @test_throws BoundsError wstr[2]
    end
end

stvec = StdVector(Int32[1,2,3])
Expand Down Expand Up @@ -112,4 +207,4 @@ let
@test length(deque2) == 1
end

end
end

0 comments on commit 81864a2

Please sign in to comment.