Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce Encoding parametric singleton type #9

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 44 additions & 30 deletions src/StringEncodings.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license
# This file is a part of StringEncodings.jl. License is MIT: http://julialang.org/license

module StringEncodings
import Base: close, eof, flush, read, readall, write, show
Expand All @@ -8,6 +8,7 @@ export StringEncoder, StringDecoder, encode, decode, encodings
export StringEncodingError, OutputBufferError, IConvError
export InvalidEncodingError, InvalidSequenceError, IncompleteSequenceError

include("encodings.jl")

abstract StringEncodingError

Expand Down Expand Up @@ -62,7 +63,7 @@ function iconv_close(cd::Ptr{Void})
end
end

function iconv_open(tocode, fromcode)
function iconv_open(tocode::ASCIIString, fromcode::ASCIIString)
p = ccall((:iconv_open, libiconv), Ptr{Void}, (Cstring, Cstring), tocode, fromcode)
if p != Ptr{Void}(-1)
return p
Expand Down Expand Up @@ -173,14 +174,16 @@ end
## StringEncoder

"""
StringEncoder(istream, to, from="UTF-8")
StringEncoder(ostream, to, from=enc"UTF-8")

Returns a new write-only I/O stream, which converts any text in the encoding `from`
written to it into text in the encoding `to` written to `ostream`. Calling `close` on the
stream is necessary to complete the encoding (but does not close `ostream`).

`to` and `from` can be specified either as a string or as an `Encoding` object.
"""
function StringEncoder(ostream::IO, to::ASCIIString, from::ASCIIString="UTF-8")
cd = iconv_open(to, from)
function StringEncoder(ostream::IO, to::Encoding, from::Encoding=enc"UTF-8")
cd = iconv_open(ASCIIString(to), ASCIIString(from))
inbuf = Vector{UInt8}(BUFSIZE)
outbuf = Vector{UInt8}(BUFSIZE)
s = StringEncoder(ostream, cd, inbuf, outbuf,
Expand All @@ -190,6 +193,11 @@ function StringEncoder(ostream::IO, to::ASCIIString, from::ASCIIString="UTF-8")
s
end

# Convenience constructors taking encoding names as strings: each string is
# wrapped in an `Encoding` object and the call is forwarded to the
# `Encoding`-based constructor.
function StringEncoder(ostream::IO, to::AbstractString, from::Encoding=enc"UTF-8")
    target = Encoding(to)
    return StringEncoder(ostream, target, from)
end

function StringEncoder(ostream::IO, to::AbstractString, from::AbstractString)
    target = Encoding(to)
    source = Encoding(from)
    return StringEncoder(ostream, target, source)
end

# Flush input buffer and convert it into output buffer
# Returns the number of bytes written to output buffer
function flush(s::StringEncoder)
Expand Down Expand Up @@ -226,16 +234,18 @@ end
## StringDecoder

"""
StringDecoder(istream, from, to="UTF-8")
StringDecoder(istream, from, to=enc"UTF-8")

Returns a new read-only I/O stream, which converts text in the encoding `from`
read from `istream` into text in the encoding `to`.

`to` and `from` can be specified either as a string or as an `Encoding` object.

Note that some implementations (notably the Windows one) may accept invalid sequences
in the input data without raising an error.
"""
function StringDecoder(istream::IO, from::ASCIIString, to::ASCIIString="UTF-8")
cd = iconv_open(to, from)
function StringDecoder(istream::IO, from::Encoding, to::Encoding=enc"UTF-8")
cd = iconv_open(ASCIIString(to), ASCIIString(from))
inbuf = Vector{UInt8}(BUFSIZE)
outbuf = Vector{UInt8}(BUFSIZE)
s = StringDecoder(istream, cd, inbuf, outbuf,
Expand All @@ -245,6 +255,11 @@ function StringDecoder(istream::IO, from::ASCIIString, to::ASCIIString="UTF-8")
s
end

# Convenience constructors taking encoding names as strings: each string is
# wrapped in an `Encoding` object and the call is forwarded to the
# `Encoding`-based constructor.
function StringDecoder(istream::IO, from::AbstractString, to::Encoding=enc"UTF-8")
    source = Encoding(from)
    return StringDecoder(istream, source, to)
end

function StringDecoder(istream::IO, from::AbstractString, to::AbstractString)
    source = Encoding(from)
    target = Encoding(to)
    return StringDecoder(istream, source, target)
end

# Fill input buffer and convert it into output buffer
# Returns the number of bytes written to output buffer
function fill_buffer!(s::StringDecoder)
Expand Down Expand Up @@ -289,68 +304,67 @@ end
## Convenience I/O functions
if isdefined(Base, :readstring)
@doc """
readstring(stream or filename, enc::ASCIIString)
readstring(stream or filename, enc::Encoding)

Read the entire contents of an I/O stream or a file in encoding `enc` as a string.
""" ->
Base.readstring(s::IO, enc::ASCIIString) = readstring(StringDecoder(s, enc))
Base.readstring(filename::AbstractString, enc::ASCIIString) = open(io->readstring(io, enc), filename)
Base.readstring(s::IO, enc::Encoding) = readstring(StringDecoder(s, enc))
Base.readstring(filename::AbstractString, enc::Encoding) = open(io->readstring(io, enc), filename)
else # Compatibility with Julia 0.4
@doc """
readall(stream or filename, enc::ASCIIString)
readall(stream or filename, enc::Encoding)

Read the entire contents of an I/O stream or a file in encoding `enc` as a string.
""" ->
Base.readall(s::IO, enc::ASCIIString) = readall(StringDecoder(s, enc))
Base.readall(filename::AbstractString, enc::ASCIIString) = open(io->readall(io, enc), filename)
Base.readall(s::IO, enc::Encoding) = readall(StringDecoder(s, enc))
Base.readall(filename::AbstractString, enc::Encoding) = open(io->readall(io, enc), filename)
end


## Functions to encode/decode strings

encoding_string(::Type{ASCIIString}) = "ASCII"
encoding_string(::Type{UTF8String}) = "UTF-8"
encoding_string(::Type{UTF16String}) = (ENDIAN_BOM == 0x04030201) ? "UTF-16LE" : "UTF-16BE"
encoding_string(::Type{UTF32String}) = (ENDIAN_BOM == 0x04030201) ? "UTF-32LE" : "UTF-32BE"

"""
decode([T,] a::Vector{UInt8}, enc::ASCIIString)
decode([T,] a::Vector{UInt8}, enc)

Convert an array of bytes `a` representing text in encoding `enc` to a string of type `T`.
By default, a `UTF8String` is returned.

`enc` can be specified either as a string or as an `Encoding` object.

Note that some implementations (notably the Windows one) may accept invalid sequences
in the input data without raising an error.
"""
function decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::ASCIIString)
function decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::Encoding)
b = IOBuffer(a)
try
T(readbytes(StringDecoder(b, enc, encoding_string(T))))
T(readbytes(StringDecoder(b, enc, encoding(T))))
finally
close(b)
end
end

decode(a::Vector{UInt8}, enc::ASCIIString) = decode(UTF8String, a, enc)
decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::AbstractString) = decode(T, a, Encoding(enc))

decode(a::Vector{UInt8}, enc::AbstractString) = decode(UTF8String, a, Encoding(enc))
decode(a::Vector{UInt8}, enc::Union{AbstractString, Encoding}) = decode(UTF8String, a, enc)

"""
encode(s::AbstractString, enc::ASCIIString)
encode(s::AbstractString, enc)

Convert string `s` to an array of bytes representing text in encoding `enc`.
`enc` can be specified either as a string or as an `Encoding` object.
"""
function encode(s::AbstractString, enc::ASCIIString)
function encode(s::AbstractString, enc::Encoding)
b = IOBuffer()
p = StringEncoder(b, enc, encoding_string(typeof(s)))
p = StringEncoder(b, enc, encoding(typeof(s)))
write(p, s)
close(p)
takebuf_array(b)
end

# Convenience method: wrap the encoding name `enc` in an `Encoding` object and
# forward to the `Encoding`-based `encode` method.
function encode(s::AbstractString, enc::AbstractString)
    target = Encoding(enc)
    return encode(s, target)
end

## Function to list supported encodings
include("encodings.jl")

function test_encoding(enc)
function test_encoding(enc::ASCIIString)
# We assume that an encoding is supported if it's possible to convert from it to UTF-8:
cd = ccall((:iconv_open, libiconv), Ptr{Void}, (Cstring, Cstring), enc, "UTF-8")
if cd == Ptr{Void}(-1)
Expand Down
Loading