diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c new file mode 100644 index 00000000..bcfed52c --- /dev/null +++ b/cbits/codepoint_offset.c @@ -0,0 +1,138 @@ + +#include +#include +#include +#ifdef __x86_64__ +#include +#include +#endif +#include +#include + +// The following is from FreeBSD's memmem.c +// https://github.com/freebsd/freebsd-src/blob/9921563f43a924d21c7bf43db4a34e724577db95/lib/libc/string/memmem.c + +/*- + * SPDX-License-Identifier: MIT + * + * Copyright (c) 2005-2014 Rich Felker, et al, 2022 Alex Mason. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +static uint8_t * +twobyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) +{ + uint16_t nw = n[0] << 8 | n[1], hw = h[0] << 8 | h[1]; + for (h += 2, hlen -= 2; hlen; hlen--, hw = hw << 8 | *h++) + if (hw == nw) + return (uint8_t *)h - 2; + return hw == nw ? (uint8_t *)h - 2 : 0; +} + +static uint8_t * +threebyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) +{ + uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8; + uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8; + for (h += 3, hlen -= 3; hlen; hlen--, hw = (hw | *h++) << 8) + if (hw == nw) + return (uint8_t *)h - 3; + return hw == nw ? (uint8_t *)h - 3 : 0; +} + +static uint8_t * +fourbyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) +{ + uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8 | n[3]; + uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8 | h[3]; + for (h += 4, hlen -= 4; hlen; hlen--, hw = hw << 8 | *h++) + if (hw == nw) + return (uint8_t *)h - 4; + return hw == nw ? (uint8_t *)h - 4 : 0; +} + +static int _hs_codepoint_to_utf8(uint8_t asUtf8[4], uint32_t codepoint) +{ + + if (codepoint < 0x80) + { + asUtf8[0] = codepoint; + return 1; + } + else if (codepoint < 0x0800) + { + asUtf8[0] = (uint8_t)(((codepoint >> 6) & 0x1F) | 0xC0); + asUtf8[1] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80); + return 2; + } + else if (codepoint < 0x10000) + { + asUtf8[0] = (uint8_t)(((codepoint >> 12) & 0x0F) | 0xE0); + asUtf8[1] = (uint8_t)(((codepoint >> 6) & 0x3F) | 0x80); + asUtf8[2] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80); + return 3; + } + else + { + asUtf8[0] = (uint8_t)(((codepoint >> 18) & 0x07) | 0xF0); + asUtf8[1] = (uint8_t)(((codepoint >> 12) & 0x3F) | 0x80); + asUtf8[2] = (uint8_t)(((codepoint >> 6) & 0x3F) | 0x80); + asUtf8[3] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80); + return 4; + } +} + +size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, const size_t hlen0, const size_t needle) +{ + const uint8_t *haystack = haystack0 + hoffset; + uint8_t *res = NULL; + uint8_t asUtf8[4] = {0}; + const int codepointLen = _hs_codepoint_to_utf8(asUtf8, needle); + + // Skip to first location that could contain the character. + uint8_t *haystackFirst = (uint8_t *)memchr(haystack, asUtf8[0], hlen0); + + if (haystackFirst) + { + const size_t hlen = hlen0 - (haystackFirst - haystack); + + switch (codepointLen) + { + case 1: + res = haystackFirst; + break; + case 2: + res = twobyte_memmem(haystackFirst, hlen, asUtf8); + break; + case 3: + res = threebyte_memmem(haystackFirst, hlen, asUtf8); + break; + case 4: + res = fourbyte_memmem(haystackFirst, hlen, asUtf8); + break; + default: + res = NULL; + break; + } + } + + return res == NULL ? -1 : (size_t)((uint8_t *)res - haystack); +} diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 2ec5980a..16364aff 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -143,6 +143,7 @@ module Data.Text , stripEnd , splitAt , breakOn + , breakOnChar , breakOnEnd , break , span @@ -157,6 +158,7 @@ module Data.Text -- $split , splitOn , split + , splitOnChar , chunksOf -- ** Breaking into lines and words @@ -204,6 +206,7 @@ module Data.Text , unpackCStringAscii# , measureOff + , codepointOffset ) where import Prelude (Char, Bool(..), Int, Maybe(..), String, @@ -262,6 +265,7 @@ import System.IO.Unsafe (unsafePerformIO) -- $setup -- >>> :set -package transformers -- >>> import Control.Monad.Trans.State +-- >>> import Data.Char (isUpper) -- >>> import Data.Text -- >>> import qualified Data.Text as T -- >>> :seti -XOverloadedStrings @@ -411,7 +415,7 @@ instance Data Text where instance TH.Lift Text where #if MIN_VERSION_template_haskell(2,16,0) lift txt = do - let (ptr, len) = unsafePerformIO $ asForeignPtr txt + let (ptr, len) = unsafePerformIO $ asForeignPtr txt let lenInt = P.fromIntegral len TH.appE (TH.appE (TH.varE 'unpackCStringLen#) (TH.litE . TH.bytesPrimL $ TH.mkBytes ptr 0 lenInt)) (TH.lift lenInt) #else @@ -1300,6 +1304,19 @@ measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else foreign import ccall unsafe "_hs_text_measure_off" c_measure_off :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize +-- | /O(n)/ Finds the byte offset of the first occurrence of @c@ in the @Text@, or +-- '-1' if if can't be found. +codepointOffset :: Text -> Char -> Int +codepointOffset !(Text (A.ByteArray arr) off len) c = if len == 0 then -1 else + cSsizeToInt $ unsafeDupablePerformIO $ + c_hs_offset_of_codepoint arr (intToCSize off) (intToCSize len) (intToCSize $ ord c) + +-- | The input buffer (arr :: ByteArray#, off :: CSize, len :: CSize) +-- must specify a valid UTF-8 sequence, and the character must be less +-- than 0x10FFFF, these conditions are not checked. +foreign import ccall unsafe "_hs_offset_of_codepoint" c_hs_offset_of_codepoint + ::ByteArray# -> CSize -> CSize -> CSize -> IO CSsize + -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after -- taking @n@ characters from the end of @t@. -- @@ -1584,7 +1601,7 @@ splitOn :: HasCallStack -> [Text] splitOn pat@(Text _ _ l) src@(Text arr off len) | l <= 0 = emptyError "splitOn" - | isSingleton pat = split (== unsafeHead pat) src + | isSingleton pat = splitOnChar (unsafeHead pat) src | otherwise = go 0 (indices pat src) where go !s (x:xs) = text arr (s+off) (x-s) : go (x+l) xs @@ -1592,19 +1609,22 @@ splitOn pat@(Text _ _ l) src@(Text arr off len) {-# INLINE [1] splitOn #-} {-# RULES -"TEXT splitOn/singleton -> split/==" [~1] forall c t. - splitOn (singleton c) t = split (==c) t +"TEXT splitOn/singleton -> splitOnChar" [~1] forall c t. + splitOn (singleton c) t = splitOnChar c t #-} + -- | /O(n)/ Splits a 'Text' into components delimited by separators, -- where the predicate returns True for a separator element. The -- resulting components do not contain the separators. Two adjacent --- separators result in an empty component in the output. eg. +-- separators result in an empty component in the output. To split +-- on a specific character, use @splitOnChar@. +-- eg. -- --- >>> split (=='a') "aabbaca" --- ["","","bb","c",""] +-- >>> split isUpper "theQuickBrownFox" +-- ["the","uick","rown","ox"] -- --- >>> split (=='a') "" +-- >>> split isUpper "" -- [""] split :: (Char -> Bool) -> Text -> [Text] split _ t@(Text _off _arr 0) = [t] @@ -1614,6 +1634,32 @@ split p t = loop t where (# l, s' #) = span_ (not . p) s {-# INLINE split #-} + +{- TODO Fix: +Rule "TEXT split/eq1 -> splitOnChar/==" may never fire + because rule "Class op ==" for ‘==’ might fire first +Probable fix: add phase [n] or [~n] to the competing rulecompile(-Winline-rule-shadowing) +-} +{-# RULES +"TEXT split/eq1 -> splitOnChar/==" [~2] forall c t. + split (== c) t = splitOnChar c t +"TEXT split/eq1 -> splitOnChar/==" [~2] forall c t. + split (c ==) t = splitOnChar c t + #-} + + +-- | /O(n)/ Splits a 'Text' into components delimited by the given @Char@. +-- The behaviour is the same as @split@ but should be faster than @split (== c)@ +-- +-- >>> split (=='a') "aabbaca" +-- ["","","bb","c",""] +splitOnChar :: Char -> Text -> [Text] +splitOnChar _ t@(Text _off _arr 0) = [t] +splitOnChar c t = loop t + where loop s | null s' = [l] + | otherwise = l : loop (unsafeTail s') + where ( l, s' ) = breakOnChar c s + -- | /O(n)/ Splits a 'Text' into components of length @k@. The last -- element may be shorter than the other chunks, depending on the -- length of the input. Examples: @@ -1737,6 +1783,8 @@ filter p = go -- is the prefix of @haystack@ before @needle@ is matched. The second -- is the remainder of @haystack@, starting with the match. -- +-- To break on a specific character, use @breakOnChar@ +-- -- Examples: -- -- >>> breakOn "::" "a::b::c" @@ -1764,6 +1812,19 @@ breakOn pat src@(Text arr off len) (x:_) -> (text arr off x, text arr (off+x) (len-x)) {-# INLINE breakOn #-} +-- | /O(n)/ Equivalent to @breakOn (== c)@ but should be faster. +-- +-- >>> breakOnChar '/' "foo/bar/" +-- ("foo","/bar/") +-- +-- >>> breakOnChar '/' "foobar" +-- ("foobar","") +breakOnChar :: Char -> Text -> (Text, Text) +breakOnChar c src@(Text arr off len) = case codepointOffset src c of + -1 -> (src, empty) + n -> (text arr off n, text arr (off+n) (len-n) ) + + -- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the -- string. -- diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs index 3faa7bf3..b5d6e665 100644 --- a/tests/Tests/Properties/Text.hs +++ b/tests/Tests/Properties/Text.hs @@ -23,6 +23,7 @@ import qualified Data.Text.Internal.Fusion.Common as S import qualified Data.Text.Internal.Lazy.Fusion as SL import qualified Data.Text.Internal.Lazy.Search as S (indices) import qualified Data.Text.Internal.Search as T (indices) +import qualified Data.Text.Internal as TI (Text(..)) import qualified Data.Text.Lazy as TL import qualified Tests.SlowFunctions as Slow @@ -267,6 +268,28 @@ tl_indices_char_drop n c pref suff = map fromIntegral (S.indices s t) === Slow.i s = TL.singleton c t = TL.drop n $ pref `TL.append` s `TL.append` suff +t_codepointOffset_exists :: T.Text -> Char -> T.Text -> Property +t_codepointOffset_exists tPrefix target tSuffix = + let cleanPrefix@(TI.Text _ _ len) = T.filter (/= target) tPrefix + in T.codepointOffset (T.append cleanPrefix $ T.cons target tSuffix) target === len + +t_codepointOffset_missing :: T.Text -> Char -> Bool +t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t) target == -1 + +t_breakOnChar_exists :: T.Text -> Char -> T.Text -> Bool +t_breakOnChar_exists tPrefix target tSuffix = + let cleanPrefix = T.filter (/= target) tPrefix + (before, after) = T.breakOnChar target (T.append cleanPrefix $ T.cons target tSuffix) + in before == cleanPrefix && after == T.cons target tSuffix + +t_breakOnChar_missing :: T.Text -> Char -> Bool +t_breakOnChar_missing t target = + let filtered = T.filter (/= target) t + in T.breakOnChar target filtered == (filtered,T.empty) + +t_breakOnChar_is_break_eq_char :: T.Text -> Char -> Bool +t_breakOnChar_is_break_eq_char t c = T.breakOnChar c t == T.break (== c) t + -- Make a stream appear shorter than it really is, to ensure that -- functions that consume inaccurately sized streams behave -- themselves. @@ -374,7 +397,12 @@ testText = testProperty "t_find" t_find, testProperty "tl_find" tl_find, testProperty "t_partition" t_partition, - testProperty "tl_partition" tl_partition + testProperty "tl_partition" tl_partition, + testProperty "t_codepointOffset_exists" t_codepointOffset_exists, + testProperty "t_codepointOffset_missing" t_codepointOffset_missing, + testProperty "t_breakOnChar_exists" t_breakOnChar_exists, + testProperty "t_breakOnChar_missing" t_breakOnChar_missing, + testProperty "t_breakOnChar_is_break_eq_char" t_breakOnChar_is_break_eq_char ], testGroup "indexing" [ diff --git a/text.cabal b/text.cabal index f6343609..2f9ba03b 100644 --- a/text.cabal +++ b/text.cabal @@ -82,6 +82,7 @@ library cbits/measure_off.c cbits/reverse.c cbits/utils.c + cbits/codepoint_offset.c hs-source-dirs: src if flag(simdutf)