Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a new popGrapheme function to std.uni #9053

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions changelog/pop-grapheme.dd
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Added popGrapheme function to std.uni.

The new function is a cross between the existing $(REF graphemeStride, std,
uni) and $(REF decodeGrapheme, std, uni) functions. The new function both
supports `@safe pure nothrow @nogc` like `graphemeStride` does as long as you
don't rely on autodecoding (side note: `@nogc` support for `graphemeStride`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This side note sounds like it should be its own PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is. I felt that added @nogc compatibility for an existing function alone wasn't worth a changelog entry, so I didn't add one in that PR. But since I'm writing here that graphemeStride is @nogc anyway, which until very recently wasn't the case, I feel it's useful to mention it. As in: "this note applies only with the latest compiler/Phobos".

added in this release), and works with any non-array ranges just like
`decodeGrapheme` does.

Example:

-------
import std.uni;

// Two Union Jack flag emoji (regional indicators G + B) in each string
string s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
wstring ws = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
dstring ds = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";

// String pop length in code units, not points.
assert(s.popGrapheme() == 8);
assert(ws.popGrapheme() == 4);
assert(ds.popGrapheme() == 2);

assert(s == "\U0001F1EC\U0001F1E7");
assert(ws == "\U0001F1EC\U0001F1E7");
assert(ds == "\U0001F1EC\U0001F1E7");

import std.algorithm.comparison : equal;
import std.algorithm.iteration : filter;

// Also works for non-random access ranges as long as the
// character type is 32-bit.
auto testPiece = "\r\nhello!"d.filter!(x => !x.isAlpha);
// Windows-style line ending is two code points in a single grapheme.
assert(testPiece.popGrapheme() == 2);
assert(testPiece.equal("!"d));
-------
113 changes: 96 additions & 17 deletions std/uni/package.d
Original file line number Diff line number Diff line change
Expand Up @@ -708,8 +708,8 @@ import std.meta : AliasSeq;
import std.range.primitives : back, ElementEncodingType, ElementType, empty,
front, hasLength, hasSlicing, isForwardRange, isInputRange,
isRandomAccessRange, popFront, put, save;
import std.traits : isConvertibleToString, isIntegral, isSomeChar,
isSomeString, Unqual, isDynamicArray;
import std.traits : isAutodecodableString, isConvertibleToString, isIntegral,
isSomeChar, isSomeString, Unqual, isDynamicArray;
// debug = std_uni;

import std.internal.unicode_tables; // generated file
Expand Down Expand Up @@ -7148,17 +7148,25 @@ private immutable TransformRes
TransformRes.goOn
];

template genericDecodeGrapheme(bool getValue)
{
static if (getValue)
enum GraphemeRet { none, step, value }

template genericDecodeGrapheme(GraphemeRet retType)
{ alias Ret = GraphemeRet;

static if (retType == Ret.value)
alias Value = Grapheme;
else
else static if (retType == Ret.step)
alias Value = size_t;
else static if (retType == Ret.none)
alias Value = void;

Value genericDecodeGrapheme(Input)(ref Input range)
{
static if (getValue)
Grapheme grapheme;
static if (retType == Ret.value)
Grapheme result;
else static if (retType == Ret.step)
size_t result = 0;

auto state = GraphemeState.Start;
dchar ch;

Expand All @@ -7173,17 +7181,21 @@ template genericDecodeGrapheme(bool getValue)
with(TransformRes)
{
case goOn:
static if (getValue)
grapheme ~= ch;
static if (retType == Ret.value)
result ~= ch;
else static if (retType == Ret.step)
result++;
range.popFront();
continue;

case redo:
goto rerun;

case retInclude:
static if (getValue)
grapheme ~= ch;
static if (retType == Ret.value)
result ~= ch;
else static if (retType == Ret.step)
result++;
range.popFront();
break outer;

Expand All @@ -7192,8 +7204,8 @@ template genericDecodeGrapheme(bool getValue)
}
}

static if (getValue)
return grapheme;
static if (retType != Ret.none)
return result;
}
}

Expand All @@ -7217,7 +7229,7 @@ if (is(C : dchar))
{
auto src = input[index..$];
auto n = src.length;
genericDecodeGrapheme!(false)(src);
genericDecodeGrapheme!(GraphemeRet.none)(src);
return n - src.length;
}

Expand Down Expand Up @@ -7279,7 +7291,7 @@ if (is(C : dchar))
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
return genericDecodeGrapheme!true(inp);
return genericDecodeGrapheme!(GraphemeRet.value)(inp);
}

@safe unittest
Expand All @@ -7304,6 +7316,73 @@ if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7"));
}

/++
    Pops one full grapheme cluster off the front of `inp`, an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of `dchar`,
    but does not return the cluster itself.

    Params:
        inp = the range to advance; it is taken by `ref` and modified,
              so it must be an L-value

    Returns:
        The number of code units read. This differs from the number of
        code points read only if `inp` is an autodecodable string.
+/
size_t popGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    // True when the consumed amount can be measured from `inp.length` alone.
    enum canMeasure = isAutodecodableString!Input || hasLength!Input;

    static if (canMeasure)
    {
        // Compare the length before and after decoding instead of
        // counting every step inside the decoder.
        immutable lengthBefore = inp.length;
        genericDecodeGrapheme!(GraphemeRet.none)(inp);
        return lengthBefore - inp.length;
    }
    else
    {
        // No length to compare against: let the decoder count each
        // element it pops.
        return genericDecodeGrapheme!(GraphemeRet.step)(inp);
    }
}

///
@safe pure unittest
{
// Each string holds two Union Jack flag emoji. A flag is one grapheme
// made of two code points (regional indicators G + B).
string s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
wstring ws = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
dstring ds = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";

// For strings the returned length is in code units, not code points:
// one flag is 8 UTF-8, 4 UTF-16 or 2 UTF-32 code units.
assert(s.popGrapheme() == 8);
assert(ws.popGrapheme() == 4);
assert(ds.popGrapheme() == 2);

// Exactly one flag was popped, so one flag remains in each string.
assert(s == "\U0001F1EC\U0001F1E7");
assert(ws == "\U0001F1EC\U0001F1E7");
assert(ds == "\U0001F1EC\U0001F1E7");

import std.algorithm.comparison : equal;
import std.algorithm.iteration : filter;

// Also works for non-random access ranges as long as the
// character type is 32-bit.
auto testPiece = "\r\nhello!"d.filter!(x => !x.isAlpha);
// Windows-style line ending is two code points in a single grapheme.
assert(testPiece.popGrapheme() == 2);
// The letters are filtered out, so only the '!' is left.
assert(testPiece.equal("!"d));
}

// Attribute compliance test: popGrapheme should be usable from
// `nothrow @nogc` code when no autodecoding is needed.
@safe pure nothrow @nogc unittest
{
    import std.algorithm.iteration : filter;

    // Random-access case: a plain dstring.
    dstring plain = "abcdef"d;
    immutable poppedFromArray = plain.popGrapheme();
    assert(poppedFromArray == 1);

    // Non-random-access case: a filtered range over the same text.
    auto oddOnly = filter!(c => c % 2)("abcdef"d);
    immutable poppedFromFilter = oddOnly.popGrapheme();
    assert(poppedFromFilter == 1);
}

/++
$(P Iterate a string by $(LREF Grapheme).)
Expand Down Expand Up @@ -7722,7 +7801,7 @@ public:
@property bool valid()() /*const*/
{
auto r = this[];
genericDecodeGrapheme!false(r);
genericDecodeGrapheme!(GraphemeRet.none)(r);
return r.length == 0;
}

Expand Down
Loading