Skip to content

Commit

Permalink
Implement UAX31 character ranges
Browse files Browse the repository at this point in the history
  • Loading branch information
rikkimax committed Feb 5, 2024
1 parent f62c501 commit 615b07a
Show file tree
Hide file tree
Showing 32 changed files with 5,901 additions and 457 deletions.
12 changes: 12 additions & 0 deletions changelog/dmd.identifier-tables.dd
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Identifier tables have been expanded to allow new characters to match C23, along with CLI configurability

You can currently choose between ``c99``, ``c11``, ``UAX31`` (C23's) and ``all`` (the least restrictive set) for both D and ImportC.

This can be done with ``-identifiers=<table>`` and for ImportC ``-identifiers-importc=<table>``.

The default table for D is currently set to ``all``, while ImportC is set to ``c11``.
Previously both D and ImportC used the ``c99`` tables.

D's table will be swapped over to [UAX31](https://unicode.org/reports/tr31/) at a later date; this should be done in 2.117.
If at that time you find yourself using ``c99``-specific characters and are not willing to change them, you may switch back to ``all``.
However, it is unlikely that you will need to.
6 changes: 6 additions & 0 deletions changelog/dmd.importc-unicode.dd
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
ImportC has improved Unicode support

Universal Character Names are now supported, allowing you to use the ``\uXXXX`` and ``\UXXXXXXXX`` syntax where ``X`` is a hex digit as part of an identifier.

DigitalMars sppn does not support anything newer than C99.
It is known to be limited and using any Unicode character not in those ranges will result in an error.
2 changes: 1 addition & 1 deletion compiler/src/build.d
Original file line number Diff line number Diff line change
Expand Up @@ -1576,7 +1576,7 @@ auto sourceFiles()
stringtable.d utf.d
"),
common: fileArray(env["COMMON"], "
bitfields.d file.d int128.d md5.d outbuffer.d smallbuffer.d
bitfields.d file.d int128.d md5.d outbuffer.d smallbuffer.d charactertables.d identifiertables.d
"),
commonHeaders: fileArray(env["COMMON"], "
outbuffer.h
Expand Down
20 changes: 20 additions & 0 deletions compiler/src/dmd/cli.d
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,26 @@ dmd -cov -unittest myprog.d
$(P Note that multiple `-i=...` options are allowed, each one adds a pattern.)}"
),
// CLI help entry for -identifiers=<table>: non-ASCII identifier table for D code.
Option("identifiers=<table>",
"Specify the non-ASCII tables for D identifiers",
`Set the identifier table to use for the non-ASCII values.
$(UL
$(LI $(I UAX31): UAX31)
$(LI $(I c99): C99)
$(LI $(I c11): C11)
$(LI $(I all): All, the least restrictive set, which includes all others (default))
)`
),
// CLI help entry for -identifiers-importc=<table>: non-ASCII identifier table for ImportC code.
Option("identifiers-importc=<table>",
"Specify the non-ASCII tables for ImportC identifiers",
`Set the identifier table to use for the non-ASCII values.
$(UL
$(LI $(I UAX31): UAX31)
$(LI $(I c99): C99)
$(LI $(I c11): C11 (default))
$(LI $(I all): All, the least restrictive set, which includes all others)
)`
),
Option("ignore",
"deprecated flag, unsupported pragmas are always ignored now"
),
Expand Down
267 changes: 267 additions & 0 deletions compiler/src/dmd/common/charactertables.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
/**
* Character tables related to identifiers.
*
* Supports UAX31, C99, C11 and least restrictive (All).
*
* Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
* Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/common/charactertables.d, common/charactertables.d)
* Documentation: https://dlang.org/phobos/dmd_common_charactertables.html
* Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/common/charactertables.d
*/
module dmd.common.charactertables;

@safe nothrow @nogc pure:

extern(C++):

/// Names the set of identifier character ranges that
/// `IdentifierCharLookup.forTable` builds its predicates from.
enum IdentifierTable {
UAX31, /// Ranges `UAX31_Start` / `UAX31_Continue`
C99, /// Ranges `FixedTable_C99_Start` / `FixedTable_C99_Continue`
C11, /// Ranges `FixedTable_C11_Start` / `FixedTable_C11_Continue`
LR, /// Least Restrictive aka All (`LeastRestrictive_Start` / `LeastRestrictive_Continue`)
}

/// Pair of predicates that test a character against the Start and Continue
/// ranges of one identifier table.
struct IdentifierCharLookup
{
@safe nothrow @nogc pure:

/// Tests membership in the selected table's Start ranges.
extern(C++) bool function(dchar) isStart;
/// Tests membership in the selected table's Continue ranges.
extern(C++) bool function(dchar) isContinue;

/**
Lookup the table given the table name.

Params:
    table = which identifier table the returned predicates should consult
Returns:
    a lookup whose `isStart` / `isContinue` check the Start / Continue
    ranges of `table`
*/
static IdentifierCharLookup forTable(IdentifierTable table)
{
import dmd.common.identifiertables;

// Awful solution to require these lambdas.
// However without them the extern(C++) ABI issues crop up for isInRange,
// and then it can't access the tables.
// `final switch` has no default: every IdentifierTable member must be handled here.
final switch(table) {
case IdentifierTable.UAX31:
return IdentifierCharLookup(
(c) => isInRange!UAX31_Start(c),
(c) => isInRange!UAX31_Continue(c));
case IdentifierTable.C99:
return IdentifierCharLookup(
(c) => isInRange!FixedTable_C99_Start(c),
(c) => isInRange!FixedTable_C99_Continue(c));
case IdentifierTable.C11:
return IdentifierCharLookup(
(c) => isInRange!FixedTable_C11_Start(c),
(c) => isInRange!FixedTable_C11_Continue(c));
case IdentifierTable.LR:
return IdentifierCharLookup(
(c) => isInRange!LeastRestrictive_Start(c),
(c) => isInRange!LeastRestrictive_Continue(c));
}
}
}

/**
Convenience function for use in places where we do not care which identifier
table is in effect, nor whether the character is a Start or a Continue
character.

Params:
    c = the character to test
Returns: whether `c` is a member of the `LeastRestrictive_OfAll` ranges.
*/
bool isAnyIdentifierCharacter(dchar c)
{
    import dmd.common.identifiertables;
    return isInRange!LeastRestrictive_OfAll(c);
}

///
unittest
{
    // Exercise the function this example documents (previously this asserted
    // isAnyContinue, leaving isAnyIdentifierCharacter untested).
    assert(isAnyIdentifierCharacter('ğ'));
}

/**
Convenience function for callers that do not care which identifier table is
in effect.

Params:
    c = the character to test
Returns: whether `c` is a member of the least restrictive Start ranges.
*/
bool isAnyStart(dchar c)
{
    import dmd.common.identifiertables;

    const bool inStartRanges = isInRange!LeastRestrictive_Start(c);
    return inStartRanges;
}

///
unittest
{
    const bool accepted = isAnyStart('ğ');
    assert(accepted);
}

/**
Convenience function for callers that do not care which identifier table is
in effect.

Params:
    c = the character to test
Returns: whether `c` is a member of the least restrictive Continue ranges.
*/
bool isAnyContinue(dchar c)
{
    import dmd.common.identifiertables;

    const bool inContinueRanges = isInRange!LeastRestrictive_Continue(c);
    return inContinueRanges;
}

///
unittest
{
    const bool accepted = isAnyContinue('ğ');
    assert(accepted);
}

/// Unicode line separator (code point U+2028)
enum LS = 0x2028;
/// Unicode paragraph separator (code point U+2029)
enum PS = 0x2029;

// Bit flags stored per byte value in `cmtable`; each flag is a distinct bit
// so several can be OR-ed into one ubyte entry.
private
{
// set for '0' .. '7'
enum CMoctal = 0x1;
// set for characters accepted by c_isxdigit
enum CMhex = 0x2;
// set for characters accepted by c_isalnum, and for '_'
enum CMidchar = 0x4;
// set for x X b B and for every character that gets CMdigitsecond
enum CMzerosecond = 0x8;
// set for '0' .. '9', e E f F l L p P u U i . _
enum CMdigitsecond = 0x10;
// set for bytes below 0x80 other than \ \n \r 0 0x1A and '
enum CMsinglechar = 0x20;
}

/// Returns: whether `cmtable` marks `c` with `CMoctal` (set for '0' .. '7').
bool isoctal(const char c)
{
    const ubyte flags = cmtable[c];
    return (flags & CMoctal) != 0;
}

/// Returns: whether `cmtable` marks `c` with `CMhex` (set for hexadecimal digits).
bool ishex(const char c)
{
    const ubyte flags = cmtable[c];
    return (flags & CMhex) != 0;
}

/// Returns: whether `cmtable` marks `c` with `CMidchar`
/// (set for ASCII letters, ASCII digits and '_').
bool isidchar(const char c)
{
    const ubyte flags = cmtable[c];
    return (flags & CMidchar) != 0;
}

/// Returns: whether `cmtable` marks `c` with `CMzerosecond`.
/// NOTE(review): presumably the characters that may follow a leading '0' in a
/// numeric literal - confirm against the lexer.
bool isZeroSecond(const char c)
{
    const ubyte flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}

/// Returns: whether `cmtable` marks `c` with `CMdigitsecond`.
/// NOTE(review): presumably the characters that may follow a leading non-zero
/// digit in a numeric literal - confirm against the lexer.
bool isDigitSecond(const char c)
{
    const ubyte flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}

/// Returns: whether `cmtable` marks `c` with `CMsinglechar` (set for bytes
/// below 0x80 except backslash, newline, carriage return, 0, 0x1A and the
/// single quote).
bool issinglechar(const char c)
{
    const ubyte flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}

/// Returns: whether `c` is an ASCII hexadecimal digit
/// ('0' .. '9', 'a' .. 'f' or 'A' .. 'F').
bool c_isxdigit(const int c)
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}

/// Returns: whether `c` is an ASCII letter or an ASCII decimal digit
/// ('0' .. '9', 'a' .. 'z' or 'A' .. 'Z').
bool c_isalnum(const int c)
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;

        default:
            return false;
    }
}

extern(D) private:

// originally from dmd.root.utf
//
// Tests whether `c` lies inside any of the inclusive [first, last] pairs in
// `Ranges`, where Ranges[i][0] is the first and Ranges[i][1] the last code
// point of pair i.
// NOTE(review): the binary search assumes `Ranges` is non-empty and sorted
// ascending without overlaps - confirm against the generated tables.
bool isInRange(alias Ranges)(dchar c)
{
// Index of the last pair; would wrap around if Ranges were empty.
size_t high = Ranges.length - 1;
// Shortcut search if c is out of range
// (low = high + 1 makes the loop condition false, so we fall through to `return false`)
size_t low = (c < Ranges[0][0] || Ranges[high][1] < c) ? high + 1 : 0;
// Binary search
while (low <= high)
{
// Midpoint written as low + half the distance, which avoids overflowing low + high.
const size_t mid = low + ((high - low) >> 1);
if (c < Ranges[mid][0])
// Cannot wrap at mid == 0: the shortcut above already excluded c < Ranges[0][0].
high = mid - 1;
else if (Ranges[mid][1] < c)
low = mid + 1;
else
{
assert(Ranges[mid][0] <= c && c <= Ranges[mid][1]);
return true;
}
}
return false;
}

/********************************************
* Do our own char maps
*
* 256-entry table indexed by byte value; each entry is the OR of the CM*
* flags that apply to that byte. It is produced by the immediately-invoked
* lambda below and read by isoctal, ishex, isidchar, isZeroSecond,
* isDigitSecond and issinglechar.
*/
// originally from dmd.lexer (was private)
static immutable cmtable = ()
{
ubyte[256] table;
foreach (const c; 0 .. table.length)
{
// The three classifications below are independent, so one byte may get several flags.
if ('0' <= c && c <= '7')
table[c] |= CMoctal;
if (c_isxdigit(c))
table[c] |= CMhex;
if (c_isalnum(c) || c == '_')
table[c] |= CMidchar;

// CMzerosecond / CMdigitsecond classification.
switch (c)
{
// These four get CMzerosecond only.
case 'x': case 'X':
case 'b': case 'B':
table[c] |= CMzerosecond;
break;

// Everything listed here gets both flags.
case '0': .. case '9':
case 'e': case 'E':
case 'f': case 'F':
case 'l': case 'L':
case 'p': case 'P':
case 'u': case 'U':
case 'i':
case '.':
case '_':
table[c] |= CMzerosecond | CMdigitsecond;
break;

default:
break;
}

// CMsinglechar classification: the listed bytes are excluded, as is
// every byte with the high bit (0x80) set.
switch (c)
{
case '\\':
case '\n':
case '\r':
case 0:
case 0x1A:
case '\'':
break;
default:
if (!(c & 0x80))
table[c] |= CMsinglechar;
break;
}
}
return table;
}();
20 changes: 20 additions & 0 deletions compiler/src/dmd/common/charactertables.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
* Character tables related to identifiers.
*
* Supports UAX31, C99, C11 and least restrictive (All).
*
* Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
* Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/common/charactertables.d, common/charactertables.d)
*/

#pragma once

// C++ view of the identifier character lookup whose source is
// common/charactertables.d: two function pointers, declared in the same
// order as in that D struct.
struct IdentifierCharLookup final
{
// Tests whether a character belongs to the selected table's Start ranges.
bool(*isStart)(char32_t);
// Tests whether a character belongs to the selected table's Continue ranges.
bool(*isContinue)(char32_t);

// constructor not provided here.
};
Loading

0 comments on commit 615b07a

Please sign in to comment.