diff --git a/packages/math/atoms.lua b/packages/math/atoms.lua index 9aaeef16b..d291d3784 100644 --- a/packages/math/atoms.lua +++ b/packages/math/atoms.lua @@ -14,7 +14,7 @@ local atomType = { over = 8, -- Unused for now (used for overlines etc. in The TeXbook) under = 9, -- Unused for now (used for underlines etc. in The TeXbook) accent = 10, - botaccent = 11, -- Unused for now but botaccent is encoded in our dictionary + botaccent = 11, } return { types = atomType } diff --git a/packages/math/base-elements.lua b/packages/math/base-elements.lua index 331ed7f54..0aa9bb17c 100644 --- a/packages/math/base-elements.lua +++ b/packages/math/base-elements.lua @@ -732,6 +732,18 @@ local function isNotEmpty (element) return element and (element:is_a(elements.terminal) or #element.children > 0) end +local function getAccentMode (mode) + -- Size unchanged but leave display mode + -- See MathML Core §3.4.3 + if mode == mathMode.display then + return mathMode.text + end + if mode == mathMode.displayCramped then + return mathMode.textCramped + end + return mode +end + local function unwrapSingleElementMrow (elt) -- CODE SMELL. -- For \overset or \underset in LaTeX, MathML would use <mover> or <munder>.
@@ -748,10 +760,13 @@ local function unwrapSingleElementMrow (elt) end end -function elements.underOver:_init (base, sub, sup) +function elements.underOver:_init (attributes, base, sub, sup) elements.mbox._init(self) base = unwrapSingleElementMrow(base) self.atom = base.atom + self.attributes = attributes or {} + self.attributes.accent = SU.boolean(self.attributes.accent, false) + self.attributes.accentunder = SU.boolean(self.attributes.accentunder, false) self.base = base self.sub = isNotEmpty(sub) and sub or nil self.sup = isNotEmpty(sup) and sup or nil @@ -771,10 +786,10 @@ function elements.underOver:styleChildren () self.base.mode = self.mode end if self.sub then - self.sub.mode = getSubscriptMode(self.mode) + self.sub.mode = self.attributes.accentunder and getAccentMode(self.mode) or getSubscriptMode(self.mode) end if self.sup then - self.sup.mode = getSuperscriptMode(self.mode) + self.sup.mode = self.attributes.accent and getAccentMode(self.mode) or getSuperscriptMode(self.mode) end end @@ -816,7 +831,10 @@ function elements.underOver:_stretchyReshapeToBase (part) end function elements.underOver:shape () + local constants = self:getMathMetrics().constants + local scaleDown = self:getScaleDown() local isMovableLimits = SU.boolean(self.base and self.base.movablelimits, false) + local itCorr = self:calculateItalicsCorrection() * scaleDown if not (self.mode == mathMode.display or self.mode == mathMode.displayCramped) and isMovableLimits then -- When the base is a movable limit, the under/over scripts are not placed under/over the base, -- but other to the right of it, when display mode is not used. 
@@ -827,32 +845,54 @@ function elements.underOver:shape () elements.subscript.shape(self) return end - local constants = self:getMathMetrics().constants - local scaleDown = self:getScaleDown() -- Determine relative Ys if self.base then self.base.relY = SILE.types.length(0) end if self.sub then self:_stretchyReshapeToBase(self.sub) - self.sub.relY = self.base.depth - + SILE.types.length( - math.max( - (self.sub.height + constants.lowerLimitGapMin * scaleDown):tonumber(), - constants.lowerLimitBaselineDropMin * scaleDown + -- TODO These rules are incomplete and even wrong if we were to fully implement MathML Core. + if self.attributes.accentunder then + self.sub.relY = self.base.depth + + SILE.types.length( + (self.sub.height + constants.lowerLimitGapMin * scaleDown):tonumber() + -- We assume that the accent is aligned on the base. ) - ) + else + self.sub.relY = self.base.depth + + SILE.types.length( + math.max( + (self.sub.height + constants.lowerLimitGapMin * scaleDown):tonumber(), + constants.lowerLimitBaselineDropMin * scaleDown + ) + ) + end end if self.sup then self:_stretchyReshapeToBase(self.sup) - self.sup.relY = 0 - - self.base.height - - SILE.types.length( - math.max( - (constants.upperLimitGapMin * scaleDown + self.sup.depth):tonumber(), - constants.upperLimitBaselineRiseMin * scaleDown + -- TODO These rules are incomplete if we were to fully implement MathML Core. + if self.attributes.accent then + self.sup.relY = 0 - self.base.height + -- MathML Core wants to align on the accentBaseHeight... + local overShift = math.max(0, constants.accentBaseHeight * scaleDown - self.base.height:tonumber()) + self.sup.relY = self.sup.relY - SILE.types.length(overShift) + -- HACK: .... but improperly dimensioned accents can overshoot the base glyph. + -- So we try some guesswork to correct this. + -- Typically some non-combining symbols are in this case... 
+ local heuristics = 0.5 * constants.flattenedAccentBaseHeight + 0.5 * constants.accentBaseHeight + if self.sup.height > SILE.types.length(heuristics * scaleDown) then + self.sup.relY = self.sup.relY + SILE.types.length(constants.accentBaseHeight * scaleDown) + end + else + self.sup.relY = 0 + - self.base.height + - SILE.types.length( + math.max( + (constants.upperLimitGapMin * scaleDown + self.sup.depth):tonumber(), + constants.upperLimitBaselineRiseMin * scaleDown + ) ) - ) + end end -- Determine relative Xs based on widest symbol local widest, a, b @@ -893,7 +933,6 @@ function elements.underOver:shape () if b then b.relX = c - b.width / 2 end - local itCorr = self:calculateItalicsCorrection() * scaleDown if self.sup then self.sup.relX = self.sup.relX + itCorr / 2 end @@ -1201,7 +1240,10 @@ end function elements.text:_vertStretchyReshape (depth, height) local hasStretched = self:_stretchyReshape(depth + height, true) if hasStretched then - -- HACK: see output routine + -- RESCALING HACK: see output routine + -- We only do it if the scaling logic found constructions on the vertical block axis. + -- It's a dirty hack until we properly implement assembly of glyphs in the case we couldn't + -- find a big enough variant. self.vertExpectedSz = height + depth self.vertScalingRatio = (depth + height):tonumber() / (self.height:tonumber() + self.depth:tonumber()) self.height = height @@ -1212,12 +1254,21 @@ end function elements.text:_horizStretchyReshape (width) local hasStretched = self:_stretchyReshape(width, false) - if hasStretched then - -- HACK: see output routine - self.horizScalingRatio = width:tonumber() / self.width:tonumber() - self.width = width - end - return hasStretched + if not hasStretched and width:tonumber() < self.width:tonumber() then + -- Never shrink glyphs, it looks ugly + return false + end + -- But if stretching couldn't be done, it will be ugly anyway, so we will force + -- a re-scaling of the glyph. 
+ -- (So it is slightly different from the vertical case, 'cause MathML just has one stretchy + -- attribute, whether for stretching on the vertical (block) or horizontal (inline) axis, + -- and we cannot know which axis is meant unless we implement yet another mapping table + -- like the one in the MathML Core appendices. Frankly, how many non-normative appendices + -- do we need to implement MathML correctly?) + -- RESCALING HACK: see output routine + self.horizScalingRatio = width:tonumber() / self.width:tonumber() + self.width = width + return true end function elements.text:output (x, y, line) @@ -1356,7 +1407,7 @@ local function newSubscript (spec) end local function newUnderOver (spec) - return elements.underOver(spec.base, spec.sub, spec.sup) + return elements.underOver(spec.attributes, spec.base, spec.sub, spec.sup) end -- TODO replace with penlight equivalent diff --git a/packages/math/init.lua b/packages/math/init.lua index 2fc52250e..0b77dd64a 100644 --- a/packages/math/init.lua +++ b/packages/math/init.lua @@ -427,7 +427,7 @@ The \code{counter} or the direct value \code{number} is passed as a parameter to \paragraph{Missing features} This package still lacks support for some mathematical constructs, but hopefully we’ll get there. -Among unsupported constructs are: decorating symbols with so-called accents, such as arrows or hats, “over” or “under” braces, and line breaking inside a formula. +Among unsupported features, we can mention line breaking inside a formula.
\font:remove-fallback \end{document} diff --git a/packages/math/texlike.lua b/packages/math/texlike.lua index f81d68146..2df4ba64e 100644 --- a/packages/math/texlike.lua +++ b/packages/math/texlike.lua @@ -396,9 +396,24 @@ local function isOperatorKind (tree, typeOfAtom) return false end -local function isMoveableLimits (tree) +local function isMoveableLimitsOrAlwaysStacked (tree) + if not tree then + return false -- safeguard + end + if tree.is_always_stacked then + -- We use an internal flag to mark commands that are always stacking + -- their sup/sub arguments, such as brace-like commands. + return true + end if tree.command ~= "mo" then - return false + -- On the recursion: + -- MathML allows movablelimits on <mo> elements, but "embellished operators" + -- can be other elements inheriting the property from their "core operator", + -- see MathML Core §3.2.4.1, which is full of intricacies so we are probably + -- not even doing the right thing here. + -- On the hack: + -- See variant commands for limits further down. + return SU.boolean(tree.is_hacked_movablelimits, false) or isMoveableLimitsOrAlwaysStacked(tree[1]) end if tree.options and SU.boolean(tree.options.movablelimits, false) then return true @@ -430,6 +445,9 @@ end local function isAccentSymbol (symbol) return operatorDict[symbol] and operatorDict[symbol].atom == atoms.types.accent end +local function isBottomAccentSymbol (symbol) + return operatorDict[symbol] and operatorDict[symbol].atom == atoms.types.botaccent +end local function compileToMathML_aux (_, arg_env, tree) if type(tree) == "string" then @@ -565,14 +583,15 @@ local function compileToMathML_aux (_, arg_env, tree) end tree.options = {} -- Translate TeX-like sub/superscripts to `munderover` or `msubsup`, - -- depending on whether the base is an operator with moveable limits. - elseif tree.id == "sup" and isMoveableLimits(tree[1]) then + -- depending on whether the base is an operator with moveable limits, + -- or a brace-like command.
+ elseif tree.id == "sup" and isMoveableLimitsOrAlwaysStacked(tree[1]) then tree.command = "mover" - elseif tree.id == "sub" and isMoveableLimits(tree[1]) then + elseif tree.id == "sub" and isMoveableLimitsOrAlwaysStacked(tree[1]) then tree.command = "munder" - elseif tree.id == "subsup" and isMoveableLimits(tree[1]) then + elseif tree.id == "subsup" and isMoveableLimitsOrAlwaysStacked(tree[1]) then tree.command = "munderover" - elseif tree.id == "supsub" and isMoveableLimits(tree[1]) then + elseif tree.id == "supsub" and isMoveableLimitsOrAlwaysStacked(tree[1]) then tree.command = "munderover" local tmp = tree[2] tree[2] = tree[3] @@ -638,7 +657,7 @@ local function compileToMathML_aux (_, arg_env, tree) elseif tree.id == "command" and symbols[tree.command] then local atom = { id = "atom", [1] = symbols[tree.command] } if isAccentSymbol(symbols[tree.command]) and #tree > 0 then - -- LaTeX-style accents \vec{v} = <mover accent="true"><mi>v</mi><mo>→</mo></mover> + -- LaTeX-style accents \overrightarrow{v} = <mover accent="true"><mi>v</mi><mo>→</mo></mover> local accent = { id = "command", command = "mover", @@ -649,6 +668,18 @@ local function compileToMathML_aux (_, arg_env, tree) accent[1] = compileToMathML_aux(nil, arg_env, tree[1]) accent[2] = compileToMathML_aux(nil, arg_env, atom) tree = accent + elseif isBottomAccentSymbol(symbols[tree.command]) and #tree > 0 then + -- LaTeX-style bottom accents \underleftarrow{v} = <munder accentunder="true"><mi>v</mi><mo>←</mo></munder> + local accent = { + id = "command", + command = "munder", + options = { + accentunder = "true", + }, + } + accent[1] = compileToMathML_aux(nil, arg_env, tree[1]) + accent[2] = compileToMathML_aux(nil, arg_env, atom) + tree = accent elseif #tree > 0 then -- Play cool with LaTeX-style commands that don't take arguments: -- Edge case for non-accent symbols so we don't loose bracketed groups @@ -728,6 +759,80 @@ registerCommand("mn", { [1] = objType.str }, function (x) return x end) +-- Register a limit-like variant command +-- Variants of superior, inferior, projective and injective limits are special: +-- They accept a sub/sup behaving as a
movablelimits, but also have a symbol +-- on top of the limit symbol, which is not a movablelimits. +-- I can't see in the MathML specification how to do this properly: MathML Core +-- seems to only allow movablelimits on <mo> elements, and <mover>/<munder> may +-- inherit that property from their "core operator", but in this case we do not +-- want the accent to be movable, only the limit sup/sub. +-- So we use a hack, and also avoid "\def" here to prevent unwanted mrows. +-- @tparam string name TeX command name +-- @tparam string command MathML command (mover or munder) +-- @tparam number symbol Unicode codepoint for the accent symbol +-- @tparam string text Text representation +local function registerVarLimits (name, command, symbol, text) + registerCommand(name, {}, function () + local options = command == "mover" and { accent = "true" } or { accentunder = "true" } + return { + command = command, + is_hacked_movablelimits = true, -- Internal flag to mark this as a hack + options = options, + { + command = "mo", + options = { atom = "op", movablelimits = false }, + text, + }, + { + command = "mo", + options = { accentunder = "true" }, -- NOTE(review): set even when command is "mover"; confirm this is intended + luautf8.char(symbol), + }, + } + end) +end +registerVarLimits("varlimsup", "mover", 0x203E, "lim") -- U+203E OVERLINE +registerVarLimits("varliminf", "munder", 0x203E, "lim") -- U+203E OVERLINE +registerVarLimits("varprojlim", "munder", 0x2190, "lim") -- U+2190 LEFTWARDS ARROW +registerVarLimits("varinjlim", "munder", 0x2192, "lim") -- U+2192 RIGHTWARDS ARROW + +-- Register brace-like commands. +-- Those symbols are accents per-se in MathML, and are non-combining in Unicode. +-- But TeX treats them as "pseudo-accent" stretchy symbols. +-- Moreover, they accept a sub/sup which is always stacked, and not movable. +-- So we use an internal flag. +-- We also avoid "\def" here to prevent unwanted mrows resulting from the +-- compilation of the argument.
+-- @tparam string name TeX command name +-- @tparam string command MathML command (mover or munder) +-- @tparam number symbol Unicode codepoint for the brace symbol +local function registerBraceLikeCommands (name, command, symbol) + registerCommand(name, { + [1] = objType.tree, + }, function (tree) + local options = command == "mover" and { accent = "true" } or { accentunder = "true" } + return { + command = command, + is_always_stacked = true, -- Internal flag to mark this as a brace-like command + options = options, + tree[1], + { + command = "mo", + options = { stretchy = "true" }, + luautf8.char(symbol), + }, + } + end) +end +-- Note: the following overrides the default commands from xml-entities / unicode-math. +registerBraceLikeCommands("overbrace", "mover", 0x23DE) -- U+23DE TOP CURLY BRACKET +registerBraceLikeCommands("underbrace", "munder", 0x23DF) -- U+23DF BOTTOM CURLY BRACKET +registerBraceLikeCommands("overparen", "mover", 0x23DC) -- U+23DC TOP PARENTHESIS +registerBraceLikeCommands("underparen", "munder", 0x23DD) -- U+23DD BOTTOM PARENTHESIS +registerBraceLikeCommands("overbracket", "mover", 0x23B4) -- U+23B4 TOP SQUARE BRACKET +registerBraceLikeCommands("underbracket", "munder", 0x23B5) -- U+23B5 BOTTOM SQUARE BRACKET + compileToMathML( nil, {}, @@ -737,7 +842,6 @@ compileToMathML( \def{sqrt}{\msqrt{#1}} \def{bi}{\mi[mathvariant=bold-italic]{#1}} \def{dsi}{\mi[mathvariant=double-struck]{#1}} - \def{vec}{\mover[accent=true]{#1}{\rightarrow}} % From amsmath: \def{to}{\mo[atom=bin]{→}} diff --git a/packages/math/typesetter.lua b/packages/math/typesetter.lua index 097e91b16..339f2d4f2 100644 --- a/packages/math/typesetter.lua +++ b/packages/math/typesetter.lua @@ -3,6 +3,7 @@ local lpeg = require("lpeg") local atoms = require("packages.math.atoms") local b = require("packages.math.base-elements") local syms = require("packages.math.unicode-symbols") +local accents = require("packages.math.unicode-accents") local mathvariants =
require("packages.math.unicode-mathvariants") local mathVariantToScriptType, scriptType = mathvariants.mathVariantToScriptType, mathvariants.scriptType @@ -134,6 +135,12 @@ function ConvertMathML (_, content) or scriptType.upright local text = content[1] local attributes = {} + if text and luautf8.len(text) == 1 then + -- Re-encode single combining character as non-combining when feasible. + -- HACK: This is for "accents", but it's not what MathML Core expects. + -- See the comments in packages/math/unicode-accents.lua. + text = accents.makeNonCombining(text) + end -- Attributes from the (default) operator table if syms.operatorDict[text] then attributes.atom = syms.operatorDict[text].atom @@ -230,21 +237,25 @@ function ConvertMathML (_, content) elseif content.command == "munder" then local children = convertChildren(content) if #children ~= 2 then SU.error("Wrong number of children in munder") end - return b.newUnderOver({ base = children[1], sub = children[2] }) + local elt = b.newUnderOver({ attributes = content.options, base = children[1], sub = children[2] }) + elt.movablelimits = content.is_hacked_movablelimits + return elt elseif content.command == "mover" then local children = convertChildren(content) if #children ~= 2 then SU.error("Wrong number of children in mover") end - return b.newUnderOver({ base = children[1], sup = children[2] }) + local elt = b.newUnderOver({ attributes = content.options, base = children[1], sup = children[2] }) + elt.movablelimits = content.is_hacked_movablelimits + return elt elseif content.command == "munderover" then local children = convertChildren(content) if #children ~= 3 then SU.error("Wrong number of children in munderover") end - return b.newUnderOver({ attributes = content.options, base = children[1], sub = children[2], sup = children[3] }) + return b.newUnderOver({ attributes = content.options, base = children[1], sub = children[2], sup = children[3] }) elseif content.command == "mfrac" then local children =
convertChildren(content) if #children ~= 2 then diff --git a/packages/math/unicode-accents.lua b/packages/math/unicode-accents.lua new file mode 100644 index 000000000..879b4d7e4 --- /dev/null +++ b/packages/math/unicode-accents.lua @@ -0,0 +1,107 @@ +-- IMPORTANT: +-- Normally, if we were to take MathML seriously, we would have to use the Unicode combining characters +-- for accents, using reverse mapping tables. +-- So our current implementation here is not fully compliant, but the whole thing is a hornet's nest. + +-- Combining character check by Unicode block +-- @tparam number codepoint A Unicode codepoint +-- @treturn boolean true if the codepoint is a combining character, false otherwise +local isCombining = function (codepoint) + return + -- Combining Diacritical Marks (0300–036F), since version 1.0, with modifications in subsequent versions down to 4.1 + (codepoint >= 0x0300 and codepoint <= 0x036F) + -- Combining Diacritical Marks Extended (1AB0–1AFF), version 7.0 + or (codepoint >= 0x1AB0 and codepoint <= 0x1AFF) + -- Combining Diacritical Marks Supplement (1DC0–1DFF), versions 4.1 to 5.2 + or (codepoint >= 0x1DC0 and codepoint <= 0x1DFF) + -- Combining Diacritical Marks for Symbols (20D0–20FF), since version 1.0, with modifications in subsequent versions down to 5.1 + or (codepoint >= 0x20D0 and codepoint <= 0x20FF) + -- Cyrillic Extended-A (2DE0–2DFF), version 5.1 + or (codepoint >= 0x2DE0 and codepoint <= 0x2DFF) + -- Combining Half Marks (FE20–FE2F), versions 1.0, with modifications in subsequent versions down to 8.0 + or (codepoint >= 0xFE20 and codepoint <= 0xFE2F) + or false +end + +-- MathML Core non-normative B.3 (https://www.w3.org/TR/mathml-core/#comb-noncomb) +-- W3C Working Draft 27 November 2023, and MathML Core Editor's Draft 26 November 2024 as well: +-- The table does not seem complete, see ADDED comments below so that we can map TeX +-- accent and botaccent atoms to non-combining characters...
+-- For the ADDED stuff, see report https://github.com/w3c/mathml-core/issues/137#issuecomment-2508344714 +-- See also https://github.com/w3c/mathml/issues/247 on a related issue. +-- Grumpy note: All these standards put together are defective by design. +local nonCombining = { + -- Combining Diacritical Marks (0300–036F) + [0x0300] = 0x0060, -- combining grave accent (above) > grave accent + [0x0301] = 0x00B4, -- combining acute accent (above) > acute accent + [0x0302] = 0x02C6, -- combining circumflex accent (above) > modifier letter circumflex accent + [0x0303] = 0x007E, -- combining tilde (above) > tilde + [0x0304] = 0x00AF, -- combining macron (above) > macron + [0x0305] = 0x203E, -- combining overline (above) > overline + [0x0306] = 0x02D8, -- combining breve (above) > breve + [0x0307] = 0x02D9, -- combining dot (above) > dot above + [0x0308] = 0x00A8, -- combining diaeresis (above) > diaeresis + [0x030A] = 0x02DA, -- combining ring above > ring above (ADDED) + [0x030B] = 0x02DD, -- combining double acute accent (above) > double acute accent + [0x030C] = 0x02C7, -- combining caron (above) > caron + -- [0x0311] (accent in unicode-math) combining inverted breve (above) has no safe non-combining equivalent + [0x0312] = 0x00B8, -- combining turned comma (above) > cedilla + [0x0316] = 0x0060, -- combining grave accent (below) > grave accent + [0x0317] = 0x00B4, -- combining acute accent (below) > acute accent + [0x031F] = 0x002B, -- combining plus sign (below) > plus sign + [0x0320] = 0x002D, -- combining minus sign (below) > minus sign + [0x0323] = 0x002E, -- combining dot (below) > full stop + [0x0324] = 0x00A8, -- combining diaeresis (below) > diaeresis + [0x0327] = 0x00B8, -- combining cedilla (below) > cedilla + [0x0328] = 0x02DB, -- combining ogonek (below) > ogonek + [0x032C] = 0x02C7, -- combining caron (below) > caron + [0x032D] = 0x005E, -- combining circumflex accent (below) > circumflex accent + [0x032E] = 0x02D8, -- combining breve (below) > breve + -- [0x032F] (botaccent in unicode-math)
combining inverted breve (below) has no safe non-combining equivalent + [0x0330] = 0x007E, -- combining tilde (below) > tilde + [0x0331] = 0x00AF, -- combining macron (below) > macron (ADDED) + [0x0332] = 0x203E, -- combining low line (below) > overline + [0x0333] = 0x2017, -- combining double low line (below) > double low line (ADDED) + [0x0338] = 0x002F, -- combining long solidus overlay (over) > solidus + -- [0x033A] (botaccent in unicode-math) combining inverted bridge below has no safe non-combining equivalent + -- [0x033F] (accent in unicode-math) combining double overline has no safe non-combining equivalent + -- [0x0346] (accent in unicode-math) combining bridge above has no safe non-combining equivalent + [0x034D] = 0x2194, -- combining left-right arrow (below) > left right arrow (ADDED) + -- Combining Diacritical Marks for Symbols (20D0–20FF) + [0x20D0] = 0x21BC, -- combining left harpoon (above) > leftwards harpoon with barb up (ADDED) + [0x20D1] = 0x21C0, -- combining right harpoon (above) > rightwards harpoon with barb up (ADDED) + -- [0x20D4] (accent in unicode-math) combining anticlockwise arrow above has no safe non-combining equivalent + -- [0x20D5] (accent in unicode-math) combining clockwise arrow above has no safe non-combining equivalent + [0x20D6] = 0x2190, -- combining left arrow (above) > left arrow [or U+27F5 long leftwards arrow?] (ADDED) + [0x20D7] = 0x2192, -- combining right arrow (above) > right arrow [or U+27F6 long rightwards arrow?]
+ [0x20DB] = 0x22EF, -- combining three dots (above) > midline horizontal ellipsis (ADDED, LIKELY IMPERFECT) + -- [0x20DC] (accent in unicode-math) combining four dots above has no safe non-combining equivalent + [0x20E1] = 0x2194, -- combining left right arrow above > left right arrow (ADDED) + -- [0x20E7] (botaccent in unicode-math) combining annuity symbol has no safe non-combining equivalent + [0x20E8] = 0x22EF, -- combining triple underdot (below) > midline horizontal ellipsis (ADDED, LIKELY IMPERFECT) + -- [0x20E9] (botaccent in unicode-math) combining wide bridge above has no safe non-combining equivalent + [0x20EC] = 0x21C1, -- combining rightwards harpoon with barb downwards > rightwards harpoon with barb downwards (ADDED) + [0x20ED] = 0x21BD, -- combining leftwards harpoon with barb downwards > leftwards harpoon with barb downwards (ADDED) + [0x20EE] = 0x2190, -- combining left arrow (below) > left arrow [or U+27F5 long leftwards arrow?] (ADDED) + [0x20EF] = 0x2192, -- combining right arrow (below) > right arrow [or U+27F6 long rightwards arrow?]
+} + +-- Make a non-combining equivalent of a combining character +-- @tparam string char A single-character string (only its first codepoint is examined) +-- @treturn string The non-combining equivalent; the input unchanged if it is not combining or has no known equivalent (a warning is emitted in the latter case) +local function makeNonCombining (char) + local codepoint = luautf8.codepoint(char, 1) + if isCombining(codepoint) then + local noncombining = nonCombining[codepoint] + if noncombining then + return luautf8.char(noncombining) + end + SU.warn(("No non-combining equivalent for codepoint 0x%x"):format(codepoint)) + end + return char +end + +return { + isCombining = isCombining, + makeNonCombining = makeNonCombining, +} diff --git a/packages/math/unicode-symbols.lua b/packages/math/unicode-symbols.lua index b123be38e..bc1f85960 100644 --- a/packages/math/unicode-symbols.lua +++ b/packages/math/unicode-symbols.lua @@ -2,6 +2,7 @@ local mathml_entities = require("packages.math.mathml-entities") local symbols = mathml_entities.symbols local operatorDict = mathml_entities.operatorDict +local atoms = require("packages.math.atoms") --- Add aliases for symbols that have multiple names. -- We check that the alias is already defined in the generated dictionary, @@ -64,6 +65,21 @@ addAlias("circlearrowleft", "acwcirclearrow") addAlias("circlearrowright", "cwcirclearrow") addAlias("blacklozenge", "mdlgblklozenge") +-- Additional aliases for "accents" +addAlias("overline", "overbar") +addAlias("underline", "mathunderbar") +addAlias("underbar", "mathunderbar") +addAlias("overrightharpoon", "rightharpoonaccent") +addAlias("overleftharpoon", "leftharpoonaccent") +-- Caveat emptor: +-- For some of them, TeX would consider one to be stretchy, the other not... +-- It's completely insane to still have to deal with this in the 21st century, +-- so let's not bother and just make them aliases to at least get something working for now.
+addAlias("utilde", "wideutilde") +addAlias("widecheck", "check") +addAlias("widehat", "hat") +addAlias("widetilde", "tilde") + -- (Original-TeX) TeX-like greek letters symbols.alpha = "α" symbols.beta = "β" @@ -128,6 +144,27 @@ symbols.Digamma = "Ϝ" -- Supported by LaTeX's unicode-math -- In our TeX-like syntax, they should however lead to the same symbol. operatorDict["-"] = operatorDict["−"] +-- Override the atom type of a symbol in the operator dictionary (raises an error if the symbol or its dictionary entry is missing). +-- @tparam string symbol Name of the symbol to override (a key of the symbols table) +-- @tparam number atom New atom type (a value from atoms.types) +local function overrideAtom (symbol, atom) + if not (symbols[symbol] and operatorDict[symbols[symbol]]) then + SU.error("Symbol " .. symbol .. " not defined (operator dictionary is probably broken)") + end + operatorDict[symbols[symbol]].atom = atom +end + +-- In xml-entities's unicode.xml, we derived "ord" for U+034D COMBINING LEFT RIGHT ARROW BELOW +-- as it doesn't have a "D" class (diacritic) +-- Actually it's not the fault of xml-entities, such a character is not covered in the latest +-- Unicode MathClass file: https://www.unicode.org/Public/math/revision-15/MathClassEx-15.html +-- It should clearly be a "botaccent" however, for \underleftrightarrow to work as do other accents. +-- Note the U+20E1 COMBINING LEFT RIGHT ARROW ABOVE has a "D" and we thus mapped it to an "accent", +-- used for \overleftrightarrow, so the asymmetry is at least weird. +-- For future reference, see also https://github.com/w3c/xml-entities/issues/12 but it's probably +-- not where the discussion should happen... +overrideAtom("underleftrightarrow", atoms.types.botaccent) + return { symbols = symbols, operatorDict = operatorDict,