diff --git a/Clippit.Tests/Word/DocumentAssemblerTests.cs b/Clippit.Tests/Word/DocumentAssemblerTests.cs
index da5c92f7..770075fc 100644
--- a/Clippit.Tests/Word/DocumentAssemblerTests.cs
+++ b/Clippit.Tests/Word/DocumentAssemblerTests.cs
@@ -154,6 +154,9 @@ public DocumentAssemblerTests(ITestOutputHelper log)
[InlineData("DA285-ImageSelectNoParagraphFollowedAfterMetadata.docx", "DA-Data-WithImages.xml", true)]
[InlineData("DA285A-ImageSelectNoParagraphFollowedAfterMetadata.docx", "DA-Data-WithImages.xml", true)]
[InlineData("DA-I0038-TemplateWithMultipleXPathResults.docx", "DA-I0038-Data.xml", false)]
+ [InlineData("DA289A-xhtml-formatting.docx", "DA-html-input.xml", false)]
+ [InlineData("DA289B-html-not-supported.docx", "DA-html-input.xml", true)]
+ [InlineData("DA289C-not-well-formed-xhtml.docx", "DA-html-input.xml", true)]
public void DA101(string name, string data, bool err)
{
var templateDocx = new FileInfo(Path.Combine(_sourceDir.FullName, name));
@@ -185,14 +188,8 @@ public void DA259(string name, string data, bool err)
Path.Combine(TempDir, name.Replace(".docx", "-processed-by-DocumentAssembler.docx"))
);
var afterAssembling = new WmlDocument(assembledDocx.FullName);
- var brCount = afterAssembling
- .MainDocumentPart.Element(W.body)
- .Elements(W.p)
- .ElementAt(1)
- .Elements(W.r)
- .Elements(W.br)
- .Count();
- Assert.Equal(4, brCount);
+ var brCount = afterAssembling.MainDocumentPart.Element(W.body).Elements(W.p).Count();
+ Assert.Equal(6, brCount);
}
[Theory]
diff --git a/Clippit/Html/HtmlToWmlConverterCore.cs b/Clippit/Html/HtmlToWmlConverterCore.cs
index 6c785a27..47d333af 100644
--- a/Clippit/Html/HtmlToWmlConverterCore.cs
+++ b/Clippit/Html/HtmlToWmlConverterCore.cs
@@ -824,7 +824,7 @@ private static object NormalizeTransform(XNode node)
return node;
}
- private enum NextExpected
+ internal enum NextExpected
{
Paragraph,
Run,
@@ -2830,7 +2830,7 @@ string pictureDescription
#endif
- private static XElement GetParagraphProperties(
+ internal static XElement GetParagraphProperties(
XElement blockLevelElement,
string styleName,
HtmlToWmlConverterSettings settings
@@ -3041,14 +3041,18 @@ private static XElement[] GetSpacingProperties(XElement paragraph, HtmlToWmlConv
return new XElement[] { spacing, ind, contextualSpacing };
}
- private static XElement GetRunProperties(XText textNode, HtmlToWmlConverterSettings settings)
+ internal static XElement GetRunProperties(XText textNode, HtmlToWmlConverterSettings settings)
{
var parent = textNode.Parent;
- var rPr = GetRunProperties(parent, settings);
- return rPr;
+ if (parent != null)
+ {
+ return GetRunProperties(parent, settings);
+ }
+
+ return new XElement(W.rPr);
}
- private static XElement GetRunProperties(XElement element, HtmlToWmlConverterSettings settings)
+ internal static XElement GetRunProperties(XElement element, HtmlToWmlConverterSettings settings)
{
var colorProperty = element.GetProp("color");
var fontFamilyProperty = element.GetProp("font-family");
@@ -3060,15 +3064,15 @@ private static XElement GetRunProperties(XElement element, HtmlToWmlConverterSet
var letterSpacingProperty = element.GetProp("letter-spacing");
var directionProp = element.GetProp("direction");
- var colorPropertyString = colorProperty.ToString();
+ var colorPropertyString = colorProperty?.ToString();
var fontFamilyString = GetUsedFontFromFontFamilyProperty(fontFamilyProperty);
var fontSizeTPoint = GetUsedSizeFromFontSizeProperty(fontSizeProperty);
- var textDecorationString = textDecorationProperty.ToString();
- var fontStyleString = fontStyleProperty.ToString();
- var fontWeightString = fontWeightProperty.ToString().ToLower();
- var backgroundColorString = backgroundColorProperty.ToString().ToLower();
- var letterSpacingString = letterSpacingProperty.ToString().ToLower();
- var directionString = directionProp.ToString().ToLower();
+ var textDecorationString = textDecorationProperty?.ToString();
+ var fontStyleString = fontStyleProperty?.ToString();
+ var fontWeightString = fontWeightProperty?.ToString().ToLower();
+ var backgroundColorString = backgroundColorProperty?.ToString().ToLower();
+ var letterSpacingString = letterSpacingProperty?.ToString().ToLower();
+ var directionString = directionProp?.ToString().ToLower();
var subAncestor = element.AncestorsAndSelf(XhtmlNoNamespace.sub).Any();
var supAncestor = element.AncestorsAndSelf(XhtmlNoNamespace.sup).Any();
@@ -3085,7 +3089,7 @@ private static XElement GetRunProperties(XElement element, HtmlToWmlConverterSet
dirAttributeString = dirAttribute.Value.ToLower();
XElement shd = null;
- if (backgroundColorString != "transparent")
+ if (backgroundColorString != null && backgroundColorString != "transparent")
shd = new XElement(
W.shd,
new XAttribute(W.val, "clear"),
@@ -3155,7 +3159,7 @@ private static XElement GetRunProperties(XElement element, HtmlToWmlConverterSet
rStyle = new XElement(W.rStyle, new XAttribute(W.val, "Hyperlink"));
XElement spacing = null;
- if (letterSpacingProperty.IsNotNormal)
+ if (letterSpacingProperty != null && letterSpacingProperty.IsNotNormal)
spacing = new XElement(W.spacing, new XAttribute(W.val, (long)(Twip)letterSpacingProperty));
XElement rtl = null;
@@ -3191,9 +3195,9 @@ private static XElement GetRunProperties(XElement element, HtmlToWmlConverterSet
// todo this is not right - needs to be rationalized for all characters in an entire paragraph.
// if there is text like <p>abc <em> def </em> ghi</p> then there needs to be just one space between abc and def, and between
// def and ghi.
- private static string GetDisplayText(XText node, bool preserveWhiteSpace)
+ internal static string GetDisplayText(XText node, bool preserveWhiteSpace)
{
- var textTransform = node.Parent.GetProp("text-transform").ToString();
+ var textTransform = node.Parent.GetProp("text-transform")?.ToString();
var isFirst = node.Parent.Name == XhtmlNoNamespace.p && node == node.Parent.FirstNode;
var isLast = node.Parent.Name == XhtmlNoNamespace.p && node == node.Parent.LastNode;
@@ -3884,7 +3888,7 @@ private static XElement GetTableRowProperties(XElement element)
return trPr;
}
- private static XAttribute GetXmlSpaceAttribute(string value)
+ internal static XAttribute GetXmlSpaceAttribute(string value)
{
if (value.StartsWith(" ") || value.EndsWith(" "))
return new XAttribute(XNamespace.Xml + "space", "preserve");
@@ -4331,7 +4335,7 @@ private static XElement GetBackgroundProperty(XElement element)
var color = element.GetProp("background-color");
// todo this really should test against default background color
- if (color.ToString() != "transparent")
+ if (color != null && color.ToString() != "transparent")
{
var hexString = color.ToString();
var shd = new XElement(
diff --git a/Clippit/Word/Assembler/HtmlConverter.cs b/Clippit/Word/Assembler/HtmlConverter.cs
new file mode 100644
index 00000000..4061e6c9
--- /dev/null
+++ b/Clippit/Word/Assembler/HtmlConverter.cs
@@ -0,0 +1,395 @@
+using System.Collections;
+using System.Text.RegularExpressions;
+using System.Xml;
+using System.Xml.Linq;
+using System.Xml.XPath;
+using Clippit.Html;
+using Clippit.Internal;
+using DocumentFormat.OpenXml.Packaging;
+using NextExpected = Clippit.Html.HtmlToWmlConverterCore.NextExpected;
+
+namespace Clippit.Word.Assembler
+{
+ internal static class HtmlConverter
+ {
+ private static readonly HtmlToWmlConverterSettings htmlConverterSettings =
+ HtmlToWmlConverter.GetDefaultSettings();
+
+ private static readonly Regex detectEntityRegEx = new Regex("^&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([0-9a-zA-Z]+));");
+
+ /// <summary>
+ /// Processes a string that contains inline HTML tags and generates a run with the necessary properties.
+ /// Supported inline html tags: b, i, em, strong, u, br, a
+ /// Supported block tags: p, div
+ /// TODO: add support for the following html tags: big, small, sub, sup, span.
+ /// </summary>
+ /// Source element.
+ /// Data element with content.
+ /// The paragraph properties.
+ /// Error indicator.
+ internal static IEnumerable