From ef4eeb47e53dc90a90ce81b9a0f5f566eb604ae2 Mon Sep 17 00:00:00 2001 From: Denis Ivanov Date: Mon, 1 Apr 2024 19:23:36 +0300 Subject: [PATCH] Update AngleSharp to 1.1.2 --- ....ContentExtraction.IntegrationTests.csproj | 4 +- .../ContentExtractorTests.cs | 35 +- .../AngleSharp.ContentExtraction.csproj | 10 +- .../ContentExtractor.cs | 741 +++++++++--------- 4 files changed, 394 insertions(+), 396 deletions(-) diff --git a/AngleSharp.ContentExtraction.IntegrationTests/AngleSharp.ContentExtraction.IntegrationTests.csproj b/AngleSharp.ContentExtraction.IntegrationTests/AngleSharp.ContentExtraction.IntegrationTests.csproj index d573b91..9e6aaee 100644 --- a/AngleSharp.ContentExtraction.IntegrationTests/AngleSharp.ContentExtraction.IntegrationTests.csproj +++ b/AngleSharp.ContentExtraction.IntegrationTests/AngleSharp.ContentExtraction.IntegrationTests.csproj @@ -7,9 +7,9 @@ - + - + diff --git a/AngleSharp.ContentExtraction.IntegrationTests/ContentExtractorTests.cs b/AngleSharp.ContentExtraction.IntegrationTests/ContentExtractorTests.cs index 49c3694..5eb4812 100644 --- a/AngleSharp.ContentExtraction.IntegrationTests/ContentExtractorTests.cs +++ b/AngleSharp.ContentExtraction.IntegrationTests/ContentExtractorTests.cs @@ -1,27 +1,26 @@ -using System.Threading.Tasks; using AngleSharp.Html.Dom; using NUnit.Framework; +using System.Threading.Tasks; + +namespace AngleSharp.ContentExtraction.IntegrationTests; -namespace AngleSharp.ContentExtraction.IntegrationTests +[TestFixture] +public class ContentExtractorTests { - [TestFixture] - public class ContentExtractorTests + [Test] + public async Task Extract_IntegrationTest() { - [Test] - public async Task Extract_IntegrationTest() - { - // Arrange - var config = Configuration.Default.WithDefaultLoader(); - var address = "https://lenta.ru/articles/2020/05/13/coronausa/"; - var context = BrowsingContext.New(config); - var document = (IHtmlDocument)await context.OpenAsync(address); - var extractor = new ContentExtractor(); + // Arrange + var config = Configuration.Default.WithDefaultLoader(); + const string address = "https://lenta.ru/articles/2020/05/13/coronausa/"; + var context = BrowsingContext.New(config); + var document = (IHtmlDocument)await context.OpenAsync(address); + var extractor = new ContentExtractor(); - // Act - extractor.Extract(document); + // Act + extractor.Extract(document); - // Assert - Assert.Pass(); - } + // Assert + Assert.Pass(); } } diff --git a/AngleSharp.ContentExtraction/AngleSharp.ContentExtraction.csproj b/AngleSharp.ContentExtraction/AngleSharp.ContentExtraction.csproj index 88f2332..98743ec 100644 --- a/AngleSharp.ContentExtraction/AngleSharp.ContentExtraction.csproj +++ b/AngleSharp.ContentExtraction/AngleSharp.ContentExtraction.csproj @@ -1,12 +1,12 @@ - net6.0 + net6.0;net7.0;net8.0 Denis Ivanov AngleSharp.ContentExtraction - 1.0.0 - 1.0.0 - 1.0.0 + 1.0.1 + 1.0.1 + 1.0.1 Content extraction via text density https://github.com/denis-ivanov/AngleSharp.ContentExtraction MIT @@ -14,7 +14,7 @@ - + diff --git a/AngleSharp.ContentExtraction/ContentExtractor.cs b/AngleSharp.ContentExtraction/ContentExtractor.cs index c84e882..272ec8b 100644 --- a/AngleSharp.ContentExtraction/ContentExtractor.cs +++ b/AngleSharp.ContentExtraction/ContentExtractor.cs @@ -1,494 +1,493 @@ -using System; -using System.Xml; -using AngleSharp.Css.Dom; +using AngleSharp.Css.Dom; using AngleSharp.Dom; using AngleSharp.Html.Dom; using AngleSharp.Text; +using System; +using System.Xml; -namespace AngleSharp.ContentExtraction +namespace AngleSharp.ContentExtraction; + +public class ContentExtractor { - public class ContentExtractor + private const string CharNumber = "char-number"; + private const string TagNumber = "tag-number"; + private const string LinkCharNumber = "linkchar-number"; + private const string LinkTagNumber = "linktag-number"; + private const string TextDensity = "text-density"; + private const string DensitySum = "density-sum"; + private const string MaxDensitySum = "max-density-sum"; + private const string Mark = "mark"; + + protected virtual bool IgnoreElement(IElement element) { - private const string CharNumber = "char-number"; - private const string TagNumber = "tag-number"; - private const string LinkCharNumber = "linkchar-number"; - private const string LinkTagNumber = "linktag-number"; - private const string TextDensity = "text-density"; - private const string DensitySum = "density-sum"; - private const string MaxDensitySum = "max-density-sum"; - private const string Mark = "mark"; - - protected virtual bool IgnoreElement(IElement element) - { - return element.TagName.Is(TagNames.NoScript) || - element.TagName.Is(TagNames.Figcaption) || - element.TagName.Is(TagNames.Figure) || - element.TagName.Is(TagNames.Aside) || - element.TagName.Is(TagNames.Footer) || - element.TagName.Is(TagNames.Footer) || - element.TagName.Is(TagNames.Header) || - element.TagName.Is(TagNames.Svg) || - element.GetStyle()?.GetDisplay() == "none"; - } + return element.TagName.Is(TagNames.NoScript) || + element.TagName.Is(TagNames.Figcaption) || + element.TagName.Is(TagNames.Figure) || + element.TagName.Is(TagNames.Aside) || + element.TagName.Is(TagNames.Footer) || + element.TagName.Is(TagNames.Footer) || + element.TagName.Is(TagNames.Header) || + element.TagName.Is(TagNames.Svg) || + element.GetStyle()?.GetDisplay() == "none"; + } - protected virtual bool IgnoreNode(INode node) - { - return - node is IHtmlBreakRowElement || - node is IHtmlHeadElement || - node is IHtmlHrElement || - node is IHtmlLinkElement || - node is IHtmlMetaElement || - node is IHtmlScriptElement || - node is IHtmlStyleElement || - node is IHtmlInlineFrameElement || - node is IHtmlFormElement || - node.NodeType == NodeType.Comment || - (node is IElement e && IgnoreElement(e)); - } + protected virtual bool IgnoreNode(INode node) + { + return + node is IHtmlBreakRowElement || + node is IHtmlHeadElement || + node is IHtmlHrElement || + node is IHtmlLinkElement || + node is IHtmlMetaElement || + node is IHtmlScriptElement || + node is IHtmlStyleElement || + node is IHtmlInlineFrameElement || + node is IHtmlFormElement || + node.NodeType == NodeType.Comment || + (node is IElement e && IgnoreElement(e)); + } - protected virtual void ProcessDom(INode element) + protected virtual void ProcessDom(INode element) + { + var child = element.FirstChild; + + for (;child != null;) { - var child = element.FirstChild; - - for (;child != null;) + if (IgnoreNode(child)) { - if (IgnoreNode(child)) - { - var removeElement = child; - child = child.NextSibling; - removeElement.RemoveFromParent(); - continue; - } - + var removeElement = child; child = child.NextSibling; + removeElement.RemoveFromParent(); + continue; } - for (child = element.FirstChild; child != null; child = child.NextSibling) - { - ProcessDom(child); - } + child = child.NextSibling; } - protected virtual void RemoveAttribute(IElement element) + for (child = element.FirstChild; child != null; child = child.NextSibling) { - element.RemoveAttribute(CharNumber); - element.RemoveAttribute(TagNumber); - element.RemoveAttribute(LinkCharNumber); - element.RemoveAttribute(LinkTagNumber); - element.RemoveAttribute(TextDensity); - element.RemoveAttribute(DensitySum); - element.RemoveAttribute(MaxDensitySum); - element.RemoveAttribute(Mark); - - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - RemoveAttribute(child); - } + ProcessDom(child); } + } - protected virtual void CleanTreeByMark(IElement element) + protected virtual void RemoveAttribute(IElement element) + { + element.RemoveAttribute(CharNumber); + element.RemoveAttribute(TagNumber); + element.RemoveAttribute(LinkCharNumber); + element.RemoveAttribute(LinkTagNumber); + element.RemoveAttribute(TextDensity); + element.RemoveAttribute(DensitySum); + element.RemoveAttribute(MaxDensitySum); + element.RemoveAttribute(Mark); + + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - var mark = XmlConvert.ToInt32(element.GetAttribute(Mark)); - - if(0 == mark) - { - element.RemoveFromParent(); - } - else if (1 == mark) - { - return; - } - else - { - for(var child = element.FirstElementChild; child != null;) - { - var removeElement = child; - child = child.NextElementSibling; - CleanTreeByMark(removeElement); - } - } + RemoveAttribute(child); } + } - protected virtual void CountChar(IElement element) - { - long charNum = element.TextContent.Length; - var l2s_char_num = XmlConvert.ToString(charNum); - element.SetAttribute(CharNumber, l2s_char_num); + protected virtual void CleanTreeByMark(IElement element) + { + var mark = XmlConvert.ToInt32(element.GetAttribute(Mark)); - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) + if(0 == mark) + { + element.RemoveFromParent(); + } + else if (1 == mark) + { + return; + } + else + { + for(var child = element.FirstElementChild; child != null;) { - CountChar(child); + var removeElement = child; + child = child.NextElementSibling; + CleanTreeByMark(removeElement); } } + } + + protected virtual void CountChar(IElement element) + { + long charNum = element.TextContent.Length; + var l2s_char_num = XmlConvert.ToString(charNum); + element.SetAttribute(CharNumber, l2s_char_num); - protected virtual void CountTag(IElement element) + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - long tag_num = 0; - string l2s_tag_num; + CountChar(child); + } + } - if(element.FirstElementChild == null) + protected virtual void CountTag(IElement element) + { + long tag_num = 0; + string l2s_tag_num; + + if(element.FirstElementChild == null) + { + l2s_tag_num = XmlConvert.ToString(0); + element.SetAttribute(TagNumber, l2s_tag_num); + } + else + { + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - l2s_tag_num = XmlConvert.ToString(0); - element.SetAttribute(TagNumber, l2s_tag_num); + CountTag(child); } - else + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - CountTag(child); - } - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - tag_num += XmlConvert.ToInt64(child.GetAttribute(TagNumber)) + 1; - } - - l2s_tag_num = XmlConvert.ToString(tag_num); - element.SetAttribute(TagNumber, l2s_tag_num); + tag_num += XmlConvert.ToInt64(child.GetAttribute(TagNumber)) + 1; } + + l2s_tag_num = XmlConvert.ToString(tag_num); + element.SetAttribute(TagNumber, l2s_tag_num); } + } - protected virtual void UpdateLinkChar(IElement element) + protected virtual void UpdateLinkChar(IElement element) + { + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - child.SetAttribute(LinkCharNumber, child.GetAttribute(CharNumber)); - UpdateLinkChar(child); - } + child.SetAttribute(LinkCharNumber, child.GetAttribute(CharNumber)); + UpdateLinkChar(child); } + } - protected virtual void CountLinkChar(IElement element) + protected virtual void CountLinkChar(IElement element) + { + long linkchar_num = 0; + var tag_name = element.TagName; + + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - long linkchar_num = 0; - var tag_name = element.TagName; + CountLinkChar(child); + } + //deal with hyperlink and sth like that + if(tag_name == TagNames.A || tag_name == TagNames.Button || tag_name == TagNames.Select) + { + linkchar_num = XmlConvert.ToInt64(element.GetAttribute(CharNumber)); + UpdateLinkChar(element); + } + else + { for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - CountLinkChar(child); + linkchar_num += XmlConvert.ToInt64(child.GetAttribute(LinkCharNumber)); } + } - //deal with hyperlink and sth like that - if(tag_name == TagNames.A || tag_name == TagNames.Button || tag_name == TagNames.Select) - { - linkchar_num = XmlConvert.ToInt64(element.GetAttribute(CharNumber)); - UpdateLinkChar(element); - } - else - { - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - linkchar_num += XmlConvert.ToInt64(child.GetAttribute(LinkCharNumber)); - } - } + var l2s_linkchar_num = XmlConvert.ToString(linkchar_num); + element.SetAttribute(LinkCharNumber, l2s_linkchar_num); + } - var l2s_linkchar_num = XmlConvert.ToString(linkchar_num); - element.SetAttribute(LinkCharNumber, l2s_linkchar_num); - } + protected virtual void CountLinkTag(IElement element) + { + long linktag_num = 0; + string l2s_linktag_num; + var tag_name = element.TagName; - protected virtual void CountLinkTag(IElement element) + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - long linktag_num = 0; - string l2s_linktag_num; - var tag_name = element.TagName; + CountLinkTag(child); + } + //deal with hyperlink and sth like that + if(tag_name == TagNames.A || tag_name == TagNames.Button || tag_name == TagNames.Select) + { + linktag_num = XmlConvert.ToInt64(element.GetAttribute(TagNumber)); + UpdateLinkChar(element); + } + else + { for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - CountLinkTag(child); - } + linktag_num += XmlConvert.ToInt64(child.GetAttribute(LinkTagNumber)); + tag_name = child.TagName; - //deal with hyperlink and sth like that - if(tag_name == TagNames.A || tag_name == TagNames.Button || tag_name == TagNames.Select) - { - linktag_num = XmlConvert.ToInt64(element.GetAttribute(TagNumber)); - UpdateLinkChar(element); - } - else - { - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) + //if a tag is or sth plays similar role in web pages, then anchor number add 1 + if(tag_name == TagNames.A || tag_name == TagNames.Button || tag_name == TagNames.Select) + { + linktag_num++; + } + else { - linktag_num += XmlConvert.ToInt64(child.GetAttribute(LinkTagNumber)); - tag_name = child.TagName; + var child_linktag_num = XmlConvert.ToInt64(child.GetAttribute(LinkTagNumber)); + var child_tag_num = XmlConvert.ToInt64(child.GetAttribute(TagNumber)); + var child_char_num = XmlConvert.ToInt64(child.GetAttribute(CharNumber)); + var child_linkchar_num = XmlConvert.ToInt64(child.GetAttribute(LinkCharNumber)); - //if a tag is or sth plays similar role in web pages, then anchor number add 1 - if(tag_name == TagNames.A || tag_name == TagNames.Button || tag_name == TagNames.Select) + //child_linktag_num != 0: there are some anchor under this child + if(child_linktag_num == child_tag_num && child_char_num == child_linkchar_num && 0 != child_linktag_num) { linktag_num++; } - else - { - var child_linktag_num = XmlConvert.ToInt64(child.GetAttribute(LinkTagNumber)); - var child_tag_num = XmlConvert.ToInt64(child.GetAttribute(TagNumber)); - var child_char_num = XmlConvert.ToInt64(child.GetAttribute(CharNumber)); - var child_linkchar_num = XmlConvert.ToInt64(child.GetAttribute(LinkCharNumber)); - - //child_linktag_num != 0: there are some anchor under this child - if(child_linktag_num == child_tag_num && child_char_num == child_linkchar_num && 0 != child_linktag_num) - { - linktag_num++; - } - } } } - - l2s_linktag_num = XmlConvert.ToString(linktag_num); - element.SetAttribute(LinkTagNumber, l2s_linktag_num); } - protected virtual void ComputeTextDensity(IElement element, double ratio) - { - var char_num = XmlConvert.ToInt64(element.GetAttribute(CharNumber)); - var tag_num = XmlConvert.ToInt64(element.GetAttribute(TagNumber)); - var linkchar_num = XmlConvert.ToInt64(element.GetAttribute(LinkCharNumber)); - var linktag_num = XmlConvert.ToInt64(element.GetAttribute(LinkTagNumber)); + l2s_linktag_num = XmlConvert.ToString(linktag_num); + element.SetAttribute(LinkTagNumber, l2s_linktag_num); + } - var text_density = 0.0; - string d2s_text_density; + protected virtual void ComputeTextDensity(IElement element, double ratio) + { + var char_num = XmlConvert.ToInt64(element.GetAttribute(CharNumber)); + var tag_num = XmlConvert.ToInt64(element.GetAttribute(TagNumber)); + var linkchar_num = XmlConvert.ToInt64(element.GetAttribute(LinkCharNumber)); + var linktag_num = XmlConvert.ToInt64(element.GetAttribute(LinkTagNumber)); + + var text_density = 0.0; + string d2s_text_density; + + if(0 == char_num) + { + text_density = 0; + } + else + { + var un_linkchar_num = char_num - linkchar_num; - if(0 == char_num) + if(0 == tag_num) { - text_density = 0; + tag_num = 1; } - else + if(0 == linkchar_num) + { + linkchar_num = 1; + } + if(0 == linktag_num) { - var un_linkchar_num = char_num - linkchar_num; + linktag_num = 1; + } + if(0 == un_linkchar_num) + { + un_linkchar_num = 1; + } - if(0 == tag_num) - { - tag_num = 1; - } - if(0 == linkchar_num) - { - linkchar_num = 1; - } - if(0 == linktag_num) - { - linktag_num = 1; - } - if(0 == un_linkchar_num) - { - un_linkchar_num = 1; - } - - text_density = (1.0 * char_num / tag_num) * Math.Log((1.0 * char_num * tag_num) / (1.0 * linkchar_num * linktag_num)) - / Math.Log(Math.Log(1.0 * char_num * linkchar_num / un_linkchar_num + ratio * char_num + Math.Exp(1.0))); + text_density = (1.0 * char_num / tag_num) * Math.Log((1.0 * char_num * tag_num) / (1.0 * linkchar_num * linktag_num)) + / Math.Log(Math.Log(1.0 * char_num * linkchar_num / un_linkchar_num + ratio * char_num + Math.Exp(1.0))); // text_density = 1.0 * char_num / tag_num; - } + } - //convert double to QString - d2s_text_density = XmlConvert.ToString(text_density); - element.SetAttribute(TextDensity, d2s_text_density); + //convert double to QString + d2s_text_density = XmlConvert.ToString(text_density); + element.SetAttribute(TextDensity, d2s_text_density); - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - ComputeTextDensity(child, ratio); - } + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) + { + ComputeTextDensity(child, ratio); } + } - protected virtual void ComputeDensitySum(IElement element, double ratio) - { - var densitySum = 0.0; - //long char_num = 0; + protected virtual void ComputeDensitySum(IElement element, double ratio) + { + var densitySum = 0.0; + //long char_num = 0; - var content = element.TextContent; - string child_content; - var from = 0; - var index = 0; - var length = 0; + var content = element.TextContent; + string child_content; + var from = 0; + var index = 0; + var length = 0; - if(element.FirstElementChild == null) + if(element.FirstElementChild == null) + { + densitySum = XmlConvert.ToDouble(element.GetAttribute(TextDensity)); + } + else + { + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - densitySum = XmlConvert.ToDouble(element.GetAttribute(TextDensity)); + ComputeDensitySum(child, ratio); } - else + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - ComputeDensitySum(child, ratio); - } - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - densitySum += XmlConvert.ToDouble(child.GetAttribute(TextDensity)); - XmlConvert.ToInt64(child.GetAttribute(CharNumber)); + densitySum += XmlConvert.ToDouble(child.GetAttribute(TextDensity)); + XmlConvert.ToInt64(child.GetAttribute(CharNumber)); - //text before tag - child_content = child.TextContent; - index = content.IndexOf(child_content, from, StringComparison.Ordinal); - if(index > -1) + //text before tag + child_content = child.TextContent; + index = content.IndexOf(child_content, from, StringComparison.Ordinal); + if(index > -1) + { + length = index - from; + if(length > 0) { - length = index - from; - if(length > 0) - { - densitySum += length * Math.Log(1.0 * length) / Math.Log(Math.Log(ratio * length + Math.Exp(1.0))); - } - from = index + child_content.Length; + densitySum += length * Math.Log(1.0 * length) / Math.Log(Math.Log(ratio * length + Math.Exp(1.0))); } - } - - //text after tag - length = element.TextContent.Length - from; - if(length > 0) - { - densitySum += length * Math.Log(1.0 * length) / Math.Log(Math.Log(ratio * length + Math.Exp(1.0))); + from = index + child_content.Length; } } - var d2SDensitySum = XmlConvert.ToString(densitySum); - element.SetAttribute(DensitySum, d2SDensitySum); - } - - protected virtual double FindMaxDensitySum(IElement element) - { - var maxDensitySum = XmlConvert.ToDouble(element.GetAttribute(DensitySum)); - - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) + //text after tag + length = element.TextContent.Length - from; + if(length > 0) { - var tempMaxDensitySum = FindMaxDensitySum(child); - if(tempMaxDensitySum - maxDensitySum > double.Epsilon) - { - maxDensitySum = tempMaxDensitySum; - } + densitySum += length * Math.Log(1.0 * length) / Math.Log(Math.Log(ratio * length + Math.Exp(1.0))); } - - //record the max_density_sum under the element - var d2SMaxDensitySum = XmlConvert.ToString(maxDensitySum); - element.SetAttribute(MaxDensitySum, d2SMaxDensitySum); - return maxDensitySum; } - protected virtual IElement SearchTag(IElement element, string attribute, double value) - { - var d2SValue = XmlConvert.ToString(value); - var target = element; + var d2SDensitySum = XmlConvert.ToString(densitySum); + element.SetAttribute(DensitySum, d2SDensitySum); + } + + protected virtual double FindMaxDensitySum(IElement element) + { + var maxDensitySum = XmlConvert.ToDouble(element.GetAttribute(DensitySum)); - var attrValue = XmlConvert.ToDouble(element.GetAttribute(attribute)); - if((attrValue - value > -1 * double.Epsilon) - && (attrValue - value < double.Epsilon)) + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) + { + var tempMaxDensitySum = FindMaxDensitySum(child); + if(tempMaxDensitySum - maxDensitySum > double.Epsilon) { - return target; + maxDensitySum = tempMaxDensitySum; } + } + + //record the max_density_sum under the element + var d2SMaxDensitySum = XmlConvert.ToString(maxDensitySum); + element.SetAttribute(MaxDensitySum, d2SMaxDensitySum); + return maxDensitySum; + } + + protected virtual IElement SearchTag(IElement element, string attribute, double value) + { + var d2SValue = XmlConvert.ToString(value); + var target = element; - //search the max densitysum element using css selector - var cssSelector = "[" + attribute + "=\"" + d2SValue + "\"]"; - target = element.QuerySelector(cssSelector); + var attrValue = XmlConvert.ToDouble(element.GetAttribute(attribute)); + if((attrValue - value > -1 * double.Epsilon) + && (attrValue - value < double.Epsilon)) + { return target; } - - protected virtual double GetThreshold(IElement element, double maxDensitySum) + + //search the max densitysum element using css selector + var cssSelector = "[" + attribute + "=\"" + d2SValue + "\"]"; + target = element.QuerySelector(cssSelector); + return target; + } + + protected virtual double GetThreshold(IElement element, double maxDensitySum) + { + var threshold = -1.0; + + //search the max densitysum element + var target = SearchTag(element, DensitySum, maxDensitySum); + threshold = XmlConvert.ToDouble(target.GetAttribute(TextDensity)); + SetMark(target, 1); + + var parent = target.ParentElement; + while(true) { - var threshold = -1.0; - - //search the max densitysum element - var target = SearchTag(element, DensitySum, maxDensitySum); - threshold = XmlConvert.ToDouble(target.GetAttribute(TextDensity)); - SetMark(target, 1); - - var parent = target.ParentElement; - while(true) + if(parent.TagName != "HTML") { - if(parent.TagName != "HTML") + var textDensity = XmlConvert.ToDouble(parent.GetAttribute(TextDensity)); + if((threshold - textDensity) > -1 * double.Epsilon) { - var textDensity = XmlConvert.ToDouble(parent.GetAttribute(TextDensity)); - if((threshold - textDensity) > -1 * double.Epsilon) - { - threshold = textDensity; - } - - parent.SetAttribute(Mark, "2"); - parent = parent.ParentElement; - } - else - { - break; + threshold = textDensity; } + + parent.SetAttribute(Mark, "2"); + parent = parent.ParentElement; + } + else + { + break; } - - return threshold; } - protected virtual void SetMark(IElement element, int mark) - { - var i2SMark = XmlConvert.ToString(mark); + return threshold; + } - element.SetAttribute(Mark, i2SMark); + protected virtual void SetMark(IElement element, int mark) + { + var i2SMark = XmlConvert.ToString(mark); - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - SetMark(child, mark); - } + element.SetAttribute(Mark, i2SMark); + + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) + { + SetMark(child, mark); } + } + + protected virtual void FindMaxDensitySumTag(IElement element, double maxDensitySum) + { + var target = SearchTag(element, DensitySum, maxDensitySum); - protected virtual void FindMaxDensitySumTag(IElement element, double maxDensitySum) + var mark = XmlConvert.ToInt32(target.GetAttribute(Mark)); + if(1 == mark) { - var target = SearchTag(element, DensitySum, maxDensitySum); + return; + } + + SetMark(target, 1); - var mark = XmlConvert.ToInt32(target.GetAttribute(Mark)); - if(1 == mark) + var parent = target.ParentElement; + while(true) + { + if(parent.TagName != "HTML") { - return; + parent.SetAttribute(Mark, "2"); + parent = parent.ParentElement; } - - SetMark(target, 1); - - var parent = target.ParentElement; - while(true) + else { - if(parent.TagName != "HTML") - { - parent.SetAttribute(Mark, "2"); - parent = parent.ParentElement; - } - else - { - break; - } + break; } } + } - protected virtual void MarkContent(IElement element, double threshold) - { - var textDensity = XmlConvert.ToDouble(element.GetAttribute(TextDensity)); - var maxDensitySum = XmlConvert.ToDouble(element.GetAttribute(MaxDensitySum)); - var mark = XmlConvert.ToInt32(element.GetAttribute(Mark)); + protected virtual void MarkContent(IElement element, double threshold) + { + var textDensity = XmlConvert.ToDouble(element.GetAttribute(TextDensity)); + var maxDensitySum = XmlConvert.ToDouble(element.GetAttribute(MaxDensitySum)); + var mark = XmlConvert.ToInt32(element.GetAttribute(Mark)); - if(mark != 1 && (textDensity - threshold > -1 * double.Epsilon)) + if(mark != 1 && (textDensity - threshold > -1 * double.Epsilon)) + { + FindMaxDensitySumTag(element, maxDensitySum); + for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) { - FindMaxDensitySumTag(element, maxDensitySum); - for(var child = element.FirstElementChild; child != null; child = child.NextElementSibling) - { - MarkContent(child, threshold); - } + MarkContent(child, threshold); } } + } - public void Extract(IHtmlDocument document) - { - var body = document.Body; - ProcessDom(body); + public void Extract(IHtmlDocument document) + { + var body = document.Body; + ProcessDom(body); - CountChar(body); - CountTag(body); - CountLinkChar(body); - CountLinkTag(body); + CountChar(body); + CountTag(body); + CountLinkChar(body); + CountLinkTag(body); - var charNum = XmlConvert.ToDouble(body.GetAttribute(CharNumber)); - var linkCharNum = XmlConvert.ToDouble(body.GetAttribute(LinkCharNumber)); + var charNum = XmlConvert.ToDouble(body.GetAttribute(CharNumber)); + var linkCharNum = XmlConvert.ToDouble(body.GetAttribute(LinkCharNumber)); - if (linkCharNum < double.Epsilon) - { - linkCharNum = 1; - } + if (linkCharNum < double.Epsilon) + { + linkCharNum = 1; + } - var ratio = linkCharNum / charNum; + var ratio = linkCharNum / charNum; - ComputeTextDensity(body, ratio); - ComputeDensitySum(body, ratio); - var maxDensitySum = FindMaxDensitySum(body); - SetMark(body, 0); - var threshold = GetThreshold(body, maxDensitySum); - MarkContent(body, threshold); - CleanTreeByMark(body); - RemoveAttribute(body); - } + ComputeTextDensity(body, ratio); + ComputeDensitySum(body, ratio); + var maxDensitySum = FindMaxDensitySum(body); + SetMark(body, 0); + var threshold = GetThreshold(body, maxDensitySum); + MarkContent(body, threshold); + CleanTreeByMark(body); + RemoveAttribute(body); } }