Below is an example of how you can create a web page scraper to index the content of a web page item that might be rendered using Page Builder or related content.
Your HttpClient.BaseAddress needs to match the base URL of the web page item's website channel. In this example, the crawler only supports one website channel, with its base URL stored in appsettings.json. You can extend this to pull the value dynamically from the channel settings of your solution; see DefaultLuceneClient.GetAllWebsiteChannels for an example query.
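As a rough, hypothetical sketch of such a dynamic lookup (the IInfoProvider&lt;WebsiteChannelInfo&gt; query and the WebsiteChannelDomain column are assumptions about the channel object types, so verify them against your version of Xperience), a per-channel resolver could look like this:
using System.Linq;
using System.Threading.Tasks;
using CMS.DataEngine;
using CMS.Websites;
// Hypothetical sketch only: resolves a channel's domain from the WebsiteChannelInfo
// object type instead of reading a single value from appsettings.json
public class ChannelBaseUrlResolver
{
    private readonly IInfoProvider<WebsiteChannelInfo> websiteChannelProvider;
    public ChannelBaseUrlResolver(IInfoProvider<WebsiteChannelInfo> websiteChannelProvider) =>
        this.websiteChannelProvider = websiteChannelProvider;
    public async Task<string?> GetChannelDomain(int websiteChannelId)
    {
        // Queries the website channel by ID and returns its configured domain, if any
        var channels = await websiteChannelProvider.Get()
            .WhereEquals(nameof(WebsiteChannelInfo.WebsiteChannelID), websiteChannelId)
            .TopN(1)
            .GetEnumerableTypedResultAsync();
        return channels.FirstOrDefault()?.WebsiteChannelDomain;
    }
}
The rest of this example sticks with the value from configuration.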
First, add your website's base URL to appsettings.json under the WebCrawlerBaseUrl key:
{
// Other settings ...
// Our example runs on localhost with port: 58093
"WebCrawlerBaseUrl": "http://localhost:58093/"
}
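Then create a crawler service that resolves the relative URL of a web page item and requests its HTML through the configured HttpClient: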
using CMS.Core;
using CMS.Websites;
using Microsoft.Net.Http.Headers;
public class WebCrawlerService
{
private readonly HttpClient httpClient;
private readonly IEventLogService log;
private readonly IWebPageUrlRetriever webPageUrlRetriever;
public WebCrawlerService(
HttpClient httpClient,
IEventLogService log,
IWebPageUrlRetriever webPageUrlRetriever,
IAppSettingsService appSettingsService)
{
string baseUrl = appSettingsService["WebCrawlerBaseUrl"];
this.httpClient = httpClient;
this.httpClient.DefaultRequestHeaders.Add(HeaderNames.UserAgent, "SearchCrawler");
this.httpClient.BaseAddress = new Uri(baseUrl);
this.log = log;
this.webPageUrlRetriever = webPageUrlRetriever;
}
public async Task<string> CrawlWebPage(IWebPageFieldsSource page)
{
try
{
var url = await webPageUrlRetriever.Retrieve(page);
string path = url.RelativePath.TrimStart('~').TrimStart('/');
return await CrawlPage(path);
}
catch (Exception ex)
{
log.LogException(
nameof(WebCrawlerService),
nameof(CrawlWebPage),
ex,
$"Tree Path: {page.SystemFields.WebPageItemTreePath}");
}
return string.Empty;
}
public async Task<string> CrawlPage(string url)
{
try
{
var response = await httpClient.GetAsync(url);
return await response.Content.ReadAsStringAsync();
}
catch (Exception ex)
{
log.LogException(
nameof(WebCrawlerService),
nameof(CrawlPage),
ex,
$"Url: {url}");
}
return string.Empty;
}
}
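Note that CrawlWebPage trims the leading ~/ from the retrieved relative path, so the request resolves against the HttpClient.BaseAddress configured in the constructor.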
We'll also want to process and sanitize the scraped HTML, removing any irrelevant content and stripping all markup.
using System.Linq;
using AngleSharp.Html.Parser;
using CMS.Helpers;
public class WebScraperHtmlSanitizer
{
public virtual string SanitizeHtmlDocument(string htmlContent)
{
if (string.IsNullOrWhiteSpace(htmlContent))
{
return string.Empty;
}
var parser = new HtmlParser();
var doc = parser.ParseDocument(htmlContent);
var body = doc.Body;
if (body is null)
{
return string.Empty;
}
foreach (var element in body.QuerySelectorAll("script"))
{
element.Remove();
}
foreach (var element in body.QuerySelectorAll("style"))
{
element.Remove();
}
// Removes elements marked with the default Xperience exclusion attribute
foreach (var element in body.QuerySelectorAll("*[data-ktc-search-exclude]"))
{
element.Remove();
}
foreach (var element in body.QuerySelectorAll("header"))
{
element.Remove();
}
foreach (var element in body.QuerySelectorAll("footer"))
{
element.Remove();
}
// Gets the text content of the body element
string textContent = body.TextContent;
// Normalizes and trims whitespace characters
textContent = HTMLHelper.RegexHtmlToTextWhiteSpace.Replace(textContent, " ");
textContent = textContent.Trim();
string title = doc.Head?.QuerySelector("title")?.TextContent ?? string.Empty;
string description = doc.Head?.QuerySelector("meta[name='description']")?.GetAttribute("content") ?? string.Empty;
return string.Join(
" ",
new string[] { title, description, textContent }.Where(i => !string.IsNullOrWhiteSpace(i))
);
}
}
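As a rough illustration of what the sanitizer produces (the HTML fragment below is made up for the example):
var sanitizer = new WebScraperHtmlSanitizer();
string html =
    "<html><head><title>Coffee brewing</title></head>" +
    "<body><script>trackPageView();</script>" +
    "<div data-ktc-search-exclude>Cookie banner</div>" +
    "<main><p>Pour-over basics.</p></main></body></html>";
// Returns roughly "Coffee brewing Pour-over basics." once the script element,
// the excluded element, and all remaining markup are stripped
string text = sanitizer.SanitizeHtmlDocument(html);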
Register these services during application startup:
// Startup.cs or wherever you register services in the DI container
services.AddSingleton<WebScraperHtmlSanitizer>();
services.AddHttpClient<WebCrawlerService>();
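The AddHttpClient&lt;WebCrawlerService&gt;() call registers WebCrawlerService as a typed HTTP client, so the HttpClient passed to its constructor is created and managed by IHttpClientFactory.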
Now, use the services in your custom strategy to add the scraped content to the index:
// ExampleSearchIndexingStrategy.cs
private const string CRAWLER_CONTENT_FIELD_NAME = "Content";
private readonly IWebPageQueryResultMapper webPageMapper;
private readonly IContentQueryExecutor queryExecutor;
private readonly WebScraperHtmlSanitizer htmlSanitizer;
private readonly WebCrawlerService webCrawler;
public ExampleSearchIndexingStrategy(
IWebPageQueryResultMapper webPageMapper,
IContentQueryExecutor queryExecutor,
WebScraperHtmlSanitizer htmlSanitizer,
WebCrawlerService webCrawler
)
{
this.webPageMapper = webPageMapper;
this.queryExecutor = queryExecutor;
this.htmlSanitizer = htmlSanitizer;
this.webCrawler = webCrawler;
}
public override async Task<Document?> MapToLuceneDocumentOrNull(IIndexEventItemModel item)
{
// Setup same as examples in Usage-Guide.md
// ...
string content = string.Empty;
if (item is IndexEventWebPageItemModel webpageItem &&
string.Equals(webpageItem.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
{
// ... (retrieve the ArticlePage instance 'page' with queryExecutor and webPageMapper, as in Usage-Guide.md)
string rawContent = await webCrawler.CrawlWebPage(page);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
}
// Add the scraped content
document.Add(new TextField(CRAWLER_CONTENT_FIELD_NAME, content, Field.Store.NO));
// Set other fields
// ...
return document;
}
// ...