From bd3e1fb3845f5f341618f502193352e07c13d4a8 Mon Sep 17 00:00:00 2001 From: Marcel <14852157+Marcel0024@users.noreply.github.com> Date: Thu, 11 Jul 2024 20:45:12 +0200 Subject: [PATCH] Added ability to click inside the browser --- CocoCrawler/CocoCrawler.csproj | 4 +- .../PageBrowserActions/PageActionType.cs | 3 +- .../PageBrowserActions/PageActionsBuilder.cs | 7 ++ CocoCrawler/Crawler/PuppeteerCrawler.cs | 3 + README.md | 21 ++++- .../OpenLinkAndClick/OpenLinkAndClickTests.cs | 78 +++++++++++++++++++ 6 files changed, 109 insertions(+), 7 deletions(-) create mode 100644 Tests/CocoCrawler.IntegrationTests/Scenarios/OpenLinkAndClick/OpenLinkAndClickTests.cs diff --git a/CocoCrawler/CocoCrawler.csproj b/CocoCrawler/CocoCrawler.csproj index ee58877..bac07a9 100644 --- a/CocoCrawler/CocoCrawler.csproj +++ b/CocoCrawler/CocoCrawler.csproj @@ -4,9 +4,9 @@ net8.0 enable enable - 1.0.3 + 1.0.4 Marcel0024 - An easy to use Crawler to parse websites. Supports Headless and Headfull. + A declarative and easy to use web crawler and scraper in C#. Supports Headless and Headfull. 
CocoCrawler Crawler;Scraper;Coco;Parser;Builder;Site;Chrome https://github.com/Marcel0024/CocoCrawler diff --git a/CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs b/CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs index d608734..2f3cb54 100644 --- a/CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs +++ b/CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs @@ -3,5 +3,6 @@ public enum PageActionType { ScrollToEnd, - Wait + Wait, + Click } diff --git a/CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs b/CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs index cb74f5f..f12c639 100644 --- a/CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs +++ b/CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs @@ -20,6 +20,13 @@ public PageActionsBuilder Wait(int milliseconds) return this; } + public PageActionsBuilder Click(string selector) + { + Actions.Add(new PageAction(PageActionType.Click, selector)); + + return this; + } + internal PageActions Build() { if (Actions.Count == 0) diff --git a/CocoCrawler/Crawler/PuppeteerCrawler.cs b/CocoCrawler/Crawler/PuppeteerCrawler.cs index 39aa9a8..cf5e155 100644 --- a/CocoCrawler/Crawler/PuppeteerCrawler.cs +++ b/CocoCrawler/Crawler/PuppeteerCrawler.cs @@ -48,6 +48,9 @@ protected virtual async Task ExecutePageActions(IPage page, PageActions? 
browser case PageActionType.Wait: await Task.Delay(Convert.ToInt32(action.Parameters)); break; + case PageActionType.Click: + await page.ClickAsync(action.Parameters); + break; } } } diff --git a/README.md b/README.md index 03053c1..d72c78a 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,12 @@ var crawlerEngine = await new CrawlerEngineBuilder() new("Url","a.title", "href") ]) .AddPagination("span.next-button > a") + .ConfigurePageActions(options => // Only for showing the possibilities, not needed for running sample + { + options.ScrollToEnd(); + options.Wait(2000); + // options.Click("span.next-button > a"); + }) .AddOutputToConsole() .AddOutputToCsvFile("results.csv") ) @@ -140,9 +146,9 @@ It's possible to add multiple pages to scrape with the same Tasks. This example starts at `https://old.reddit.com/r/csharp` and `https://old.reddit.com/r/dotnet` and opens each post and scrapes the title, url, upvotes and top comment. It also scrolls to the end of the page and waits 4 seconds before scraping the page. And then it continues with the next pagination page. -## Page Browser Actions +## PageActions - A way to interact with the browser -It's possible to add browser actions to each page. The following actions are available: +Page Actions are a way to interact with the browser. It's possible to add page actions to each page. It's possible to click away popups, or scroll to bottom. The following actions are available: ```csharp var crawlerEngine = await new CrawlerEngineBuilder() @@ -154,6 +160,7 @@ var crawlerEngine = await new CrawlerEngineBuilder() .ConfigurePageActions(ops => { ops.ScrollToEnd(); + ops.Click("button#load-more"); ops.Wait(4000); }); .BuildAsync(cancellationToken); @@ -219,7 +226,10 @@ var crawlerEngine = await new CrawlerEngineBuilder() ```csharp var crawlerEngine = await new CrawlerEngineBuilder() .AddPage(...) 
- .WithUserAgent("linux browser - example user agent") + .ConfigureEngine(options => + { + options.WithUserAgent("linux browser - example user agent"); + }) .BuildAsync(cancellationToken); ``` Default User Agent is from Chrome browser. @@ -229,7 +239,10 @@ Default User Agent is from Chrome browser. ```csharp var crawlerEngine = await new CrawlerEngineBuilder() .AddPage(...) - .WithIgnoreUrls(["https://example.com", "https://example2.com"])) + .ConfigureEngine(options => + { + options.WithIgnoreUrls(["https://example.com", "https://example2.com"]); + }) .BuildAsync(cancellationToken); ``` diff --git a/Tests/CocoCrawler.IntegrationTests/Scenarios/OpenLinkAndClick/OpenLinkAndClickTests.cs b/Tests/CocoCrawler.IntegrationTests/Scenarios/OpenLinkAndClick/OpenLinkAndClickTests.cs new file mode 100644 index 0000000..e875cef --- /dev/null +++ b/Tests/CocoCrawler.IntegrationTests/Scenarios/OpenLinkAndClick/OpenLinkAndClickTests.cs @@ -0,0 +1,78 @@ +using CocoCrawler.Builders; +using CocoCrawler.Scheduler; +using FluentAssertions; +using WireMock.Server; + +namespace CocoCrawler.IntegrationTests.Scenarios.OpenLinkAndClick; + +[Collection(nameof(BrowserCollection))] +public class OpenLinkAndClickTests +{ + private readonly WireMockServer _wireMockServer = WireMockServer.Start(); + + [Fact] + public async Task DocumentShould_Click_WhenCalled() + { + // Arrange + _wireMockServer.ReturnSuccessWithPage($"{_wireMockServer.Url}/clickme", GeStartPage(_wireMockServer.Url!)); + _wireMockServer.ReturnSuccessWithPage($"{_wireMockServer.Url}/next-page", GetSecondPage()); + + var crawlerEngine = await new CrawlerEngineBuilder() + .AddPage($"{_wireMockServer.Url}/clickme", pageOptions => pageOptions + .OpenLinks("div.content > a", subPageOptions => + { + subPageOptions.ConfigurePageActions(actions => + { + actions.Click("button#clickme"); + }); + subPageOptions.ExtractObject([new("Was i clicked", "div.clicked-now-scraped")]); + }) + .AddOutputToCsvFile("clicked-results.txt", cleanOnStartup: 
true) + ) + .ConfigureEngine(options => options.WithScheduler(new InMemoryScheduler(totalSecondsTimeoutAfterJob: 2))) + .BuildAsync(); + + // Act + await crawlerEngine.RunAsync(); + + // Assert + var fileOutputContents = File.ReadAllText("clicked-results.txt"); + + var expectedContents = $@"Url,Was i clicked +{_wireMockServer.Url}/next-page,Yes i was! +"; + + fileOutputContents.Should().Be(expectedContents); + } + + private static string GeStartPage(string baseUrl) + { + return $@" + + +
+ Click me +
+ + "; + } + + private static string GetSecondPage() + { + return @" + + + + + + + "; + } +}