From bd3e1fb3845f5f341618f502193352e07c13d4a8 Mon Sep 17 00:00:00 2001
From: Marcel <14852157+Marcel0024@users.noreply.github.com>
Date: Thu, 11 Jul 2024 20:45:12 +0200
Subject: [PATCH] Added ability to click inside the browser
---
CocoCrawler/CocoCrawler.csproj | 4 +-
.../PageBrowserActions/PageActionType.cs | 3 +-
.../PageBrowserActions/PageActionsBuilder.cs | 7 ++
CocoCrawler/Crawler/PuppeteerCrawler.cs | 3 +
README.md | 21 ++++-
.../OpenLinkAndClick/OpenLinkAndClickTests.cs | 78 +++++++++++++++++++
6 files changed, 109 insertions(+), 7 deletions(-)
create mode 100644 Tests/CocoCrawler.IntegrationTests/Scenarios/OpenLinkAndClick/OpenLinkAndClickTests.cs
diff --git a/CocoCrawler/CocoCrawler.csproj b/CocoCrawler/CocoCrawler.csproj
index ee58877..bac07a9 100644
--- a/CocoCrawler/CocoCrawler.csproj
+++ b/CocoCrawler/CocoCrawler.csproj
@@ -4,9 +4,9 @@
net8.0
enable
enable
- 1.0.3
+ 1.0.4
Marcel0024
- An easy to use Crawler to parse websites. Supports Headless and Headfull.
+ A declarative and easy to use web crawler and scraper in C#. Supports Headless and Headfull.
CocoCrawler
Crawler;Scraper;Coco;Parser;Builder;Site;Chrome
https://github.com/Marcel0024/CocoCrawler
diff --git a/CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs b/CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs
index d608734..2f3cb54 100644
--- a/CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs
+++ b/CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs
@@ -3,5 +3,6 @@
public enum PageActionType
{
ScrollToEnd,
- Wait
+ Wait,
+ Click
}
diff --git a/CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs b/CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs
index cb74f5f..f12c639 100644
--- a/CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs
+++ b/CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs
@@ -20,6 +20,13 @@ public PageActionsBuilder Wait(int milliseconds)
return this;
}
+ public PageActionsBuilder Click(string selector)
+ {
+ Actions.Add(new PageAction(PageActionType.Click, selector));
+
+ return this;
+ }
+
internal PageActions Build()
{
if (Actions.Count == 0)
diff --git a/CocoCrawler/Crawler/PuppeteerCrawler.cs b/CocoCrawler/Crawler/PuppeteerCrawler.cs
index 39aa9a8..cf5e155 100644
--- a/CocoCrawler/Crawler/PuppeteerCrawler.cs
+++ b/CocoCrawler/Crawler/PuppeteerCrawler.cs
@@ -48,6 +48,9 @@ protected virtual async Task ExecutePageActions(IPage page, PageActions? browser
case PageActionType.Wait:
await Task.Delay(Convert.ToInt32(action.Parameters));
break;
+ case PageActionType.Click:
+ await page.ClickAsync(action.Parameters);
+ break;
}
}
}
diff --git a/README.md b/README.md
index 03053c1..d72c78a 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,12 @@ var crawlerEngine = await new CrawlerEngineBuilder()
new("Url","a.title", "href")
])
.AddPagination("span.next-button > a")
+ .ConfigurePageActions(options => // Only to show the possibilities, not needed to run the sample
+ {
+ options.ScrollToEnd();
+ options.Wait(2000);
+ // options.Click("span.next-button > a");
+ })
.AddOutputToConsole()
.AddOutputToCsvFile("results.csv")
)
@@ -140,9 +146,9 @@ It's possible to add multiple pages to scrape with the same Tasks.
This example starts at `https://old.reddit.com/r/csharp` and `https://old.reddit.com/r/dotnet` and opens each post and scrapes the title, url, upvotes and top comment. It also scrolls to the end of the page and waits 4 seconds before scraping the page. And then it continues with the next pagination page.
-## Page Browser Actions
+## PageActions - A way to interact with the browser
-It's possible to add browser actions to each page. The following actions are available:
+Page Actions are a way to interact with the browser. You can add page actions to each page — for example, to click away popups or scroll to the bottom. The following actions are available:
```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
@@ -154,6 +160,7 @@ var crawlerEngine = await new CrawlerEngineBuilder()
.ConfigurePageActions(ops =>
{
ops.ScrollToEnd();
+ ops.Click("button#load-more");
ops.Wait(4000);
});
.BuildAsync(cancellationToken);
@@ -219,7 +226,10 @@ var crawlerEngine = await new CrawlerEngineBuilder()
```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
.AddPage(...)
- .WithUserAgent("linux browser - example user agent")
+ .ConfigureEngine(options =>
+ {
+ options.WithUserAgent("linux browser - example user agent");
+ })
.BuildAsync(cancellationToken);
```
Default User Agent is from Chrome browser.
@@ -229,7 +239,10 @@ Default User Agent is from Chrome browser.
```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
.AddPage(...)
- .WithIgnoreUrls(["https://example.com", "https://example2.com"]))
+ .ConfigureEngine(options =>
+ {
+ options.WithIgnoreUrls(["https://example.com", "https://example2.com"]);
+ })
.BuildAsync(cancellationToken);
```
diff --git a/Tests/CocoCrawler.IntegrationTests/Scenarios/OpenLinkAndClick/OpenLinkAndClickTests.cs b/Tests/CocoCrawler.IntegrationTests/Scenarios/OpenLinkAndClick/OpenLinkAndClickTests.cs
new file mode 100644
index 0000000..e875cef
--- /dev/null
+++ b/Tests/CocoCrawler.IntegrationTests/Scenarios/OpenLinkAndClick/OpenLinkAndClickTests.cs
@@ -0,0 +1,78 @@
+using CocoCrawler.Builders;
+using CocoCrawler.Scheduler;
+using FluentAssertions;
+using WireMock.Server;
+
+namespace CocoCrawler.IntegrationTests.Scenarios.OpenLinkAndClick;
+
+[Collection(nameof(BrowserCollection))]
+public class OpenLinkAndClickTests
+{
+ private readonly WireMockServer _wireMockServer = WireMockServer.Start();
+
+ [Fact]
+ public async Task DocumentShould_Click_WhenCalled()
+ {
+ // Arrange
+ _wireMockServer.ReturnSuccessWithPage($"{_wireMockServer.Url}/clickme", GeStartPage(_wireMockServer.Url!));
+ _wireMockServer.ReturnSuccessWithPage($"{_wireMockServer.Url}/next-page", GetSecondPage());
+
+ var crawlerEngine = await new CrawlerEngineBuilder()
+ .AddPage($"{_wireMockServer.Url}/clickme", pageOptions => pageOptions
+ .OpenLinks("div.content > a", subPageOptions =>
+ {
+ subPageOptions.ConfigurePageActions(actions =>
+ {
+ actions.Click("button#clickme");
+ });
+ subPageOptions.ExtractObject([new("Was i clicked", "div.clicked-now-scraped")]);
+ })
+ .AddOutputToCsvFile("clicked-results.txt", cleanOnStartup: true)
+ )
+ .ConfigureEngine(options => options.WithScheduler(new InMemoryScheduler(totalSecondsTimeoutAfterJob: 2)))
+ .BuildAsync();
+
+ // Act
+ await crawlerEngine.RunAsync();
+
+ // Assert
+ var fileOutputContents = File.ReadAllText("clicked-results.txt");
+
+ var expectedContents = $@"Url,Was i clicked
+{_wireMockServer.Url}/next-page,Yes i was!
+";
+
+ fileOutputContents.Should().Be(expectedContents);
+ }
+
+ private static string GeStartPage(string baseUrl)
+ {
+ return $@"
+
+
+
+
+ ";
+ }
+
+ private static string GetSecondPage()
+ {
+ return @"
+
+
+
+
+
+
+ ";
+ }
+}