Skip to content

Commit

Permalink
Added ability to click inside the browser
Browse files Browse the repository at this point in the history
  • Loading branch information
Marcel0024 committed Jul 11, 2024
1 parent 30cbfca commit bd3e1fb
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 7 deletions.
4 changes: 2 additions & 2 deletions CocoCrawler/CocoCrawler.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Version>1.0.3</Version>
<Version>1.0.4</Version>
<Authors>Marcel0024</Authors>
<Description>An easy to use Crawler to parse websites. Supports Headless and Headfull.</Description>
<Description>A declarative and easy-to-use web crawler and scraper in C#. Supports Headless and Headfull.</Description>
<Title>CocoCrawler</Title>
<PackageTags>Crawler;Scraper;Coco;Parser;Builder;Site;Chrome</PackageTags>
<RepositoryUrl>https://github.com/Marcel0024/CocoCrawler</RepositoryUrl>
Expand Down
3 changes: 2 additions & 1 deletion CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
public enum PageActionType
{
ScrollToEnd,
Wait
Wait,
Click
}
7 changes: 7 additions & 0 deletions CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ public PageActionsBuilder Wait(int milliseconds)
return this;
}

/// <summary>
/// Queues a Click page action for the element matched by <paramref name="selector"/>.
/// </summary>
/// <param name="selector">CSS selector of the element to click.</param>
/// <returns>The same builder instance, for fluent chaining.</returns>
public PageActionsBuilder Click(string selector)
{
    var clickAction = new PageAction(PageActionType.Click, selector);
    Actions.Add(clickAction);

    return this;
}

internal PageActions Build()
{
if (Actions.Count == 0)
Expand Down
3 changes: 3 additions & 0 deletions CocoCrawler/Crawler/PuppeteerCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ protected virtual async Task ExecutePageActions(IPage page, PageActions? browser
case PageActionType.Wait:
await Task.Delay(Convert.ToInt32(action.Parameters));
break;
case PageActionType.Click:
await page.ClickAsync(action.Parameters);
break;
}
}
}
Expand Down
21 changes: 17 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ var crawlerEngine = await new CrawlerEngineBuilder()
new("Url","a.title", "href")
])
.AddPagination("span.next-button > a")
.ConfigurePageActions(options => // Only to demonstrate the available actions; not needed to run this sample
{
options.ScrollToEnd();
options.Wait(2000);
// options.Click("span.next-button > a");
})
.AddOutputToConsole()
.AddOutputToCsvFile("results.csv")
)
Expand Down Expand Up @@ -140,9 +146,9 @@ It's possible to add multiple pages to scrape with the same Tasks.
This example starts at `https://old.reddit.com/r/csharp` and `https://old.reddit.com/r/dotnet` and opens each post and scrapes the title, url, upvotes and top comment. It also scrolls to the end of the page and waits 4 seconds before scraping the page. And then it continues with the next pagination page.

## Page Browser Actions
## PageActions - A way to interact with the browser

It's possible to add browser actions to each page. The following actions are available:
Page Actions are a way to interact with the browser — for example to click away popups or scroll to the bottom of a page. They can be added to each page. The following actions are available:

```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
Expand All @@ -154,6 +160,7 @@ var crawlerEngine = await new CrawlerEngineBuilder()
.ConfigurePageActions(ops =>
{
ops.ScrollToEnd();
ops.Click("button#load-more");
ops.Wait(4000);
});
.BuildAsync(cancellationToken);
Expand Down Expand Up @@ -219,7 +226,10 @@ var crawlerEngine = await new CrawlerEngineBuilder()
```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
.AddPage(...)
.WithUserAgent("linux browser - example user agent")
.ConfigureEngine(options =>
{
options.WithUserAgent("linux browser - example user agent");
})
.BuildAsync(cancellationToken);
```
Default User Agent is from Chrome browser.
Expand All @@ -229,7 +239,10 @@ Default User Agent is from Chrome browser.
```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
.AddPage(...)
.WithIgnoreUrls(["https://example.com", "https://example2.com"]))
.ConfigureEngine(options =>
{
options.WithIgnoreUrls(["https://example.com", "https://example2.com"]);
})
.BuildAsync(cancellationToken);
```

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
using CocoCrawler.Builders;
using CocoCrawler.Scheduler;
using FluentAssertions;
using WireMock.Server;

namespace CocoCrawler.IntegrationTests.Scenarios.OpenLinkAndClick;

[Collection(nameof(BrowserCollection))]
public class OpenLinkAndClickTests : IDisposable
{
    private readonly WireMockServer _wireMockServer = WireMockServer.Start();

    // Stop the WireMock server so its port is released when the test class is torn down
    // (xUnit calls Dispose after each test in the class).
    public void Dispose()
    {
        _wireMockServer.Stop();
        GC.SuppressFinalize(this);
    }

    [Fact]
    public async Task DocumentShould_Click_WhenCalled()
    {
        // Arrange: the start page links to a second page whose button, when clicked,
        // injects the element the crawler is configured to scrape.
        _wireMockServer.ReturnSuccessWithPage($"{_wireMockServer.Url}/clickme", GetStartPage(_wireMockServer.Url!));
        _wireMockServer.ReturnSuccessWithPage($"{_wireMockServer.Url}/next-page", GetSecondPage());

        var crawlerEngine = await new CrawlerEngineBuilder()
            .AddPage($"{_wireMockServer.Url}/clickme", pageOptions => pageOptions
                .OpenLinks("div.content > a", subPageOptions =>
                {
                    subPageOptions.ConfigurePageActions(actions =>
                    {
                        actions.Click("button#clickme");
                    });
                    subPageOptions.ExtractObject([new("Was i clicked", "div.clicked-now-scraped")]);
                })
                .AddOutputToCsvFile("clicked-results.txt", cleanOnStartup: true)
            )
            .ConfigureEngine(options => options.WithScheduler(new InMemoryScheduler(totalSecondsTimeoutAfterJob: 2)))
            .BuildAsync();

        // Act
        await crawlerEngine.RunAsync();

        // Assert: the scraped div only exists if the Click page action actually ran.
        var fileOutputContents = File.ReadAllText("clicked-results.txt");

        var expectedContents = $@"Url,Was i clicked
{_wireMockServer.Url}/next-page,Yes i was!
";

        fileOutputContents.Should().Be(expectedContents);
    }

    // Minimal start page containing the single link the crawler follows.
    private static string GetStartPage(string baseUrl)
    {
        return $@"
<html>
    <body>
        <div class=""content"">
            <a href='{baseUrl}/next-page'>Click me</a>
        </div>
    </body>
</html>";
    }

    // Second page: the button's click handler appends the div the test scrapes,
    // so finding its text proves the browser click was executed.
    private static string GetSecondPage()
    {
        return @"
<!DOCTYPE html>
<html lang=""en"" xmlns=""http://www.w3.org/1999/xhtml"">
    <body>
        <button id=""clickme""> ClickMe </button>
        <script>
            document.getElementById('clickme').addEventListener('click', function() {
                var div = document.createElement('div');
                div.className = 'clicked-now-scraped';
                div.textContent = 'Yes i was!';
                document.body.appendChild(div);
            });
        </script>
    </body>
</html>";
    }
}

0 comments on commit bd3e1fb

Please sign in to comment.