Skip to content

Commit

Permalink
Added ability to click inside the browser
Browse files Browse the repository at this point in the history
  • Loading branch information
Marcel0024 committed Jul 11, 2024
1 parent 30cbfca commit bd3e1fb
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 7 deletions.
4 changes: 2 additions & 2 deletions CocoCrawler/CocoCrawler.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Version>1.0.3</Version>
<Version>1.0.4</Version>
<Authors>Marcel0024</Authors>
<Description>An easy to use Crawler to parse websites. Supports Headless and Headfull.</Description>
<Description>A declarative and easy-to-use web crawler and scraper in C#. Supports Headless and Headfull.</Description>
<Title>CocoCrawler</Title>
<PackageTags>Crawler;Scraper;Coco;Parser;Builder;Site;Chrome</PackageTags>
<RepositoryUrl>https://github.com/Marcel0024/CocoCrawler</RepositoryUrl>
Expand Down
3 changes: 2 additions & 1 deletion CocoCrawler/CrawlJob/PageBrowserActions/PageActionType.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
public enum PageActionType
{
ScrollToEnd,
Wait
Wait,
Click
}
7 changes: 7 additions & 0 deletions CocoCrawler/CrawlJob/PageBrowserActions/PageActionsBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ public PageActionsBuilder Wait(int milliseconds)
return this;
}

/// <summary>
/// Queues a Click page action for the element matched by <paramref name="selector"/>.
/// </summary>
/// <param name="selector">CSS selector of the element to click.</param>
/// <returns>The same builder instance, for fluent chaining.</returns>
public PageActionsBuilder Click(string selector)
{
    var clickAction = new PageAction(PageActionType.Click, selector);
    Actions.Add(clickAction);

    return this;
}

internal PageActions Build()
{
if (Actions.Count == 0)
Expand Down
3 changes: 3 additions & 0 deletions CocoCrawler/Crawler/PuppeteerCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ protected virtual async Task ExecutePageActions(IPage page, PageActions? browser
case PageActionType.Wait:
await Task.Delay(Convert.ToInt32(action.Parameters));
break;
case PageActionType.Click:
await page.ClickAsync(action.Parameters);
break;
}
}
}
Expand Down
21 changes: 17 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ var crawlerEngine = await new CrawlerEngineBuilder()
new("Url","a.title", "href")
])
.AddPagination("span.next-button > a")
.ConfigurePageActions(options => // Only to demonstrate the available actions; not needed to run this sample
{
options.ScrollToEnd();
options.Wait(2000);
// options.Click("span.next-button > a");
})
.AddOutputToConsole()
.AddOutputToCsvFile("results.csv")
)
Expand Down Expand Up @@ -140,9 +146,9 @@ It's possible to add multiple pages to scrape with the same Tasks.
This example starts at `https://old.reddit.com/r/csharp` and `https://old.reddit.com/r/dotnet` and opens each post and scrapes the title, url, upvotes and top comment. It also scrolls to the end of the page and waits 4 seconds before scraping the page. And then it continues with the next pagination page.

## Page Browser Actions
## PageActions - A way to interact with the browser

It's possible to add browser actions to each page. The following actions are available:
Page Actions are a way to interact with the browser — for example to click away popups or scroll to the bottom of a page. They can be added to each page. The following actions are available:

```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
Expand All @@ -154,6 +160,7 @@ var crawlerEngine = await new CrawlerEngineBuilder()
.ConfigurePageActions(ops =>
{
ops.ScrollToEnd();
ops.Click("button#load-more");
ops.Wait(4000);
});
.BuildAsync(cancellationToken);
Expand Down Expand Up @@ -219,7 +226,10 @@ var crawlerEngine = await new CrawlerEngineBuilder()
```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
.AddPage(...)
.WithUserAgent("linux browser - example user agent")
.ConfigureEngine(options =>
{
options.WithUserAgent("linux browser - example user agent");
})
.BuildAsync(cancellationToken);
```
Default User Agent is from Chrome browser.
Expand All @@ -229,7 +239,10 @@ Default User Agent is from Chrome browser.
```csharp
var crawlerEngine = await new CrawlerEngineBuilder()
.AddPage(...)
.WithIgnoreUrls(["https://example.com", "https://example2.com"]))
.ConfigureEngine(options =>
{
options.WithIgnoreUrls(["https://example.com", "https://example2.com"]);
})
.BuildAsync(cancellationToken);
```

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
using CocoCrawler.Builders;
using CocoCrawler.Scheduler;
using FluentAssertions;
using WireMock.Server;

namespace CocoCrawler.IntegrationTests.Scenarios.OpenLinkAndClick;

[Collection(nameof(BrowserCollection))]
public class OpenLinkAndClickTests : IDisposable
{
    private readonly WireMockServer _wireMockServer = WireMockServer.Start();

    // Stop the WireMock server so its port is released when the test class is torn down
    // (xUnit calls Dispose after each test in the class).
    public void Dispose()
    {
        _wireMockServer.Stop();
        GC.SuppressFinalize(this);
    }

    [Fact]
    public async Task DocumentShould_Click_WhenCalled()
    {
        // Arrange: the start page links to a second page whose button, when clicked,
        // injects the element the crawler is configured to scrape.
        _wireMockServer.ReturnSuccessWithPage($"{_wireMockServer.Url}/clickme", GetStartPage(_wireMockServer.Url!));
        _wireMockServer.ReturnSuccessWithPage($"{_wireMockServer.Url}/next-page", GetSecondPage());

        var crawlerEngine = await new CrawlerEngineBuilder()
            .AddPage($"{_wireMockServer.Url}/clickme", pageOptions => pageOptions
                .OpenLinks("div.content > a", subPageOptions =>
                {
                    subPageOptions.ConfigurePageActions(actions =>
                    {
                        actions.Click("button#clickme");
                    });
                    subPageOptions.ExtractObject([new("Was i clicked", "div.clicked-now-scraped")]);
                })
                .AddOutputToCsvFile("clicked-results.txt", cleanOnStartup: true)
            )
            .ConfigureEngine(options => options.WithScheduler(new InMemoryScheduler(totalSecondsTimeoutAfterJob: 2)))
            .BuildAsync();

        // Act
        await crawlerEngine.RunAsync();

        // Assert: the scraped div only exists if the Click page action actually ran.
        var fileOutputContents = File.ReadAllText("clicked-results.txt");

        var expectedContents = $@"Url,Was i clicked
{_wireMockServer.Url}/next-page,Yes i was!
";

        fileOutputContents.Should().Be(expectedContents);
    }

    // Minimal start page containing the single link the crawler follows.
    private static string GetStartPage(string baseUrl)
    {
        return $@"
<html>
    <body>
        <div class=""content"">
            <a href='{baseUrl}/next-page'>Click me</a>
        </div>
    </body>
</html>";
    }

    // Second page: the button's click handler appends the div the test scrapes,
    // so finding its text proves the browser click was executed.
    private static string GetSecondPage()
    {
        return @"
<!DOCTYPE html>
<html lang=""en"" xmlns=""http://www.w3.org/1999/xhtml"">
    <body>
        <button id=""clickme""> ClickMe </button>
        <script>
            document.getElementById('clickme').addEventListener('click', function() {
                var div = document.createElement('div');
                div.className = 'clicked-now-scraped';
                div.textContent = 'Yes i was!';
                document.body.appendChild(div);
            });
        </script>
    </body>
</html>";
    }
}

0 comments on commit bd3e1fb

Please sign in to comment.