Merge pull request #11 from hashbangcode/10_robots_parse
10: Added ability to parse a given robots.txt file into exclusion rules
philipnorton42 authored Feb 24, 2024
2 parents 8c599ee + 8d22bd3 commit 4df169c
Showing 10 changed files with 224 additions and 10 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -88,6 +88,17 @@ To prevent anything on the external site `https://www.example2.org` being used.

`php application.php sc:run https://www.example.com/ --exclude='https://www.example2.org/*'`

### Robots

Pass a robots.txt file to the script using `--robots` (or `-t` for short). This will download and parse the robots.txt
file into a set of exclusion rules.

`php application.php sc:run https://www.example.com/ --robots='https://www.example.org/robots.txt'`

Note that the robots.txt parser only uses `Disallow` rules defined under the `User-agent: *` entry; `Allow` rules and agent-specific sections are ignored.

This option can be used in conjunction with the exclude flag to add to the spider exclusion rules.
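
For example, to apply the robots.txt rules alongside an explicit exclusion (both URLs are placeholders):

`php application.php sc:run https://www.example.com/ --robots='https://www.example.com/robots.txt' --exclude='https://www.example2.org/*'`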

## Testing

Run `./vendor/bin/phpunit` to run the phpunit tests. All web requests are mocked within the unit tests.
20 changes: 19 additions & 1 deletion src/Command/SitemapChecker.php
@@ -15,6 +15,8 @@
use Hashbangcode\SitemapChecker\ResultRender\PlainResultRender;
use Hashbangcode\SitemapChecker\ResultRender\HtmlResultRender;
use Hashbangcode\SitemapChecker\ResultRender\XmlResultRender;
use Hashbangcode\SitemapChecker\RobotsTxtParser;
use Hashbangcode\SitemapChecker\Source\RobotsTxtSource;
use Hashbangcode\SitemapChecker\Source\SitemapXmlSource;
use Hashbangcode\SitemapChecker\Url\UrlCollection;
use HeadlessChromium\BrowserFactory;
@@ -70,7 +72,8 @@ protected function configure(): void
$this->addOption('result-file', 'r', InputOption::VALUE_OPTIONAL, 'The output file.');
$this->addOption('limit', 'l', InputOption::VALUE_OPTIONAL, 'Limit the number of URLs polled.', -1);
$this->addOption('engine', 'e', InputOption::VALUE_OPTIONAL, 'The engine to use, defaults to guzzle.', 'guzzle');
$this->addOption('exclude', 'x', InputOption::VALUE_OPTIONAL, 'A set of URLs to exclude.', '');
$this->addOption('exclude', 'x', InputOption::VALUE_OPTIONAL, 'A set of URLs to exclude.');
$this->addOption('robots', 't', InputOption::VALUE_OPTIONAL, 'A robots.txt file to download and use as exclusion rules.');
}

protected function execute(InputInterface $input, OutputInterface $output)
@@ -92,6 +95,8 @@ protected function execute(InputInterface $input, OutputInterface $output)

$io = new SymfonyStyle($input, $output);

$robots = $input->getOption('robots');

if (is_string($sitemap) === FALSE || filter_var($sitemap, FILTER_VALIDATE_URL) === FALSE) {
$io->error('Invalid sitemap URL found.');
return Command::INVALID;
@@ -107,6 +112,19 @@

$client = $this->getClient();

// Include the robots.txt file exclusion rules.
if (is_string($robots)) {
if (filter_var($robots, FILTER_VALIDATE_URL) === FALSE) {
$io->error('Invalid robots.txt URL passed.');
return Command::INVALID;
}
$robotsTxtSource = new RobotsTxtSource($client);
$robotsTxt = $robotsTxtSource->fetch($robots);
$robotsTxtParse = new RobotsTxtParser();
$robotsTxtRules = $robotsTxtParse->parse($robotsTxt, str_replace('/sitemap.xml', '', $sitemap));
$exclude = array_merge($exclude, $robotsTxtRules);
}

$sitemapSource = new SitemapXmlSource($client);
try {
$sitemapData = $sitemapSource->fetch($sitemap);
38 changes: 38 additions & 0 deletions src/RobotsTxtParser.php
@@ -0,0 +1,38 @@
<?php

namespace Hashbangcode\SitemapChecker;

class RobotsTxtParser implements RobotsTxtParserInterface {

public function parse(string $robotsTxt, string $rootUrl): array
{
$rules = [];

$genericUserAgentFound = FALSE;

$lines = explode("\n", $robotsTxt);

foreach ($lines as $line) {
if (str_starts_with($line, '#') || trim($line) == '') {
continue;
}
if (str_contains($line, 'User-agent: *')) {
$genericUserAgentFound = true;
continue;
} elseif ($genericUserAgentFound === true && str_contains($line, 'User-agent: ')) {
$genericUserAgentFound = false;
continue;
}
if ($genericUserAgentFound === false) {
continue;
}
if (str_starts_with($line, 'Disallow: ')) {
$line = str_replace('Disallow: ', '', $line);
$rules[] = $rootUrl . trim($line);
$rules[] = $rootUrl . trim($line) . '*';
}
}

return $rules;
}
}
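
As a rough usage sketch (the file path and domain below are illustrative, not part of the commit), each `Disallow` line under `User-agent: *` is expanded into two exclusion rules, the bare path and the path with a trailing wildcard:

<?php

use Hashbangcode\SitemapChecker\RobotsTxtParser;

// Read a robots.txt file from disk; the command fetches it over HTTP instead.
$robotsTxt = file_get_contents('/path/to/robots.txt');

$parser = new RobotsTxtParser();
// 'Disallow: /core/' becomes 'https://www.example.com/core/' and
// 'https://www.example.com/core/*', so the 22 Disallow lines in the
// tests/data/robots1.txt fixture produce the 44 rules asserted in the tests.
$rules = $parser->parse($robotsTxt, 'https://www.example.com');
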
19 changes: 19 additions & 0 deletions src/RobotsTxtParserInterface.php
@@ -0,0 +1,19 @@
<?php

namespace Hashbangcode\SitemapChecker;

interface RobotsTxtParserInterface {

/**
* Parse a robots.txt file contents to extract the exclusion rules.
*
* @param string $robotsTxt
* The contents of a robots.txt file.
* @param string $rootUrl
* The root domain to prepend to each of the rules.
*
* @return array<string>
* The array of exclusion rules.
*/
public function parse(string $robotsTxt, string $rootUrl): array;
}
7 changes: 7 additions & 0 deletions src/Source/RobotsTxtSource.php
@@ -0,0 +1,7 @@
<?php

namespace Hashbangcode\SitemapChecker\Source;

class RobotsTxtSource extends SourceBase
{
}
8 changes: 8 additions & 0 deletions src/Source/SourceBase.php
@@ -3,6 +3,7 @@
namespace Hashbangcode\SitemapChecker\Source;

use GuzzleHttp\ClientInterface;
use GuzzleHttp\Psr7\Request;

abstract class SourceBase implements SourceInterface
{
@@ -15,4 +16,11 @@ public function __construct(ClientInterface $client)
{
$this->client = $client;
}

public function fetch(string $sourceFile): string
{
$request = new Request('GET', $sourceFile);
$response = $this->client->send($request);
return (string) $response->getBody();
}
}
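
A minimal sketch of using the shared `fetch()` method through the new `RobotsTxtSource` (the client construction and URL are assumptions for illustration; the unit tests inject a mocked client instead):

<?php

use GuzzleHttp\Client;
use Hashbangcode\SitemapChecker\Source\RobotsTxtSource;

// Any GuzzleHttp\ClientInterface implementation can be passed to the source.
$client = new Client();

$source = new RobotsTxtSource($client);
// fetch() issues a GET request and returns the response body as a string.
$robotsTxt = $source->fetch('https://www.example.com/robots.txt');
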
10 changes: 1 addition & 9 deletions src/Source/UrlListSource.php
@@ -2,14 +2,6 @@

namespace Hashbangcode\SitemapChecker\Source;

-use GuzzleHttp\Psr7\Request;
+class UrlListSource extends SourceBase {

-class UrlListSource extends SourceBase
-{
-public function fetch(string $sourceFile): string
-{
-$request = new Request('GET', $sourceFile);
-$response = $this->client->send($request);
-return (string) $response->getBody();
-}
}
24 changes: 24 additions & 0 deletions tests/RobotsParserTest.php
@@ -0,0 +1,24 @@
<?php

namespace Hashbangcode\SitemapChecker\Test;

use Hashbangcode\SitemapChecker\RobotsTxtParser;
use PHPUnit\Framework\TestCase;

class RobotsParserTest extends TestCase {

public function testRobotsParserParsesFileIntoRules()
{
$robotsTxt = realpath(__DIR__ . '/data/robots1.txt');
$robotsTxt = file_get_contents($robotsTxt);

$domain = 'https://www.example.com';

$robotsParser = new RobotsTxtParser();
$rules = $robotsParser->parse($robotsTxt, $domain);

$this->assertEquals(44, count($rules));
$this->assertEquals($domain . '/core/', $rules[0]);
$this->assertEquals($domain . '/index.php/user/logout/*', $rules[43]);
}
}
36 changes: 36 additions & 0 deletions tests/Source/RobotsTxtSourceTest.php
@@ -0,0 +1,36 @@
<?php

namespace Hashbangcode\SitemapChecker\Test\Source;

use GuzzleHttp\Client;
use GuzzleHttp\Handler\MockHandler;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Psr7\Response;
use Hashbangcode\SitemapChecker\Parser\UrlListParser;
use Hashbangcode\SitemapChecker\RobotsTxtParser;
use Hashbangcode\SitemapChecker\Source\RobotsTxtSource;
use Hashbangcode\SitemapChecker\Source\UrlListSource;
use PHPUnit\Framework\TestCase;

class RobotsTxtSourceTest extends TestCase {

public function testRobotsTxtSourceCreatesValidRobotsTxtList()
{
$robotsTxt = realpath(__DIR__ . '/../data/robots1.txt');
$robotsTxt = file_get_contents($robotsTxt);

$mock = new MockHandler([
new Response(200, ['Content-Type' => 'txt'], $robotsTxt),
]);
$handlerStack = HandlerStack::create($mock);
$httpClient = new Client(['handler' => $handlerStack]);

$robotsTxtSource = new RobotsTxtSource($httpClient);
$robotsTxtString = $robotsTxtSource->fetch('');

$robotsParse = new RobotsTxtParser();
$result = $robotsParse->parse($robotsTxtString, 'https://www.example.com');

$this->assertEquals(44, count($result));
}
}
61 changes: 61 additions & 0 deletions tests/data/robots1.txt
@@ -0,0 +1,61 @@
#
# robots.txt
#
# This file is to prevent the crawling and indexing of certain parts
# of your site by web crawlers and spiders run by sites like Yahoo!
# and Google. By telling these "robots" where not to go on your site,
# you save bandwidth and server resources.
#
# This file will be ignored unless it is at the root of your host:
# Used: http://example.com/robots.txt
# Ignored: http://example.com/site/robots.txt
#
# For more information about the robots.txt standard, see:
# http://www.robotstxt.org/robotstxt.html

User-agent: *
# CSS, JS, Images
Allow: /core/*.css$
Allow: /core/*.css?
Allow: /core/*.js$
Allow: /core/*.js?
Allow: /core/*.gif
Allow: /core/*.jpg
Allow: /core/*.jpeg
Allow: /core/*.png
Allow: /core/*.svg
Allow: /profiles/*.css$
Allow: /profiles/*.css?
Allow: /profiles/*.js$
Allow: /profiles/*.js?
Allow: /profiles/*.gif
Allow: /profiles/*.jpg
Allow: /profiles/*.jpeg
Allow: /profiles/*.png
Allow: /profiles/*.svg
# Directories
Disallow: /core/
Disallow: /profiles/
# Files
Disallow: /README.txt
Disallow: /web.config
# Paths (clean URLs)
Disallow: /admin/
Disallow: /comment/reply/
Disallow: /filter/tips
Disallow: /node/add/
Disallow: /search/
Disallow: /user/register/
Disallow: /user/password/
Disallow: /user/login/
Disallow: /user/logout/
# Paths (no clean URLs)
Disallow: /index.php/admin/
Disallow: /index.php/comment/reply/
Disallow: /index.php/filter/tips
Disallow: /index.php/node/add/
Disallow: /index.php/search/
Disallow: /index.php/user/password/
Disallow: /index.php/user/register/
Disallow: /index.php/user/login/
Disallow: /index.php/user/logout/
