Merge pull request #11 from hashbangcode/10_robots_parse
10: Added ability to parse a given robots.txt file into exclusion rules
philipnorton42 authored Feb 24, 2024
2 parents 8c599ee + 8d22bd3 commit 4df169c
Showing 10 changed files with 224 additions and 10 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -88,6 +88,17 @@ To prevent anything on the external site `https://www.example2.org` being used.

`php application.php sc:run https://www.example.com/ --exclude='https://www.example2.org/*'`

### Robots

Pass a robots.txt file to the script using `--robots` (or `-t` for short). This will download and parse the robots.txt
file into a set of exclusion rules.

`php application.php sc:run https://www.example.com/ --robots='https://www.example.org/robots.txt'`

Note that the robots.txt parser only uses `Disallow` rules defined under the `User-agent: *` entry; `Allow` rules and agent-specific sections are ignored.

This option can be used in conjunction with the exclude flag to add to the spider exclusion rules.
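
For example, to apply the robots.txt rules alongside an explicit exclusion (both URLs are placeholders):

`php application.php sc:run https://www.example.com/ --robots='https://www.example.com/robots.txt' --exclude='https://www.example2.org/*'`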

## Testing

Run `./vendor/bin/phpunit` to run the phpunit tests. All web requests are mocked within the unit tests.
20 changes: 19 additions & 1 deletion src/Command/SitemapChecker.php
@@ -15,6 +15,8 @@
use Hashbangcode\SitemapChecker\ResultRender\PlainResultRender;
use Hashbangcode\SitemapChecker\ResultRender\HtmlResultRender;
use Hashbangcode\SitemapChecker\ResultRender\XmlResultRender;
use Hashbangcode\SitemapChecker\RobotsTxtParser;
use Hashbangcode\SitemapChecker\Source\RobotsTxtSource;
use Hashbangcode\SitemapChecker\Source\SitemapXmlSource;
use Hashbangcode\SitemapChecker\Url\UrlCollection;
use HeadlessChromium\BrowserFactory;
@@ -70,7 +72,8 @@ protected function configure(): void
$this->addOption('result-file', 'r', InputOption::VALUE_OPTIONAL, 'The output file.');
$this->addOption('limit', 'l', InputOption::VALUE_OPTIONAL, 'Limit the number of URLs polled.', -1);
$this->addOption('engine', 'e', InputOption::VALUE_OPTIONAL, 'The engine to use, defaults to guzzle.', 'guzzle');
$this->addOption('exclude', 'x', InputOption::VALUE_OPTIONAL, 'A set of URLs to exclude.', '');
$this->addOption('exclude', 'x', InputOption::VALUE_OPTIONAL, 'A set of URLs to exclude.');
$this->addOption('robots', 't', InputOption::VALUE_OPTIONAL, 'A robots.txt file to download and use as exclusion rules.');
}

protected function execute(InputInterface $input, OutputInterface $output)
@@ -92,6 +95,8 @@ protected function execute(InputInterface $input, OutputInterface $output)

$io = new SymfonyStyle($input, $output);

$robots = $input->getOption('robots');

if (is_string($sitemap) === FALSE || filter_var($sitemap, FILTER_VALIDATE_URL) === FALSE) {
$io->error('Invalid sitemap URL found.');
return Command::INVALID;
@@ -107,6 +112,19 @@

$client = $this->getClient();

// Include the robots.txt file exclusion rules.
if (is_string($robots)) {
if (filter_var($robots, FILTER_VALIDATE_URL) === FALSE) {
$io->error('Invalid robots.txt URL passed.');
return Command::INVALID;
}
$robotsTxtSource = new RobotsTxtSource($client);
$robotsTxt = $robotsTxtSource->fetch($robots);
$robotsTxtParse = new RobotsTxtParser();
$robotsTxtRules = $robotsTxtParse->parse($robotsTxt, str_replace('/sitemap.xml', '', $sitemap));
$exclude = array_merge($exclude, $robotsTxtRules);
}

$sitemapSource = new SitemapXmlSource($client);
try {
$sitemapData = $sitemapSource->fetch($sitemap);
38 changes: 38 additions & 0 deletions src/RobotsTxtParser.php
@@ -0,0 +1,38 @@
<?php

namespace Hashbangcode\SitemapChecker;

class RobotsTxtParser implements RobotsTxtParserInterface {

public function parse(string $robotsTxt, string $rootUrl): array
{
$rules = [];

$genericUserAgentFound = FALSE;

$lines = explode("\n", $robotsTxt);

foreach ($lines as $line) {
if (str_starts_with($line, '#') || trim($line) == '') {
continue;
}
if (str_contains($line, 'User-agent: *')) {
$genericUserAgentFound = true;
continue;
} elseif ($genericUserAgentFound === true && str_contains($line, 'User-agent: ')) {
$genericUserAgentFound = false;
continue;
}
if ($genericUserAgentFound === false) {
continue;
}
if (str_starts_with($line, 'Disallow: ')) {
$line = str_replace('Disallow: ', '', $line);
$rules[] = $rootUrl . trim($line);
$rules[] = $rootUrl . trim($line) . '*';
}
}

return $rules;
}
}
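
As a rough usage sketch (the file path and domain below are illustrative, not part of the commit), each `Disallow` line under `User-agent: *` is expanded into two exclusion rules, the bare path and the path with a trailing wildcard:

<?php

use Hashbangcode\SitemapChecker\RobotsTxtParser;

// Read a robots.txt file from disk; the command fetches it over HTTP instead.
$robotsTxt = file_get_contents('/path/to/robots.txt');

$parser = new RobotsTxtParser();
// 'Disallow: /core/' becomes 'https://www.example.com/core/' and
// 'https://www.example.com/core/*', so the 22 Disallow lines in the
// tests/data/robots1.txt fixture produce the 44 rules asserted in the tests.
$rules = $parser->parse($robotsTxt, 'https://www.example.com');
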
19 changes: 19 additions & 0 deletions src/RobotsTxtParserInterface.php
@@ -0,0 +1,19 @@
<?php

namespace Hashbangcode\SitemapChecker;

interface RobotsTxtParserInterface {

/**
* Parse a robots.txt file contents to extract the exclusion rules.
*
* @param string $robotsTxt
* The contents of a robots.txt file.
* @param string $rootUrl
* The root domain to prepend to each of the rules.
*
* @return array<string>
* The array of exclusion rules.
*/
public function parse(string $robotsTxt, string $rootUrl): array;
}
7 changes: 7 additions & 0 deletions src/Source/RobotsTxtSource.php
@@ -0,0 +1,7 @@
<?php

namespace Hashbangcode\SitemapChecker\Source;

class RobotsTxtSource extends SourceBase
{
}
8 changes: 8 additions & 0 deletions src/Source/SourceBase.php
@@ -3,6 +3,7 @@
namespace Hashbangcode\SitemapChecker\Source;

use GuzzleHttp\ClientInterface;
use GuzzleHttp\Psr7\Request;

abstract class SourceBase implements SourceInterface
{
@@ -15,4 +16,11 @@ public function __construct(ClientInterface $client)
{
$this->client = $client;
}

public function fetch(string $sourceFile): string
{
$request = new Request('GET', $sourceFile);
$response = $this->client->send($request);
return (string) $response->getBody();
}
}
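
A minimal sketch of using the shared `fetch()` method through the new `RobotsTxtSource` (the client construction and URL are assumptions for illustration; the unit tests inject a mocked client instead):

<?php

use GuzzleHttp\Client;
use Hashbangcode\SitemapChecker\Source\RobotsTxtSource;

// Any GuzzleHttp\ClientInterface implementation can be passed to the source.
$client = new Client();

$source = new RobotsTxtSource($client);
// fetch() issues a GET request and returns the response body as a string.
$robotsTxt = $source->fetch('https://www.example.com/robots.txt');
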
10 changes: 1 addition & 9 deletions src/Source/UrlListSource.php
@@ -2,14 +2,6 @@

namespace Hashbangcode\SitemapChecker\Source;

-use GuzzleHttp\Psr7\Request;
+class UrlListSource extends SourceBase {

-class UrlListSource extends SourceBase
-{
-public function fetch(string $sourceFile): string
-{
-$request = new Request('GET', $sourceFile);
-$response = $this->client->send($request);
-return (string) $response->getBody();
-}
}
24 changes: 24 additions & 0 deletions tests/RobotsParserTest.php
@@ -0,0 +1,24 @@
<?php

namespace Hashbangcode\SitemapChecker\Test;

use Hashbangcode\SitemapChecker\RobotsTxtParser;
use PHPUnit\Framework\TestCase;

class RobotsParserTest extends TestCase {

public function testRobotsParserParsesFileIntoRules()
{
$robotsTxt = realpath(__DIR__ . '/data/robots1.txt');
$robotsTxt = file_get_contents($robotsTxt);

$domain = 'https://www.example.com';

$robotsParser = new RobotsTxtParser();
$rules = $robotsParser->parse($robotsTxt, $domain);

$this->assertEquals(44, count($rules));
$this->assertEquals($domain . '/core/', $rules[0]);
$this->assertEquals($domain . '/index.php/user/logout/*', $rules[43]);
}
}
36 changes: 36 additions & 0 deletions tests/Source/RobotsTxtSourceTest.php
@@ -0,0 +1,36 @@
<?php

namespace Hashbangcode\SitemapChecker\Test\Source;

use GuzzleHttp\Client;
use GuzzleHttp\Handler\MockHandler;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Psr7\Response;
use Hashbangcode\SitemapChecker\Parser\UrlListParser;
use Hashbangcode\SitemapChecker\RobotsTxtParser;
use Hashbangcode\SitemapChecker\Source\RobotsTxtSource;
use Hashbangcode\SitemapChecker\Source\UrlListSource;
use PHPUnit\Framework\TestCase;

class RobotsTxtSourceTest extends TestCase {

public function testRobotsTxtSourceCreatesValidRobotsTxtList()
{
$robotsTxt = realpath(__DIR__ . '/../data/robots1.txt');
$robotsTxt = file_get_contents($robotsTxt);

$mock = new MockHandler([
new Response(200, ['Content-Type' => 'txt'], $robotsTxt),
]);
$handlerStack = HandlerStack::create($mock);
$httpClient = new Client(['handler' => $handlerStack]);

$robotsTxtSource = new RobotsTxtSource($httpClient);
$robotsTxtString = $robotsTxtSource->fetch('');

$robotsParse = new RobotsTxtParser();
$result = $robotsParse->parse($robotsTxtString, 'https://www.example.com');

$this->assertEquals(44, count($result));
}
}
61 changes: 61 additions & 0 deletions tests/data/robots1.txt
@@ -0,0 +1,61 @@
#
# robots.txt
#
# This file is to prevent the crawling and indexing of certain parts
# of your site by web crawlers and spiders run by sites like Yahoo!
# and Google. By telling these "robots" where not to go on your site,
# you save bandwidth and server resources.
#
# This file will be ignored unless it is at the root of your host:
# Used: http://example.com/robots.txt
# Ignored: http://example.com/site/robots.txt
#
# For more information about the robots.txt standard, see:
# http://www.robotstxt.org/robotstxt.html

User-agent: *
# CSS, JS, Images
Allow: /core/*.css$
Allow: /core/*.css?
Allow: /core/*.js$
Allow: /core/*.js?
Allow: /core/*.gif
Allow: /core/*.jpg
Allow: /core/*.jpeg
Allow: /core/*.png
Allow: /core/*.svg
Allow: /profiles/*.css$
Allow: /profiles/*.css?
Allow: /profiles/*.js$
Allow: /profiles/*.js?
Allow: /profiles/*.gif
Allow: /profiles/*.jpg
Allow: /profiles/*.jpeg
Allow: /profiles/*.png
Allow: /profiles/*.svg
# Directories
Disallow: /core/
Disallow: /profiles/
# Files
Disallow: /README.txt
Disallow: /web.config
# Paths (clean URLs)
Disallow: /admin/
Disallow: /comment/reply/
Disallow: /filter/tips
Disallow: /node/add/
Disallow: /search/
Disallow: /user/register/
Disallow: /user/password/
Disallow: /user/login/
Disallow: /user/logout/
# Paths (no clean URLs)
Disallow: /index.php/admin/
Disallow: /index.php/comment/reply/
Disallow: /index.php/filter/tips
Disallow: /index.php/node/add/
Disallow: /index.php/search/
Disallow: /index.php/user/password/
Disallow: /index.php/user/register/
Disallow: /index.php/user/login/
Disallow: /index.php/user/logout/
