-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #11 from hashbangcode/10_robots_parse
10: Added ability to parse a given robots.txt file into exclusion rules
- Loading branch information
Showing
10 changed files
with
224 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
<?php | ||
|
||
namespace Hashbangcode\SitemapChecker; | ||
|
||
class RobotsTxtParser implements RobotsTxtParserInterface {

  /**
   * Parse a robots.txt file's contents into exclusion rules.
   *
   * Only rules inside "User-agent: *" group(s) are collected. Each
   * "Disallow:" path yields two rules: the absolute URL itself, and the
   * same URL suffixed with "*" so glob-style matchers also exclude
   * sub-paths.
   *
   * @param string $robotsTxt
   *   The contents of a robots.txt file.
   * @param string $rootUrl
   *   The root domain to prepend to each of the rules.
   *
   * @return array<string>
   *   The array of exclusion rules.
   */
  public function parse(string $robotsTxt, string $rootUrl): array
  {
    $rules = [];

    // Tracks whether the current line belongs to a "User-agent: *" group.
    $genericUserAgentFound = false;

    // Split on any line-ending style so Windows (\r\n) and old Mac (\r)
    // files parse cleanly; the original "\n"-only split left stray "\r"
    // characters on every path.
    $lines = preg_split('/\r\n|\r|\n/', $robotsTxt) ?: [];

    foreach ($lines as $line) {
      $line = trim($line);
      // Skip comments and blank lines.
      if ($line === '' || str_starts_with($line, '#')) {
        continue;
      }
      if (str_contains($line, 'User-agent: *')) {
        $genericUserAgentFound = true;
        continue;
      } elseif ($genericUserAgentFound === true && str_contains($line, 'User-agent: ')) {
        // A group for a specific user agent starts: stop collecting rules.
        $genericUserAgentFound = false;
        continue;
      }
      if ($genericUserAgentFound === false) {
        continue;
      }
      if (str_starts_with($line, 'Disallow: ')) {
        $path = trim(str_replace('Disallow: ', '', $line));
        // An empty Disallow value means "allow everything"; emitting a rule
        // for it would wrongly exclude the root URL itself.
        if ($path === '') {
          continue;
        }
        $rules[] = $rootUrl . $path;
        $rules[] = $rootUrl . $path . '*';
      }
    }

    return $rules;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
<?php | ||
|
||
namespace Hashbangcode\SitemapChecker; | ||
|
||
interface RobotsTxtParserInterface {

  /**
   * Parse a robots.txt file contents to extract the exclusion rules.
   *
   * @param string $robotsTxt
   *   The contents of a robots.txt file.
   * @param string $rootUrl
   *   The root domain to prepend to each of the rules.
   *
   * @return array<string>
   *   The array of exclusion rules; each entry is an absolute URL formed
   *   by prepending $rootUrl to a disallowed path.
   */
  public function parse(string $robotsTxt, string $rootUrl): array;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<?php | ||
|
||
namespace Hashbangcode\SitemapChecker\Source; | ||
|
||
/**
 * Source that supplies the contents of a robots.txt file.
 *
 * NOTE(review): all behaviour (construction with an HTTP client and the
 * fetch() method used by the tests) appears to be inherited from
 * SourceBase, which is not visible here — confirm against SourceBase.
 * This subclass exists to give robots.txt sources a distinct type.
 */
class RobotsTxtSource extends SourceBase
{
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
<?php | ||
|
||
namespace Hashbangcode\SitemapChecker\Test; | ||
|
||
use Hashbangcode\SitemapChecker\RobotsTxtParser; | ||
use PHPUnit\Framework\TestCase; | ||
|
||
class RobotsParserTest extends TestCase {

  /**
   * Ensures a real robots.txt fixture is parsed into the expected rules.
   */
  public function testRobotsParserParsesFileIntoRules(): void
  {
    // Fail loudly if the fixture is missing or unreadable instead of
    // passing false into the parser.
    $path = realpath(__DIR__ . '/data/robots1.txt');
    self::assertNotFalse($path, 'Fixture robots1.txt is missing.');
    $robotsTxt = file_get_contents($path);
    self::assertNotFalse($robotsTxt, 'Fixture robots1.txt could not be read.');

    $domain = 'https://www.example.com';

    $robotsParser = new RobotsTxtParser();
    $rules = $robotsParser->parse($robotsTxt, $domain);

    // The fixture has 22 Disallow lines; each expands to a path and a
    // path-with-"*" rule, giving 44 entries.
    self::assertCount(44, $rules);
    self::assertSame($domain . '/core/', $rules[0]);
    self::assertSame($domain . '/index.php/user/logout/*', $rules[43]);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
<?php | ||
|
||
namespace Hashbangcode\SitemapChecker\Test\Source; | ||
|
||
use GuzzleHttp\Client; | ||
use GuzzleHttp\Handler\MockHandler; | ||
use GuzzleHttp\HandlerStack; | ||
use GuzzleHttp\Psr7\Response; | ||
use Hashbangcode\SitemapChecker\Parser\UrlListParser; | ||
use Hashbangcode\SitemapChecker\RobotsTxtParser; | ||
use Hashbangcode\SitemapChecker\Source\RobotsTxtSource; | ||
use Hashbangcode\SitemapChecker\Source\UrlListSource; | ||
use PHPUnit\Framework\TestCase; | ||
|
||
class RobotsTxtSourceTest extends TestCase {

  /**
   * Fetches robots.txt through a mocked HTTP client and parses the rules.
   */
  public function testRobotsTxtSourceCreatesValidRobotsTxtList(): void
  {
    // Fail loudly if the fixture is missing or unreadable instead of
    // passing false into the mock response.
    $path = realpath(__DIR__ . '/../data/robots1.txt');
    self::assertNotFalse($path, 'Fixture robots1.txt is missing.');
    $robotsTxt = file_get_contents($path);
    self::assertNotFalse($robotsTxt, 'Fixture robots1.txt could not be read.');

    // Queue a single canned response so no real HTTP request is made.
    $mock = new MockHandler([
      new Response(200, ['Content-Type' => 'txt'], $robotsTxt),
    ]);
    $handlerStack = HandlerStack::create($mock);
    $httpClient = new Client(['handler' => $handlerStack]);

    $robotsTxtSource = new RobotsTxtSource($httpClient);
    $robotsTxtString = $robotsTxtSource->fetch('');

    $robotsParser = new RobotsTxtParser();
    $result = $robotsParser->parse($robotsTxtString, 'https://www.example.com');

    // 22 Disallow lines expanded into path + path-with-"*" pairs.
    self::assertCount(44, $result);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# | ||
# robots.txt | ||
# | ||
# This file is to prevent the crawling and indexing of certain parts | ||
# of your site by web crawlers and spiders run by sites like Yahoo! | ||
# and Google. By telling these "robots" where not to go on your site, | ||
# you save bandwidth and server resources. | ||
# | ||
# This file will be ignored unless it is at the root of your host: | ||
# Used: http://example.com/robots.txt | ||
# Ignored: http://example.com/site/robots.txt | ||
# | ||
# For more information about the robots.txt standard, see: | ||
# http://www.robotstxt.org/robotstxt.html | ||
|
||
User-agent: * | ||
# CSS, JS, Images | ||
Allow: /core/*.css$ | ||
Allow: /core/*.css? | ||
Allow: /core/*.js$ | ||
Allow: /core/*.js? | ||
Allow: /core/*.gif | ||
Allow: /core/*.jpg | ||
Allow: /core/*.jpeg | ||
Allow: /core/*.png | ||
Allow: /core/*.svg | ||
Allow: /profiles/*.css$ | ||
Allow: /profiles/*.css? | ||
Allow: /profiles/*.js$ | ||
Allow: /profiles/*.js? | ||
Allow: /profiles/*.gif | ||
Allow: /profiles/*.jpg | ||
Allow: /profiles/*.jpeg | ||
Allow: /profiles/*.png | ||
Allow: /profiles/*.svg | ||
# Directories | ||
Disallow: /core/ | ||
Disallow: /profiles/ | ||
# Files | ||
Disallow: /README.txt | ||
Disallow: /web.config | ||
# Paths (clean URLs) | ||
Disallow: /admin/ | ||
Disallow: /comment/reply/ | ||
Disallow: /filter/tips | ||
Disallow: /node/add/ | ||
Disallow: /search/ | ||
Disallow: /user/register/ | ||
Disallow: /user/password/ | ||
Disallow: /user/login/ | ||
Disallow: /user/logout/ | ||
# Paths (no clean URLs) | ||
Disallow: /index.php/admin/ | ||
Disallow: /index.php/comment/reply/ | ||
Disallow: /index.php/filter/tips | ||
Disallow: /index.php/node/add/ | ||
Disallow: /index.php/search/ | ||
Disallow: /index.php/user/password/ | ||
Disallow: /index.php/user/register/ | ||
Disallow: /index.php/user/login/ | ||
Disallow: /index.php/user/logout/ |