Skip to content

Commit

Permalink
Merge pull request #9 from hashbangcode/5_exclusion
Browse files Browse the repository at this point in the history
5: Adding exclusion rules
  • Loading branch information
philipnorton42 authored Feb 18, 2024
2 parents d97a9af + f6100d0 commit a9f0e66
Show file tree
Hide file tree
Showing 11 changed files with 216 additions and 13 deletions.
24 changes: 22 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,39 @@ For example, this will only process 10 results, regardless of the number of URLs

`php application.php sc:run -l 10 https://www.example.com/sitemap.xml`

### engine
### Engine

The `--engine` option (or `-e` for short) changes the type of checking engine used.

Options are:
- 'guzzle' : (Default) Run the sitemap checker using Guzzle promises.
- 'chrome' : Run the sitemap checker using headless Chrome. To get this running you'll first need to add the
chrome binary to the location `./chrome/chrome`.
chrome binary to the location `./chrome/chrome` (i.e. within the package).

For example, to change the sitemap checker engine to use headless Chrome use the following.

`php application.php sc:run -e chrome https://www.example.com/sitemap.xml`

### Exclude

Pass a list of URLs to exclude using the `--exclude` (or `-x` for short) flag. This will prevent URLs from being added
to the collections and checked. This can be a comma separated list of URLs to exclude. Wildcards can also be used to
prevent certain inner URLs from being used.

Some examples:

To prevent the path `https://www.example.com/some-page` being used.

`php application.php sc:run https://www.example.com/ --exclude='https://www.example.com/some-page'`

To prevent anything in `https://www.example.com/sub-dir1` and `https://www.example.com/sub-dir2` from being used:

`php application.php sc:run https://www.example.com/ --exclude='https://www.example.com/sub-dir1/*,https://www.example.com/sub-dir2/*'`

To prevent anything on the external site `https://www.example2.org` being used.

`php application.php sc:run https://www.example.com/ --exclude='https://www.example2.org/*'`

## Testing

Run `./vendor/bin/phpunit` to run the phpunit tests. All web requests are mocked within the unit tests.
Expand Down
8 changes: 7 additions & 1 deletion phpunit.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/10.1/phpunit.xsd" bootstrap="vendor/autoload.php" beStrictAboutOutputDuringTests="true" beStrictAboutChangesToGlobalState="true" colors="true">
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/10.1/phpunit.xsd"
bootstrap="vendor/autoload.php"
beStrictAboutOutputDuringTests="true"
beStrictAboutChangesToGlobalState="true"
displayDetailsOnTestsThatTriggerWarnings="true"
colors="true">
<php>
<env name="COLUMNS" value="300"/>
<server name="KERNEL_CLASS" value="\Hashbangcode\SitemapChecker\AppKernel"/>
Expand Down
25 changes: 23 additions & 2 deletions src/Command/SitemapChecker.php
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,26 @@ protected function configure(): void
$this->addOption('result-file', 'r', InputOption::VALUE_OPTIONAL, 'The output file.');
$this->addOption('limit', 'l', InputOption::VALUE_OPTIONAL, 'Limit the number of URLs polled.', -1);
$this->addOption('engine', 'e', InputOption::VALUE_OPTIONAL, 'The engine to use, defaults to guzzle.', 'guzzle');
$this->addOption('exclude', 'x', InputOption::VALUE_OPTIONAL, 'A set of URLs to exclude.', '');
}

protected function execute(InputInterface $input, OutputInterface $output)
{
$sitemap = $input->getArgument('sitemap');
$limit = (int) $input->getOption('limit');
$limit = $input->getOption('limit');
if (is_numeric($limit)) {
$limit = (int) $limit;
}
$engine = $input->getOption('engine');

$exclude = $input->getOption('exclude');
if (is_string($exclude)) {
$exclude = array_filter(explode(',', $exclude));
}
else {
$exclude = [];
}

$io = new SymfonyStyle($input, $output);

if (is_string($sitemap) === FALSE || filter_var($sitemap, FILTER_VALIDATE_URL) === FALSE) {
Expand Down Expand Up @@ -111,6 +123,10 @@ protected function execute(InputInterface $input, OutputInterface $output)

$list = new UrlCollection();

if (count($exclude) > 0) {
$list->setExclusionRules($exclude);
}

foreach ($sitemapList as $sitemapUrl) {
$sitemapData = $sitemapSource->fetch($sitemapUrl->getRawUrl());
$sitemapParser = new SitemapXmlParser();
Expand All @@ -121,7 +137,12 @@ protected function execute(InputInterface $input, OutputInterface $output)
}
} else {
$sitemapParser = new SitemapXmlParser();
$list = $sitemapParser->parse($sitemapData);
$list = $sitemapParser->parse($sitemapData, $exclude);
}

if ($list->count() === 0) {
$output->writeln('No URLs found.');
return Command::SUCCESS;
}

if ($limit !== -1) {
Expand Down
7 changes: 6 additions & 1 deletion src/Parser/ParserInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,10 @@

interface ParserInterface
{
public function parse(string $data): UrlCollectionInterface;
/**
* @param string $data
* @param array<string> $exclusionRules
* @return UrlCollectionInterface
*/
public function parse(string $data, array $exclusionRules = []): UrlCollectionInterface;
}
6 changes: 5 additions & 1 deletion src/Parser/SitemapIndexXmlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@

class SitemapIndexXmlParser extends ParserBase
{
public function parse(string $data): UrlCollectionInterface
public function parse(string $data, array $exclusionRules = []): UrlCollectionInterface
{
$linkCollection = new UrlCollection();

if (count($exclusionRules) > 0) {
$linkCollection->setExclusionRules($exclusionRules);
}

$xml = simplexml_load_string($data, null, LIBXML_NOWARNING | LIBXML_NOERROR);

if (isset($xml->sitemap) && count($xml->sitemap) > 0) {
Expand Down
6 changes: 5 additions & 1 deletion src/Parser/SitemapXmlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@

class SitemapXmlParser extends ParserBase
{
public function parse(string $data): UrlCollectionInterface
public function parse(string $data, array $exclusionRules = []): UrlCollectionInterface
{
$linkCollection = new UrlCollection();

if (count($exclusionRules) > 0) {
$linkCollection->setExclusionRules($exclusionRules);
}

$xml = simplexml_load_string($data, null, LIBXML_NOWARNING | LIBXML_NOERROR);

if (isset($xml->url) && count($xml->url) > 0) {
Expand Down
6 changes: 5 additions & 1 deletion src/Parser/UrlListParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@

class UrlListParser extends ParserBase
{
public function parse(string $data): UrlCollectionInterface
public function parse(string $data, array $exclusionRules = []): UrlCollectionInterface
{
$linkCollection = new UrlCollection();

if (count($exclusionRules) > 0) {
$linkCollection->setExclusionRules($exclusionRules);
}

$lines = preg_split("/\r\n|\n|\r/", $data);

if (is_array($lines) && count($lines) > 0) {
Expand Down
45 changes: 45 additions & 0 deletions src/Url/UrlCollection.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,32 @@ class UrlCollection implements UrlCollectionInterface
*/
protected array $urls = [];

/**
* Rules for excluding URLs from the collection.
*
* @var array<string>
*/
protected array $exclusionRules = [];

/**
 * Appends a URL to the collection unless one of the exclusion rules rejects it.
 *
 * A rule rejects the URL when either:
 *  - the rule is identical to the raw URL, or
 *  - the rule contains a `*` and, used as a case-insensitive regular
 *    expression anchored at the start of the string only, matches the raw
 *    URL. Because there is no end anchor, such a rule also rejects URLs
 *    that continue past the text the rule describes.
 *
 * With no exclusion rules the loop does not run and the URL is always added.
 *
 * @param UrlInterface $url The URL to add.
 */
public function add(UrlInterface $url): void
{
    foreach ($this->exclusionRules as $pattern) {
        $candidate = $url->getRawUrl();

        // Like for like comparison against the rule as stored.
        $isExactMatch = $candidate === $pattern;

        // Only rules carrying a wildcard are evaluated as regular expressions.
        $isWildcardMatch = !$isExactMatch
            && str_contains($pattern, '*')
            && preg_match('/^' . $pattern . '/i', $candidate) > 0;

        if ($isExactMatch || $isWildcardMatch) {
            return;
        }
    }

    $this->urls[] = $url;
}

public function delete(int $index): void
Expand Down Expand Up @@ -75,4 +98,26 @@ public function chunk(int $chunkLength) : array
return $collections;
}

/**
 * {@inheritDoc}
 *
 * Rules that contain a `*` wildcard are converted before being stored: the
 * rule is escaped with preg_quote() (using `/` as the delimiter) and each
 * escaped wildcard is then replaced with `.*`. Rules without a wildcard are
 * stored unchanged. Array keys of the supplied rules are preserved.
 *
 * @param array<string> $exclusionRules
 *
 * @return self
 */
public function setExclusionRules(array $exclusionRules): self
{
    // array_map() replaces the previous by-reference foreach, which left a
    // live reference to the last element because it was never unset.
    $this->exclusionRules = array_map(
        static function (string $rule): string {
            if (!str_contains($rule, '*')) {
                return $rule;
            }

            // Escape regex metacharacters first, then turn the (now escaped)
            // wildcard into "match anything".
            return str_replace('\*', '.*', preg_quote($rule, '/'));
        },
        $exclusionRules
    );

    return $this;
}

/**
 * {@inheritDoc}
 *
 * Returns the rules exactly as they are held on the collection.
 *
 * @return array<string>
 */
public function getExclusionRules(): array
{
    $rules = $this->exclusionRules;

    return $rules;
}
}
21 changes: 17 additions & 4 deletions src/Url/UrlCollectionInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,29 @@
*/
interface UrlCollectionInterface extends \Iterator, \Countable
{
public function add(UrlInterface $url): void;
public function add(UrlInterface $url): void;

public function delete(int $index): void;
public function delete(int $index): void;

public function find(int $id) : UrlInterface|false;
public function find(int $id): UrlInterface|false;

/**
* @param int $chunkLength
*
* @return UrlCollectionInterface[]
*/
public function chunk(int $chunkLength) : array;
public function chunk(int $chunkLength): array;

/**
* @param array<string> $exclusionRules
*
* @return self
*/
public function setExclusionRules(array $exclusionRules): self;

/**
* @return array<string>
*/
public function getExclusionRules(): array;

}
15 changes: 15 additions & 0 deletions tests/Parser/SitemapXmlParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,19 @@ public function testSitemapParserParsesData()
$this->assertEquals('/inner-link', $list->current()->getPath());
}

/**
 * Parsing the fixture sitemap with an exact-match exclusion rule must leave
 * a single URL in the resulting collection.
 */
public function testSitemapParserParsesDataAndAdheredToExclusionRules()
{
    $fixturePath = realpath(__DIR__ . '/../data/sitemap.xml');
    $xml = file_get_contents($fixturePath);

    $parser = new SitemapXmlParser();

    $urls = $parser->parse($xml, ['https://www.example.com/inner-link']);

    $this->assertEquals(1, $urls->count());
}
}
66 changes: 66 additions & 0 deletions tests/Url/UrlCollectionTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,70 @@ public function testUrlCollectionChunk() {
$this->assertEquals(1, $collection->count());
}
}

/**
 * A wildcard placed directly after the site root must exclude every URL on
 * that site, including the root URL itself.
 */
public function testRootWildcardUrlExcludesEverything(): void
{
    $urlCollection = new UrlCollection();
    $urlCollection->setExclusionRules([
        'https://www.example.com/*',
    ]);

    $excludedUrls = [
        'https://www.example.com/',
        'https://www.example.com/excluded-path',
        'https://www.example.com/wildcard-path',
        'https://www.example.com/a-more-complex-yes-wildcarded-path',
    ];

    foreach ($excludedUrls as $excludedUrl) {
        $urlCollection->add(new Url($excludedUrl));

        // Expected value goes first so PHPUnit reports failures correctly.
        $this->assertSame(0, $urlCollection->count(), $excludedUrl . ' should have been excluded.');
    }
}

/**
 * Exact rules and rules with wildcards inside the path must exclude the
 * matching URLs while leaving non-matching URLs in the collection.
 */
public function testInnerPathUrlIsExcluded(): void
{
    $urlCollection = new UrlCollection();
    $urlCollection->setExclusionRules([
        'https://www.example.com/excluded-path',
        'https://www.example.com/wildcard-*',
        'https://www.example.com/*-more-complex-*-wild*-path',
    ]);

    // Each URL is paired with the collection size expected after adding it.
    $steps = [
        ['https://www.example.com/', 1],
        ['https://www.example.com/excluded-path', 1],
        ['https://www.example.com/wildcard-path', 1],
        ['https://www.example.com/a-more-complex-yes-wildcarded-path', 1],
        ['https://www.example.com/non-excluded-path', 2],
    ];

    foreach ($steps as [$rawUrl, $expectedCount]) {
        $urlCollection->add(new Url($rawUrl));

        // Expected value goes first so PHPUnit reports failures correctly.
        $this->assertSame($expectedCount, $urlCollection->count(), 'Unexpected count after adding ' . $rawUrl);
    }
}

/**
 * An exact-match rule for a URL on a different host must exclude that URL,
 * while a URL that matches no rule is still added.
 */
public function testExclusionRulesExcludeRemoteUrl(): void
{
    $urlCollection = new UrlCollection();
    $urlCollection->setExclusionRules([
        'https://www.example.com/some-path',
        'https://www.example2.com/some-path',
    ]);

    // Matches no rule, so it is kept.
    $urlCollection->add(new Url('https://www.example.com/'));
    $this->assertSame(1, $urlCollection->count());

    // Matches the second rule exactly, so the count must not change.
    $urlCollection->add(new Url('https://www.example2.com/some-path'));
    $this->assertSame(1, $urlCollection->count());
}
}

0 comments on commit a9f0e66

Please sign in to comment.