Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Fix http errors behaviour using the guzzle middleware (#10)
Browse files Browse the repository at this point in the history
  • Loading branch information
joskfg authored Nov 8, 2018
1 parent 1d9702b commit cae36fa
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 42 deletions.
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,34 @@ To publish the scraper config, you can use
```bash
php artisan vendor:publish --provider="Softonic\LaravelIntelligentScraper\ScraperProvider" --tag=config
```
### Dependencies

This package depends on [goutte](https://packagist.org/packages/fabpot/goutte), which in turn depends on [guzzle](https://packagist.org/packages/guzzle/guzzle), so you can customize the client to
fit your requirements. The only requirement for this package is that you must include the `http_errors` middleware in the
handler stack.

Example:
```php
<?php

use GuzzleHttp\Client as GuzzleClient;
use GuzzleHttp\Handler\CurlHandler;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
use Goutte\Client as GoutteClient;
use App\MyMiddleware;

$client = new GoutteClient();
$stack = new HandlerStack();

$stack->setHandler(new CurlHandler());
$stack->push(MyMiddleware::getHandler(), 'my_middleware'); // Your custom middleware
$stack->push(Middleware::httpErrors(), 'http_errors'); // Required middleware for the package

// Build the Guzzle client on top of the custom stack and hand it to Goutte.
$guzzleClient = new GuzzleClient(['handler' => $stack]);
$client->setClient($guzzleClient);
```

The default stack already has the http_errors middleware, so you only need to do this if you are not using the default stack.

## Configuration

Expand Down
21 changes: 13 additions & 8 deletions src/Scraper/Application/Configurator.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;

use Goutte\Client;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Log;
use Softonic\LaravelIntelligentScraper\Scraper\Events\ConfigurationScraped;
Expand Down Expand Up @@ -74,21 +76,24 @@ public function configureFromDataset($scrapedDataset): Collection

private function getCrawler($scrapedData)
{
Log::info("Request {$scrapedData['url']}");
$crawler = $this->client->request('GET', $scrapedData['url']);
try {
Log::info("Request {$scrapedData['url']}");

$httpCode = $this->client->getInternalResponse()->getStatus();
if ($httpCode !== 200) {
return $this->client->request('GET', $scrapedData['url']);
} catch (ConnectException $e) {
Log::notice(
"Connection error: {$e->getMessage()}",
compact('scrapedData')
);
$scrapedData->delete();
} catch (RequestException $e) {
$httpCode = $e->getResponse()->getStatusCode() ?? null;
Log::notice(
"Response status ({$httpCode}) invalid, so proceeding to delete the scraped data.",
compact('scrapedData')
);
$scrapedData->delete();

return null;
}

return $crawler;
}

/**
Expand Down
28 changes: 20 additions & 8 deletions src/Scraper/Application/XpathFinder.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;

use Goutte\Client as GoutteClient;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use Illuminate\Support\Facades\Log;
use Softonic\LaravelIntelligentScraper\Scraper\Exceptions\MissingXpathValueException;

Expand All @@ -26,15 +28,9 @@ public function __construct(GoutteClient $client, VariantGenerator $variantGener

public function extract(string $url, $configs): array
{
Log::info("Requesting $url");
$crawler = $this->client->request('GET', $url);
$httpCode = $this->client->getInternalResponse()->getStatus();
if ($httpCode !== 200) {
Log::info('Invalid response http status', ['status' => $httpCode]);
throw new \UnexpectedValueException("Response error from '{$url}' with '{$httpCode}' http code");
}
$crawler = $this->getCrawler($url);

Log::info('Response Received. Starting crawler.');
Log::info('Response Received. Start crawling.');
$result = [];
foreach ($configs as $config) {
Log::info("Searching field {$config['name']}.");
Expand Down Expand Up @@ -68,4 +64,20 @@ public function extract(string $url, $configs): array

return $result;
}

/**
 * Request the given url with GET and return what the client gives back.
 *
 * Guzzle failures are translated into \UnexpectedValueException; the original
 * exception is kept as the previous one so the root cause is not lost.
 *
 * @param string $url Url to request.
 *
 * @return mixed The result of the client's request() call.
 *
 * @throws \UnexpectedValueException When the connection fails or the request ends with an error.
 */
private function getCrawler(string $url)
{
    try {
        Log::info("Requesting $url");

        return $this->client->request('GET', $url);
    } catch (ConnectException $e) {
        Log::info("Unavailable url '{$url}'", ['message' => $e->getMessage()]);
        throw new \UnexpectedValueException("Unavailable url '{$url}'", 0, $e);
    } catch (RequestException $e) {
        // getResponse() is nullable: a request error may happen before any
        // response is received, so never dereference it blindly.
        $response = $e->getResponse();
        $httpCode = $response !== null ? $response->getStatusCode() : 'unknown';
        Log::info('Invalid response http status', ['status' => $httpCode]);
        throw new \UnexpectedValueException("Response error from '{$url}' with '{$httpCode}' http code", 0, $e);
    }
}
}
59 changes: 45 additions & 14 deletions tests/Unit/Scraper/Application/ConfiguratorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;

use Goutte\Client;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use Illuminate\Foundation\Testing\DatabaseMigrations;
use Illuminate\Support\Facades\Log;
use Mockery\Mock;
Expand Down Expand Up @@ -64,7 +66,7 @@ public function setUp()
/**
* @test
*/
public function whenTryToFindNewXpathButUrlFromDatasetIsNotValidThrowAnExceptionAndRemoveIt()
public function whenTryToFindNewXpathButUrlFromDatasetIsNotFoundThrowAnExceptionAndRemoveIt()
{
$posts = [
new ScrapedDataset([
Expand All @@ -77,16 +79,55 @@ public function whenTryToFindNewXpathButUrlFromDatasetIsNotValidThrowAnException
]),
];

$requestException = \Mockery::mock(RequestException::class);
$requestException->shouldReceive('getResponse->getStatusCode')
->once()
->andReturn(404);
$this->client->shouldReceive('request')
->once()
->with(
'GET',
'https://test.c/123456789012'
)
->andReturnSelf();
$this->client->shouldReceive('getInternalResponse->getStatus')
->andThrows($requestException);

$this->configuration->shouldReceive('findByType')
->once()
->andReturn(404);
->with('post')
->andReturn(collect());

try {
$this->configurator->configureFromDataset($posts);
} catch (ConfigurationException $e) {
$this->assertEquals('Field(s) "title,author" not found.', $e->getMessage());
$this->assertDatabaseMissing('scraped_datasets', ['url' => 'https://test.c/123456789012']);
}
}

/**
* @test
*/
public function whenTryToFindNewXpathButUrlFromDatasetIsNotAvailableThrowAnExceptionAndRemoveIt()
{
$posts = [
new ScrapedDataset([
'url' => 'https://test.c/123456789012',
'type' => 'post',
'data' => [
'title' => 'My Title',
'author' => 'My author',
],
]),
];

$connectException = \Mockery::mock(ConnectException::class);
$this->client->shouldReceive('request')
->once()
->with(
'GET',
'https://test.c/123456789012'
)
->andThrows($connectException);

$this->configuration->shouldReceive('findByType')
->once()
Expand Down Expand Up @@ -125,9 +166,6 @@ public function whenTryToFindNewXpathButNotFoundItShouldLogItAndResetVariant()
'https://test.c/123456789012'
)
->andReturnSelf();
$this->client->shouldReceive('getInternalResponse->getStatus')
->once()
->andReturn(200);

$rootElement = new \DOMElement('test');
$this->client->shouldReceive('getNode')
Expand Down Expand Up @@ -191,9 +229,6 @@ public function whenUseSomeOldXpathButNotFoundNewsItShouldLogItAndResetVariant()
'https://test.c/123456789012'
)
->andReturnSelf();
$this->client->shouldReceive('getInternalResponse->getStatus')
->once()
->andReturn(200);

$rootElement = new \DOMElement('test');
$this->client->shouldReceive('getNode')
Expand Down Expand Up @@ -282,8 +317,6 @@ public function whenTryToFindXpathInMultiplepostsAndNotFoundInAnyItShouldThrowAn
'https://test.c/123456789022'
)
->andReturnSelf();
$this->client->shouldReceive('getInternalResponse->getStatus')
->andReturn(200);
$this->client->shouldReceive('getUri')
->andReturn('https://test.c/123456789012');

Expand Down Expand Up @@ -382,8 +415,6 @@ public function whenDiscoverDifferentXpathItShouldGetAllOfThemAndUpdateTheVarian
'https://test.c/123456789033'
)
->andReturnSelf();
$this->client->shouldReceive('getInternalResponse->getStatus')
->andReturn(200);

$rootElement = new \DOMElement('test');
$this->client->shouldReceive('getNode')
Expand Down
54 changes: 42 additions & 12 deletions tests/Unit/Scraper/Application/XpathFinderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;

use Goutte\Client;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use Illuminate\Foundation\Testing\DatabaseMigrations;
use Illuminate\Support\Facades\Log;
use Softonic\LaravelIntelligentScraper\Scraper\Exceptions\MissingXpathValueException;
Expand All @@ -23,7 +25,7 @@ public function setUp()
/**
* @test
*/
public function whenExtractUsingAnInvalidUrlItShouldThrowAnException()
public function whenExtractUsingAnInvalidUrlStatusItShouldThrowAnException()
{
$config = [
Configuration::create([
Expand All @@ -34,21 +36,55 @@ public function whenExtractUsingAnInvalidUrlItShouldThrowAnException()
];

$variantGenerator = \Mockery::mock(VariantGenerator::class);

$requestException = \Mockery::mock(RequestException::class);
$requestException->shouldReceive('getResponse->getStatusCode')
->once()
->andReturn(404);

$client = \Mockery::mock(Client::class);
$client->shouldReceive('request')
->once()
->with(
'GET',
'url'
)
->andReturnSelf();
->andThrows($requestException);

$this->expectException(\UnexpectedValueException::class);
$this->expectExceptionMessage('Response error from \'url\' with \'404\' http code');

$xpathFinder = new XpathFinder($client, $variantGenerator);
$xpathFinder->extract('url', $config);
}

/**
* @test
*/
public function whenExtractUsingAnUnavailableUrlItShouldThrowAnException()
{
$config = [
Configuration::create([
'name' => 'title',
'type' => 'post',
'xpaths' => ['//*[@id="title"]'],
]),
];

$variantGenerator = \Mockery::mock(VariantGenerator::class);

$client->shouldReceive('getInternalResponse->getStatus')
$connectException = \Mockery::mock(ConnectException::class);
$client = \Mockery::mock(Client::class);
$client->shouldReceive('request')
->once()
->andReturn(404);
->with(
'GET',
'url'
)
->andThrows($connectException);

$this->expectException(\UnexpectedValueException::class);
$this->expectExceptionMessage('Response error from \'url\' with \'404\' http code');
$this->expectExceptionMessage('Unavailable url \'url\'');

$xpathFinder = new XpathFinder($client, $variantGenerator);
$xpathFinder->extract('url', $config);
Expand All @@ -73,17 +109,14 @@ public function whenXpathIsMissingAValueItShouldThrowAnException()
$internalXpathFinder = \Mockery::mock(\Symfony\Component\DomCrawler\Crawler::class);

$variantGenerator = \Mockery::mock(VariantGenerator::class);
$client = \Mockery::mock(Client::class);
$client = \Mockery::mock(Client::class);
$client->shouldReceive('request')
->once()
->with(
'GET',
'url'
)
->andReturn($internalXpathFinder);
$client->shouldReceive('getInternalResponse->getStatus')
->once()
->andReturn(200);

$internalXpathFinder->shouldReceive('filterXPath')
->once()
Expand Down Expand Up @@ -145,9 +178,6 @@ public function whenXpathsAreFoundItShouldReturnTheFoundValues()
'url'
)
->andReturn($internalXpathFinder);
$client->shouldReceive('getInternalResponse->getStatus')
->once()
->andReturn(200);

$internalXpathFinder->shouldReceive('filterXPath')
->once()
Expand Down

0 comments on commit cae36fa

Please sign in to comment.