Skip to content

Commit

Permalink
Automatic crawling for web navigation (#3)
Browse files Browse the repository at this point in the history
Adds automatic crawling: a field can declare a chain type so that the URLs scraped for it are scraped in turn, gathering more data while navigating the web.
  • Loading branch information
joskfg authored Dec 11, 2021
1 parent 769e9a1 commit e9b68f5
Show file tree
Hide file tree
Showing 13 changed files with 259 additions and 35 deletions.
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
},
"scripts": {
"all-test": "phpunit --coverage-text; php-cs-fixer fix -v --diff --dry-run --allow-risky=yes;",
"test": "phpunit --coverage-text tests/Unit; php-cs-fixer fix -v --diff --dry-run --allow-risky=yes;",
"test": "phpunit --coverage-text --testsuite=Unit; php-cs-fixer fix -v --diff --dry-run --allow-risky=yes;",
"phpunit": "phpunit --coverage-text",
"phpcs": "php-cs-fixer fix -v --diff --dry-run --allow-risky=yes;",
"fix-cs": "php-cs-fixer fix -v --diff --allow-risky=yes;"
Expand Down
7 changes: 5 additions & 2 deletions phpunit.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@
</report>
</coverage>
<testsuites>
<testsuite name="Laravel Intelligent Scraper">
<directory>tests</directory>
<testsuite name="Unit">
<directory>tests/Unit</directory>
</testsuite>
<testsuite name="Integration">
<directory>tests/Integration</directory>
</testsuite>
</testsuites>
<logging>
Expand Down
2 changes: 1 addition & 1 deletion src/Scraper/Application/Configurator.php
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ private function findConfigByScrapedData(ScrapedDataset $scrapedData, Crawler $c
$value = is_array($field->getValue()) ? json_encode($field->getValue(), JSON_THROW_ON_ERROR) : $field->getValue();
} catch (JsonException $e) {
}
Log::notice("Field '{$field->getKey()}' with value '{$field->getValue()}' not found for '{$crawler->getUri()}'.");
Log::notice("Field '{$field->getKey()}' with value '{$value}' not found for '{$crawler->getUri()}'.");
}
}

Expand Down
4 changes: 4 additions & 0 deletions src/Scraper/Application/XpathBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ public function find($documentElement, $values): string
$nodes[] = $this->findNode($documentElement, $value);
}

if (empty($nodes)) {
throw new UnexpectedValueException('Xpath not found for the given values: ' . implode('/', $values));
}

return $this->getXPath($nodes);
}

Expand Down
17 changes: 9 additions & 8 deletions src/Scraper/Application/XpathFinder.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,22 +37,23 @@ public function extract(string $url, Collection $configs): ScrapedData
Log::info(
'Searching field',
[
'field' => $config['name'],
'field' => $config->getAttribute('name'),
]
);
$value = $this->extractValue($config, $crawler);

if (!$config['optional'] && $value === null) {
$missingXpath = implode('\', \'', $config['xpaths']);
if (!$config->getAttribute('optional') && $value === null) {
$missingXpath = implode('\', \'', $config->getAttribute('xpaths'));
throw new MissingXpathValueException(
"Xpath '$missingXpath' for field '{$config['name']}' not found in '$url'."
"Xpath '$missingXpath' for field '{$config->getAttribute('name')}' not found in '$url'."
);
}

$scrapedData->setField(
new Field(
$config['name'],
$value ?? $config['default'],
$config->getAttribute('name'),
$value ?? $config->getAttribute('default'),
$config->getAttribute('chain_type'),
$value !== null,
)
);
Expand Down Expand Up @@ -83,13 +84,13 @@ private function getCrawler(string $url): ?Crawler

private function extractValue(Configuration $config, ?Crawler $crawler): ?array
{
foreach ($config['xpaths'] as $xpath) {
foreach ($config->getAttribute('xpaths') as $xpath) {
Log::debug("Checking xpath $xpath");
$subcrawler = $crawler->evaluate($xpath);

if ($subcrawler->count()) {
Log::debug("Found xpath $xpath");
$this->variantGenerator->addConfig($config['name'], $xpath);
$this->variantGenerator->addConfig($config->getAttribute('name'), $xpath);
return $subcrawler->each(fn ($node) => $node->text());
}
}
Expand Down
29 changes: 22 additions & 7 deletions src/Scraper/Entities/Field.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ class Field implements \JsonSerializable
{
private string $key;
private $value;
private ?string $chainType;
private bool $found;

public function __construct(string $key, $value, bool $found = true)
public function __construct(string $key, $value, ?string $chainType = null, bool $found = true)
{
$this->key = $key;
$this->value = $value;
$this->found = $found;
$this->key = $key;
$this->value = $value;
$this->found = $found;
$this->chainType = $chainType;
}

public function getKey(): string
Expand All @@ -39,6 +41,18 @@ public function setValue($value)
return $this;
}

/**
 * Returns the chain type stored for this field.
 *
 * @return string|null The chain type, or null when none has been set.
 */
public function getChainType(): ?string
{
return $this->chainType;
}

/**
 * Sets the chain type for this field.
 *
 * The parameter is nullable so that it matches the nullable property, the
 * constructor argument and the getter: passing null clears a previously
 * configured chain type.
 *
 * @param string|null $chainType The chain type to store, or null to clear it.
 *
 * @return Field The same instance, to allow fluent calls.
 */
public function setChainType(?string $chainType): Field
{
    $this->chainType = $chainType;

    return $this;
}

public function isFound(): bool
{
return $this->found;
Expand All @@ -54,9 +68,10 @@ public function setFound(bool $found): Field
public function jsonSerialize(): array
{
return [
'key' => $this->getKey(),
'value' => $this->getValue(),
'foind' => $this->isFound(),
'key' => $this->getKey(),
'value' => $this->getValue(),
'chain_type' => $this->getChainType(),
'found' => $this->isFound(),
];
}
}
3 changes: 3 additions & 0 deletions src/Scraper/Entities/ScrapedData.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ public function setVariant(?string $variant): ScrapedData
return $this;
}

/**
* @return array<Field>
*/
public function getFields(): array
{
return $this->fields;
Expand Down
34 changes: 34 additions & 0 deletions src/Scraper/Listeners/ScrapedListener.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

use Exception;
use Illuminate\Contracts\Queue\ShouldQueue;
use Softonic\LaravelIntelligentScraper\Scraper\Entities\Field;
use Softonic\LaravelIntelligentScraper\Scraper\Events\Scraped;
use Softonic\LaravelIntelligentScraper\Scraper\Events\ScrapeRequest;

class ScrapedListener implements ShouldQueue
{
Expand All @@ -19,6 +21,38 @@ public function __construct(array $listeners)
* @throws Exception
*/
public function handle(Scraped $scraped): void
{
// First request the follow-up scrapes derived from the scraped fields.
$this->requestAutomaticNestedScrapes($scraped);
// Then forward the event to the listener registered for this scrape type, if any.
$this->fireUserListeners($scraped);
}

/**
 * Dispatches one ScrapeRequest per value of every field that was found and
 * declares a chain type, using that chain type as the type of the new request.
 *
 * @param Scraped $scraped Event carrying the scraped data and the originating request.
 */
protected function requestAutomaticNestedScrapes(Scraped $scraped): void
{
    // Only fields that were actually found and that declare a chain type are followed.
    $isChainable = static fn (Field $field) => $field->isFound() && $field->getChainType() !== null;

    $chainableFields = array_filter($scraped->scrapedData->getFields(), $isChainable);

    foreach ($chainableFields as $chainableField) {
        $nextType = $chainableField->getChainType();

        foreach ($chainableField->getValue() as $link) {
            event(new ScrapeRequest($this->getFullUrl($link, $scraped), $nextType));
        }
    }
}

/**
 * Builds an absolute URL for a scraped link, using the URL of the originating
 * scrape request as the base.
 *
 * Resolution rules:
 * - links that already carry a scheme (`scheme://...`) are returned as they are;
 * - protocol-relative links (`//host/path`) inherit the base scheme;
 * - root-relative links (`/path`) are appended to the base origin (scheme, host and port);
 * - query-only links (`?page=2`) are appended to the base path;
 * - any other link is resolved against the directory of the base path
 *   (dot segments such as `../` are not normalised).
 *
 * When the base URL cannot be parsed into a scheme and a host, the link is
 * returned unchanged because there is nothing to resolve it against.
 *
 * @param mixed   $url     Link value extracted from the page; it is cast to string and trimmed.
 * @param Scraped $scraped Event whose scrape request URL is used as the base.
 *
 * @return string The resolved URL.
 */
protected function getFullUrl($url, Scraped $scraped): string
{
    $url = trim((string) $url);

    // A plain "starts with http" check would treat relative paths such as
    // "httpd/index.html" as absolute, so an explicit "scheme://" is required.
    if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url) === 1) {
        return $url;
    }

    $base = parse_url($scraped->scrapeRequest->url);
    if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
        return $url;
    }

    if (strpos($url, '//') === 0) {
        return $base['scheme'] . ':' . $url;
    }

    // Keep a non-default port, otherwise links scraped from e.g. localhost:8080 break.
    $origin = $base['scheme'] . '://' . $base['host'];
    if (isset($base['port'])) {
        $origin .= ':' . $base['port'];
    }

    if (strpos($url, '/') === 0) {
        return $origin . $url;
    }

    $basePath = $base['path'] ?? '/';
    if (strpos($url, '?') === 0) {
        return $origin . $basePath . $url;
    }

    // Document-relative link: replace the last segment of the base path.
    $lastSlash = strrpos($basePath, '/');
    $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

    return $origin . $directory . $url;
}

protected function fireUserListeners(Scraped $scraped): void
{
if (isset($this->listeners[$scraped->scrapeRequest->type])) {
resolve($this->listeners[$scraped->scrapeRequest->type])->handle($scraped);
Expand Down
1 change: 1 addition & 0 deletions src/Scraper/Models/Configuration.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class Configuration extends Model
protected $fillable = [
'name',
'type',
'chain_type',
'xpaths',
'optional',
'default',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,22 @@ class CreateConfigurationsTable extends Migration
public function up()
{
Schema::create('configurations', function (Blueprint $table): void {
$table->string('name')->primary();
$table->string('type');
$table->json('xpaths');
$table->boolean('optional')->nullable()->default(false);
$table->json('default')->nullable()->default(null);
$table->string('name')->primary()
->comment('The name of the field.');
$table->string('type')
->comment('The scrape type.');
$table->json('xpaths')
->comment('Array of XPaths to extract data from scrape.');
$table->string('chain_type')
->comment('Allow automatic scraping of the field scrapped value using another type.')
->nullable()->default(null);
$table->boolean('optional')
->comment('Whether the field is optional.')
->nullable()->default(false);
$table->json('default')
->comment('The default value for the field.')
->nullable()->default(null);

$table->timestamps();
});
}
Expand Down
72 changes: 72 additions & 0 deletions tests/Integration/CrawlingTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,17 @@
use Softonic\LaravelIntelligentScraper\Scraper\Models\ScrapedDataset;
use Tests\TestCase;

/**
 * THIS TEST MUST BE CONFIGURED BEFORE IT CAN BE USED; IT DOES NOT WORK AS IT IS.
 * FILL IN ALL THE DATA PROVIDERS BEFORE RUNNING IT.
 */
class CrawlingTest extends TestCase
{
use DatabaseMigrations;

/**
* Configure this provider to check if the crawler is working as expected.
*/
public function fieldCrawlProvider(): array
{
return [
Expand Down Expand Up @@ -188,4 +195,69 @@ public function configureAutomaticallyCrawlerWithoutDatasetFoundInfoIsNotPossibl

scrape($urlToCrawl, $type);
}

/**
 * Data sets for the chained-crawl test.
 *
 * The values are placeholders and must be filled in before the test can run.
 * Every key matches the name of the corresponding parameter of the consuming
 * test method, so the data set maps correctly both positionally and by name.
 */
public function getChainedTypesConfigurationProvider(): array
{
    return [
        [
            // Url to be crawled
            'urlToCrawl' => '',
            // Xpath where to find the next URL to crawl
            'urlXpath' => '',
            // Final Xpath to crawl in the chained crawl
            'finalXpath' => '',
            // Final value to be found in the final Xpath in list format
            'expectedValue' => [],
        ],
    ];
}

/**
 * Checks that a field configured with a chain type triggers a second scrape
 * of the URLs it yields, and that the second scrape extracts the expected value.
 *
 * @test
 * @dataProvider getChainedTypesConfigurationProvider
 * @param mixed $urlToCrawl    Url of the first page to scrape.
 * @param mixed $urlXpath      Xpath that yields the URL(s) to follow.
 * @param mixed $finalXpath    Xpath evaluated on the followed page(s).
 * @param mixed $expectedValue Value expected from the final Xpath, in list format.
 */
public function whenCrawlingFieldsWithChainedTypesItShouldContinueCrawlingTheChainedTypes(
    $urlToCrawl,
    $urlXpath,
    $finalXpath,
    $expectedValue
): void {
    $type           = 'type-example';
    $fieldName      = 'semantic-field-name';
    $childType      = 'child-type-example';
    $childFieldName = 'child-semantic-field-name';

    Configuration::create([
        'name'       => $fieldName,
        'type'       => $type,
        'xpaths'     => $urlXpath,
        'chain_type' => $childType,
    ]);

    Configuration::create([
        'name'   => $childFieldName,
        'type'   => $childType,
        'xpaths' => $finalXpath,
    ]);

    // Tracks whether the chained scrape happened; without it the test would
    // pass vacuously when no Scraped event of the child type is ever fired.
    $chainedScrapeHandled = false;

    Event::listen(
        Scraped::class,
        function (Scraped $scraped) use ($expectedValue, $childType, $childFieldName, &$chainedScrapeHandled) {
            if ($scraped->scrapeRequest->type === $childType) {
                $chainedScrapeHandled = true;

                self::assertSame(
                    $expectedValue,
                    $scraped->scrapedData->getField($childFieldName)
                        ->getValue()
                );
            }
        }
    );
    Event::listen(ScrapeFailed::class, fn () => self::fail('Scrape failed'));

    scrape($urlToCrawl, $type);

    self::assertTrue($chainedScrapeHandled, 'The chained scrape for the child type was never performed.');
}
}
21 changes: 12 additions & 9 deletions tests/Unit/Scraper/Application/XpathFinderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,10 @@ public function whenXpathsAreFoundItShouldReturnTheFoundValues(): void
{
$config = [
Configuration::create([
'name' => ':field-1:',
'type' => ':type:',
'xpaths' => [
'name' => ':field-1:',
'type' => ':type:',
'chain_type' => ':chain-type:',
'xpaths' => [
':xpath-1:',
':xpath-2:',
],
Expand Down Expand Up @@ -214,12 +215,14 @@ public function whenXpathsAreFoundItShouldReturnTheFoundValues(): void
$extractedData->getVariant()
);

$title = $extractedData->getField(':field-1:');
self::assertSame([':value-2:'], $title->getValue());
self::assertTrue($title->isFound());
$field1 = $extractedData->getField(':field-1:');
self::assertSame([':value-2:'], $field1->getValue());
self::assertSame(':chain-type:', $field1->getChainType());
self::assertTrue($field1->isFound());

$author = $extractedData->getField(':field-2:');
self::assertSame([':value-1:'], $author->getValue());
self::assertTrue($author->isFound());
$field2 = $extractedData->getField(':field-2:');
self::assertSame([':value-1:'], $field2->getValue());
self::assertNull($field2->getChainType());
self::assertTrue($field2->isFound());
}
}
Loading

0 comments on commit e9b68f5

Please sign in to comment.