
Commit

Fix scraped dataset structure to use a url hash as primary key (#5)
joskfg authored Dec 12, 2021
1 parent 6c2de90 commit 1f105a3
Showing 8 changed files with 55 additions and 37 deletions.
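The core change: the scraped_datasets table keys records by url_hash, the SHA-256 hex digest of the URL, instead of the raw url column. The digest is always 64 hexadecimal characters, which is why the migration below declares the column as char(64). A minimal sketch of the key derivation, using a hypothetical URL not taken from this diff:

<?php

// Hypothetical URL, used only to illustrate the derivation.
$url = 'https://example.com/post/123';

// hash('sha256', ...) returns a 64-character hexadecimal string,
// giving a fixed-length key in place of a potentially very long URL.
$urlHash = hash('sha256', $url);

echo strlen($urlHash); // 64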
4 changes: 2 additions & 2 deletions docker-compose.yml
@@ -4,7 +4,7 @@ services:
composer:
volumes:
- ./:/app
image: joskfg/composer-rector
image: softonic/composer-rector
command: composer run test
dev:
volumes:
@@ -13,5 +13,5 @@ services:
fixcs:
volumes:
- ./:/app
image: joskfg/composer-rector
image: softonic/composer-rector
command: composer run fix-cs
18 changes: 10 additions & 8 deletions src/Scraper/Listeners/UpdateDataset.php
@@ -13,24 +13,26 @@ class UpdateDataset implements ShouldQueue

public function handle(Scraped $event): void
{
$datasets = ScrapedDataset::where('url', $event->scrapeRequest->url)->get();
$datasets = ScrapedDataset::where('url_hash', hash('sha256', $event->scrapeRequest->url))->first();

if ($datasets->isEmpty()) {
if ($datasets === null) {
$this->addDataset($event);
} else {
$this->updateDataset($datasets->first(), $event);
return ;
}

$this->updateDataset($datasets, $event);
}

private function addDataset(Scraped $event): void
{
Log::info('Adding new information to dataset', ['request' => $event->scrapeRequest]);
ScrapedDataset::create(
[
'url' => $event->scrapeRequest->url,
'type' => $event->scrapeRequest->type,
'variant' => $event->scrapedData->getVariant(),
'fields' => $event->scrapedData->getFields(),
'url_hash' => hash('sha256', $event->scrapeRequest->url),
'url' => $event->scrapeRequest->url,
'type' => $event->scrapeRequest->type,
'variant' => $event->scrapedData->getVariant(),
'fields' => $event->scrapedData->getFields(),
]
);

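The handler above now resolves at most one record by its hashed key and branches between insert and update. Purely as an illustration of the lookup-by-hash pattern, and assuming the update step simply overwrites the stored attributes, an equivalent flow could use Eloquent's updateOrCreate; the commit itself keeps the explicit null check shown above:

<?php

// Sketch only (reuses $event from the listener context): updateOrCreate()
// looks up the row matching the first array (the hashed key) and
// inserts it or updates it with the attributes in the second array.
ScrapedDataset::updateOrCreate(
    ['url_hash' => hash('sha256', $event->scrapeRequest->url)],
    [
        'url'     => $event->scrapeRequest->url,
        'type'    => $event->scrapeRequest->type,
        'variant' => $event->scrapedData->getVariant(),
        'fields'  => $event->scrapedData->getFields(),
    ]
);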
3 changes: 2 additions & 1 deletion src/Scraper/Models/ScrapedDataset.php
@@ -25,14 +25,15 @@ class ScrapedDataset extends Model
*
* @var string
*/
protected $primaryKey = 'url';
protected $primaryKey = 'url_hash';

/**
* The attributes that are mass assignable.
*
* @var array
*/
protected $fillable = [
'url_hash',
'url',
'type',
'variant',
37 changes: 21 additions & 16 deletions src/database/factories/ScrapedDatasetFactory.php
@@ -14,20 +14,25 @@
use Joskfg\LaravelIntelligentScraper\Scraper\Models\ScrapedDataset;

/* @var \Illuminate\Database\Eloquent\Factory $factory */
$factory->define(ScrapedDataset::class, fn (Faker\Generator $faker) => [
'url' => $faker->url . $faker->randomDigit,
'type' => 'post',
'variant' => $faker->sha1,
'fields' => [
[
'key' => 'title',
'value' => $faker->word,
'found' => $faker->boolean(),
],
[
'key' => 'author',
'value' => $faker->word,
'found' => $faker->boolean(),
$factory->define(ScrapedDataset::class, function (Faker\Generator $faker) {
$url = $faker->url . $faker->randomDigit;

return [
'url_hash' => hash('sha256', $url),
'url' => $url,
'type' => 'post',
'variant' => $faker->sha1,
'fields' => [
[
'key' => 'title',
'value' => $faker->word,
'found' => $faker->boolean(),
],
[
'key' => 'author',
'value' => $faker->word,
'found' => $faker->boolean(),
],
],
],
]);
];
});
@@ -13,7 +13,7 @@ class CreateConfigurationsTable extends Migration
public function up()
{
Schema::create('configurations', function (Blueprint $table): void {
$table->string('name')->primary()
$table->string('name')
->comment('The name of the field.');
$table->string('type')
->comment('The scrape type.');
@@ -30,6 +30,7 @@ public function up()
->nullable()->default(null);

$table->timestamps();
$table->primary(['name', 'type']);
});
}

@@ -13,7 +13,8 @@ class CreateScrapedDatasetsTable extends Migration
public function up()
{
Schema::create('scraped_datasets', function (Blueprint $table): void {
$table->string('url', 1024)->primary();
$table->char('url_hash', 64)->primary();
$table->string('url', 1024);
$table->string('type');
$table->string('variant', 40)->index()->nullable();
$table->json('fields');
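With url_hash as the primary key, a dataset can be fetched directly with find() once the URL has been hashed; the url column itself stays in the table (up to 1024 characters) but no longer acts as the key. A short usage sketch, where $url is a hypothetical variable:

<?php

// find() resolves against the model's primary key, now url_hash.
$dataset = ScrapedDataset::find(hash('sha256', $url));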
18 changes: 10 additions & 8 deletions tests/Integration/CrawlingTest.php
@@ -103,10 +103,11 @@ public function configureAutomaticallyCrawlerAndCrawlAField($dataset, $urlToCraw
$type = 'type-example';

ScrapedDataset::create([
'url' => $dataset['url'],
'type' => $type,
'variant' => Str::random(),
'fields' => [$dataset['field']],
'url_hash' => hash('sha256', $dataset['url']),
'url' => $dataset['url'],
'type' => $type,
'variant' => Str::random(),
'fields' => [$dataset['field']],
]);

Event::listen(
@@ -168,10 +169,11 @@ public function configureAutomaticallyCrawlerWithoutDatasetFoundInfoIsNotPossibl
$type = 'type-example';

ScrapedDataset::create([
'url' => $dataset['url'],
'type' => $type,
'variant' => Str::random(),
'fields' => [$dataset['field']],
'url_hash' => hash('sha256', $dataset['url']),
'url' => $dataset['url'],
'type' => $type,
'variant' => Str::random(),
'fields' => [$dataset['field']],
]);

Event::listen(
6 changes: 6 additions & 0 deletions tests/Unit/Scraper/Repositories/ConfigurationTest.php
@@ -62,6 +62,7 @@ public function whenRecalculateButThereIsNotAType1DatasetItShouldThrowAnExceptio
public function whenRecalculateItShouldStoreTheNewXpaths(): void
{
ScrapedDataset::create([
'url_hash' => hash('sha256', 'https://test.c/123456789222'),
'url' => 'https://test.c/123456789222',
'type' => ':type-1:',
'variant' => 'b265521fc089ac61b794bfa3a5ce8a657f6833ce',
@@ -79,6 +80,7 @@ public function whenRecalculateItShouldStoreTheNewXpaths(): void
],
]);
ScrapedDataset::create([
'url_hash' => hash('sha256', 'https://test.c/7675487989076'),
'url' => 'https://test.c/7675487989076',
'type' => ':type-2:',
'variant' => 'b265521fc089ac61b794bfa3a5ce8a657f6833ce',
@@ -96,6 +98,7 @@ public function whenRecalculateItShouldStoreTheNewXpaths(): void
],
]);
ScrapedDataset::create([
'url_hash' => hash('sha256', 'https://test.c/223456789111'),
'url' => 'https://test.c/223456789111',
'type' => ':type-1:',
'variant' => 'b265521fc089ac61b794bfa3a5ce8a657f6833ce',
@@ -155,6 +158,7 @@ public function whenRecalculateItShouldStoreTheNewXpaths(): void
public function whenRecalculateFailsItShouldThrowAnException(): void
{
ScrapedDataset::create([
'url_hash' => hash('sha256', 'https://test.c/123456789222'),
'url' => 'https://test.c/123456789222',
'type' => ':type-1:',
'variant' => 'b265521fc089ac61b794bfa3a5ce8a657f6833ce',
@@ -172,6 +176,7 @@ public function whenRecalculateFailsItShouldThrowAnException(): void
],
]);
ScrapedDataset::create([
'url_hash' => hash('sha256', 'https://test.c/7675487989076'),
'url' => 'https://test.c/7675487989076',
'type' => ':type-2:',
'variant' => 'b265521fc089ac61b794bfa3a5ce8a657f6833ce',
@@ -189,6 +194,7 @@ public function whenRecalculateFailsItShouldThrowAnException(): void
],
]);
ScrapedDataset::create([
'url_hash' => hash('sha256', 'https://test.c/223456789111'),
'url' => 'https://test.c/223456789111',
'type' => ':type-1:',
'variant' => 'b265521fc089ac61b794bfa3a5ce8a657f6833ce',