Skip to content

Commit

Permalink
#149 - fix data scraping for link (#189)
Browse files — browse the repository at this point in the history
* link refactor

* link refactor

* linter
  • Loading branch information
BarTracz authored Jan 10, 2024
1 parent 652b78e commit d39c6be
Showing 1 changed file with 38 additions and 28 deletions.
66 changes: 38 additions & 28 deletions app/Importers/LinkDataImporter.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,6 @@
class LinkDataImporter extends DataImporter
{
protected Crawler $sections;
private string $countryName = "";

public function extract(): static
{
Expand All @@ -26,7 +25,7 @@ public function extract(): static
}

$crawler = new Crawler($html);
$this->sections = $crawler->filter(".Main-content .sqs-row.row > .col p > strong");
$this->sections = $crawler->filter(".Main-content .sqs-row.row > .col p");

if (count($this->sections) === 0) {
$this->createImportInfoDetails("204", self::getProviderName());
Expand All @@ -39,48 +38,59 @@ public function extract(): static

/**
 * Walks the nodes held in $this->sections, derives a country name and a city
 * name from them, passes each pair to load(), and finally hands every
 * non-empty load() result to deleteMissingProviders().
 *
 * Returns early, before any node is visited, when $this->stopExecution is set.
 *
 * NOTE(review): this span comes from a rendered commit diff, so statements of
 * the previous and of the new implementation appear interleaved without +/-
 * markers. The comments below describe individual statements, not one
 * coherent control flow.
 */
public function transform(): void
{
$countryName = "";

$cityName = "";

// Names that are replaced by the country "United States" when a <strong>
// node's text matches one of them (see the in_array() check further down).
$states = [
"Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida",
"Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
"Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
"Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota",
"Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee",
"Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming",
];

if ($this->stopExecution) {
return;
}

// Collects every non-empty value returned by load(); passed to
// deleteMissingProviders() at the end.
$existingCityProviders = [];

foreach ($this->sections as $section) {
foreach ($section->childNodes as $node) {
$countryName = trim($node->nodeValue);

// Visits the children of the node three levels above $node; index 0 and
// entries whose trimmed text is empty are skipped.
foreach ($node->parentNode->parentNode->parentNode->childNodes as $i => $cityName) {
if ($i === 0 || !trim($cityName->nodeValue)) {
continue;
}

$name = $cityName->nodeValue;
// Flag that makes the loop over $this->sections skip its first element.
$skipFirstIteration = true;

$cities = [];
foreach ($this->sections as $section) {
if ($skipFirstIteration) {
$skipFirstIteration = false;

// When the name contains both "(" and ")", the parenthesised part is
// extracted and split on ", " into individual names.
if (str_contains($name, "(") && str_contains($name, ")")) {
$names = explode("(", $name)[1];
$names = explode(")", $names)[0];
$names = explode(", ", $names);
continue;
}

foreach ($names as $name) {
$cities[] = str_replace("*", "", $name);
}
foreach ($section->childNodes as $node) {
// A <strong> node sets the country: its trimmed text, or "United States"
// when its untrimmed text is one of $states.
if ($node->nodeName === "strong") {
if (!in_array($node->nodeValue, $states, true)) {
$countryName = trim($node->nodeValue);
} else {
$cities[] = $name;
$countryName = "United States";
}
}

foreach ($cities as $name) {
$provider = $this->load($name, $countryName);
// Text nodes and <a> elements both supply the city name.
if ($node->nodeName === "#text") {
$cityName = $node->nodeValue;
} else if ($node->nodeName === "a") {
$cityName = $node->nodeValue;
}

if ($provider !== "") {
$existingCityProviders[] = $provider;
}
}
// Only a value of exactly one space is skipped; every other value,
// including "", is passed to load().
// NOTE(review): $cityName is not reset between nodes, so a node that is
// neither #text nor <a> appears to reach load() with the previous city
// value — confirm this is intended.
if ($cityName === " ") {
continue;
}
$provider = $this->load($cityName, $countryName);

if ($provider !== "") {
$existingCityProviders[] = $provider;
}
}
}

// Presumably removes stored providers that are absent from the collected
// list — confirm against the parent DataImporter implementation.
$this->deleteMissingProviders(self::getProviderName(), $existingCityProviders);
}
}

0 comments on commit d39c6be

Please sign in to comment.