Skip to content

Commit

Permalink
Fix lidl parsing of products (and use coroutines)
Browse files Browse the repository at this point in the history
  • Loading branch information
StefanBratanov committed Nov 4, 2023
1 parent f3e19bb commit 64b9123
Show file tree
Hide file tree
Showing 9 changed files with 10,094 additions and 6,247 deletions.
1 change: 1 addition & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ dependencies {
implementation("org.springdoc:springdoc-openapi-starter-webmvc-ui:2.2.0")
implementation("org.jetbrains.kotlin:kotlin-reflect")
implementation("org.jetbrains.kotlin:kotlin-stdlib")
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
implementation("com.fasterxml.jackson.module:jackson-module-kotlin")
implementation("org.apache.commons:commons-lang3")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package com.stefanbratanov.sofiasupermarketsapi.extractors

import com.stefanbratanov.sofiasupermarketsapi.common.Log
import com.stefanbratanov.sofiasupermarketsapi.common.Log.Companion.log
import com.stefanbratanov.sofiasupermarketsapi.common.getHtmlDocument
import com.stefanbratanov.sofiasupermarketsapi.common.normalizePrice
import com.stefanbratanov.sofiasupermarketsapi.model.Product
import java.net.URL
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import org.apache.commons.lang3.StringUtils
import org.springframework.stereotype.Component

/**
 * Extracts a single [Product] from a Lidl product detail page.
 *
 * Name, quantity, price and old price are read from the page markup, and the validity period is
 * parsed from a span containing "day.month." fragments. `category` and `picUrl` are always left
 * `null` by this class.
 */
@Log
@Component
class LidlProductExtractor {

  // Matches "day.month." fragments such as "30.10."; the dots are escaped so that only literal
  // dots are accepted as separators. Built once instead of on every extraction.
  private val dayMonthRegex = Regex("""\d{1,2}\.\d{1,2}\.""")

  // Single-letter fields accept values with or without a leading zero ("05.11." and "5.11.").
  private val dateFormatter = DateTimeFormatter.ofPattern("d.M.yyyy")

  /**
   * Fetches the HTML document at [url] and builds a [Product] from it.
   *
   * Elements that are missing from the page are passed on as `null` to the normalisation helpers;
   * dates that are missing or cannot be parsed result in `null` validity fields.
   */
  fun extract(url: URL): Product {
    log.debug("Processing Lidl product URL: {}", url)

    val document = getHtmlDocument(url)

    val name = document.selectFirst("h1.keyfacts__title")?.text()
    val quantity = document.selectFirst("div.price-footer")?.text()
    val price = document.selectFirst("div.m-price__price")?.text()
    val oldPrice = document.selectFirst("span.m-price__rrp")?.text()

    // NOTE(review): "data-v-35dadb86" looks like a build-generated attribute — confirm that it is
    // stable across site deployments.
    val validity =
      document
        .selectFirst("span[data-v-35dadb86]")
        ?.text()
        ?.trim()
        ?.let { dateSpan -> parseValidity(dateSpan) }
        .orEmpty()

    return Product(
      name = StringUtils.normalizeSpace(name),
      quantity = StringUtils.normalizeSpace(quantity),
      price = normalizePrice(price),
      oldPrice = normalizePrice(oldPrice),
      category = null,
      picUrl = null,
      validFrom = validity.getOrNull(0),
      validUntil = validity.getOrNull(1),
    )
  }

  /**
   * Parses every "day.month." fragment of [dateSpan] using the year of [today].
   *
   * The result is positional: index 0 is the start and index 1 the end of the validity period,
   * and an entry is `null` when its fragment could not be parsed. When the end would fall before
   * the start (a period spanning New Year), one of the two dates is moved by a year so that the
   * period stays in chronological order.
   */
  private fun parseValidity(dateSpan: String, today: LocalDate = LocalDate.now()): List<LocalDate?> {
    // Materialised once so that matching, parsing and error logging happen a single time.
    val dates =
      dayMonthRegex.findAll(dateSpan).map { parseDayMonth(it.value, today.year) }.toList()

    val from = dates.getOrNull(0)
    val until = dates.getOrNull(1)
    if (from == null || until == null || !until.isBefore(from)) {
      return dates
    }

    return if (today.isAfter(until)) {
      // still in the old year: the period ends in the following year
      listOf(from, until.plusYears(1))
    } else {
      // already in the new year: the period started in the previous year
      listOf(from.minusYears(1), until)
    }
  }

  /** Parses a "day.month." fragment in the given [year]; logs and returns `null` on failure. */
  private fun parseDayMonth(dayMonth: String, year: Int): LocalDate? =
    try {
      LocalDate.parse(dayMonth.plus(year), dateFormatter)
    } catch (ex: java.time.format.DateTimeParseException) {
      log.error("Error while parsing $dayMonth", ex)
      null
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,86 +2,45 @@ package com.stefanbratanov.sofiasupermarketsapi.extractors

import com.stefanbratanov.sofiasupermarketsapi.common.Log
import com.stefanbratanov.sofiasupermarketsapi.common.Log.Companion.log
import com.stefanbratanov.sofiasupermarketsapi.common.UrlValidator
import com.stefanbratanov.sofiasupermarketsapi.common.getHtmlDocument
import com.stefanbratanov.sofiasupermarketsapi.common.normalizePrice
import com.stefanbratanov.sofiasupermarketsapi.interfaces.UrlProductsExtractor
import com.stefanbratanov.sofiasupermarketsapi.model.Product
import java.net.URL
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.util.*
import org.apache.commons.lang3.StringUtils
import kotlinx.coroutines.*
import org.springframework.beans.factory.annotation.Value
import org.springframework.stereotype.Component

@Log
@Component("Lidl")
// NOTE(review): this listing is a rendered diff whose +/- markers were lost, so lines of the
// removed and of the added implementation are interleaved below; it does not compile as shown.
// The comments only describe what the individual lines do.
class LidlProductsExtractor(
private val urlValidator: UrlValidator = UrlValidator(),
@Value("\${lidl.base.url}") private val baseUrl: URL,
val lidlProductExtractor: LidlProductExtractor
) : UrlProductsExtractor {

// NOTE(review): this opt-in accompanies the GlobalScope usage further down.
@OptIn(DelicateCoroutinesApi::class)
override fun extract(url: URL): List<Product> {
log.info("Processing Lidl URL: {}", url.toString())
log.info("Processing Lidl URL: {}", url)

val document = getHtmlDocument(url)

val category = document.selectFirst("meta[property=og:title]")?.attr("content")

return document
.select("article[data-price]")
.filter { element -> !element.select(".lidl-m-pricebox__price").isEmpty() }
.map {
val dateRange =
it.selectFirst(".lidl-m-ribbon-item__text")?.text()?.trim()?.let { dateSpan ->
"\\d+.\\d+.".toRegex().findAll(dateSpan).map { date ->
val match = date.groupValues[0]
try {
LocalDate.parse(
match.plus(LocalDate.now().year),
DateTimeFormatter.ofPattern("dd.MM.yyyy"),
)
} catch (ex: Exception) {
log.error("Error while parsing $date", ex)
null
}
val category = document.title()

// One deferred extraction per product tile; tiles without a "canonicalurl" attribute are dropped
// by mapNotNull.
val deferredProducts =
document.select("div[data-selector=PRODUCT]").mapNotNull {
// NOTE(review): the lambda parameter "url" shadows the function parameter of the same name.
val picUrl = it.attr("image").takeIf { url -> url.isNotEmpty() }
it
// retrieve product URL
.attr("canonicalurl")
.takeIf { canonicalUrl -> canonicalUrl.isNotEmpty() }
?.let { canonicalUrl -> baseUrl.toURI().resolve(canonicalUrl).toURL() }
?.let { productUrl ->
// NOTE(review): GlobalScope.async is not a child of the runBlocking call at the end of this
// function, so a failed await does not cancel the other in-flight extractions — consider
// starting these inside the runBlocking scope (e.g. async(Dispatchers.IO)) instead.
GlobalScope.async {
lidlProductExtractor.extract(productUrl).copy(category = category, picUrl = picUrl)
}
}
}

val name = it.attr("data-name")
val oldPrice =
it
.select(".lidl-m-pricebox__discount-price")
.textNodes()
.takeIf { tn -> tn.isNotEmpty() }
?.first()
?.text()
val newPrice = it.selectFirst(".lidl-m-pricebox__price")?.text()
val quantity = it.selectFirst(".lidl-m-pricebox__basic-quantity")?.text()

var picUrl =
it
.select("picture")
.select("source[data-srcset]")
.eachAttr("data-srcset")
.firstOrNull { srcSet -> srcSet.contains("/sm/") }
?.split(",")
?.map { picUrl -> picUrl.replace(Regex("\\dx\\s*\$"), "").trim() }
?.firstOrNull { picUrl -> urlValidator.isValid(picUrl) }

if (Objects.isNull(picUrl)) {
picUrl = it.select("picture").select("source").attr("srcset")
}

Product(
name = StringUtils.normalizeSpace(name),
price = normalizePrice(newPrice),
oldPrice = normalizePrice(oldPrice),
quantity = StringUtils.normalizeSpace(quantity),
picUrl = picUrl,
category = category,
validFrom = dateRange?.elementAtOrNull(0),
validUntil = dateRange?.elementAtOrNull(1),
)
}

// Blocks the calling thread until every deferred product has been awaited.
return runBlocking { deferredProducts.awaitAll() }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package com.stefanbratanov.sofiasupermarketsapi.extractors

import assertk.assertThat
import assertk.assertions.isEqualTo
import assertk.assertions.isEqualToIgnoringGivenProperties
import com.stefanbratanov.sofiasupermarketsapi.getUri
import com.stefanbratanov.sofiasupermarketsapi.model.Product
import java.time.LocalDate
import java.time.Month
import org.junit.jupiter.api.Test

/** Checks [LidlProductExtractor] against the stored page `/extractors/lidl/input-single.html`. */
internal class LidlProductExtractorTest {

  private val underTest = LidlProductExtractor()

  @Test
  fun `test extracting product`() {
    val actual = underTest.extract(getUri("/extractors/lidl/input-single.html").toURL())

    // The validity dates carry the current year, so they are excluded here and checked
    // field by field below.
    assertThat(actual)
      .isEqualToIgnoringGivenProperties(
        Product(
          name = "Багета Рустик",
          quantity = "250 g/бр.",
          price = 0.99,
          oldPrice = 1.99,
          category = null,
          picUrl = null,
        ),
        Product::validFrom,
        Product::validUntil,
      )

    val start = actual.validFrom
    assertThat(start?.dayOfMonth).isEqualTo(30)
    assertThat(start?.month).isEqualTo(Month.OCTOBER)
    assertThat(start?.year).isEqualTo(LocalDate.now().year)

    val end = actual.validUntil
    assertThat(end?.dayOfMonth).isEqualTo(5)
    assertThat(end?.month).isEqualTo(Month.NOVEMBER)
    assertThat(end?.year).isEqualTo(LocalDate.now().year)
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,57 +3,64 @@ package com.stefanbratanov.sofiasupermarketsapi.extractors
import assertk.assertThat
import assertk.assertions.isNotEmpty
import com.stefanbratanov.sofiasupermarketsapi.getUri
import com.stefanbratanov.sofiasupermarketsapi.model.Product
import com.stefanbratanov.sofiasupermarketsapi.readResource
import com.stefanbratanov.sofiasupermarketsapi.testObjectMapper
import io.mockk.every
import io.mockk.mockk
import java.net.URL
import java.time.LocalDate
import org.junit.jupiter.api.Disabled
import org.junit.jupiter.api.Test
import org.skyscreamer.jsonassert.Customization
import org.skyscreamer.jsonassert.JSONAssert
import org.skyscreamer.jsonassert.JSONCompareMode
import org.skyscreamer.jsonassert.comparator.CustomComparator

// NOTE(review): this listing is a rendered diff whose +/- markers were lost, so removed and added
// lines are interleaved (e.g. two "underTest" and two "lidlUrl" declarations); it does not compile
// as shown.
internal class LidlProductsExtractorTest {

private val objectMapper = testObjectMapper()

private val underTest = LidlProductsExtractor()
// Mocked so that the listing-page test does not have to fetch individual product pages.
private val lidlProductExtractor: LidlProductExtractor = mockk()

private val underTest = LidlProductsExtractor(URL("https://www.lidl.bg"), lidlProductExtractor)

@Test
fun `test extracting products`() {
// Every product URL resolves to the same stubbed product.
every { lidlProductExtractor.extract(any()) } returns
Product(
name = "foo",
"1 кг",
8.99,
10.99,
category = null,
picUrl = null,
validFrom = LocalDate.of(1993, 7, 28),
validUntil = LocalDate.of(2023, 11, 4)
)

val testHtmlUrl = getUri("/extractors/lidl/input.html").toURL()

val products = underTest.extract(testHtmlUrl)

// The extracted products are compared in their JSON form against a stored expectation.
val actualJson = objectMapper.writeValueAsString(products)
val expectedJson = readResource("/extractors/lidl/expected.json")

// Compares two date fields after moving the expected date into the current year.
val customization: (actualField: Any, expectedField: Any) -> Boolean =
{ actualField, expectedField ->
LocalDate.parse(expectedField.toString())
.withYear(LocalDate.now().year)
.equals(LocalDate.parse(actualField.toString()))
}

// use current year for comparison
JSONAssert.assertEquals(
expectedJson,
actualJson,
CustomComparator(
JSONCompareMode.STRICT,
Customization("[*].validFrom", customization),
Customization("[*].validUntil", customization),
),
)
JSONAssert.assertEquals(expectedJson, actualJson, JSONCompareMode.STRICT)
}

// Disabled by default; per the annotation text it is meant to be run by hand.
@Test
@Disabled("used for manual testing")
fun `test fetching from real url`() {
val lidlUrl = URL("https://www.lidl.bg/bg/c/niska-cena-visoko-kachestvo/c1847/w1")
// use real product extractor
val underTest = LidlProductsExtractor(URL("https://www.lidl.bg"), LidlProductExtractor())
val lidlUrl =
URL(
"https://www.lidl.bg/c/niska-tsena-visoko-kachestvo/a10031916?channel=store&tabCode=Current_Sales_Week"
)
val products = underTest.extract(lidlUrl)

products.forEach { println(it) }

assertThat(products).isNotEmpty()
}
}
Loading

0 comments on commit 64b9123

Please sign in to comment.