Skip to content

Commit

Permalink
Merge pull request #956 from mediathekview/feature/955
Browse files Browse the repository at this point in the history
Feature/955
  • Loading branch information
pidoubleyou authored Jan 7, 2024
2 parents 5a91a39 + d2cf832 commit 462eb30
Show file tree
Hide file tree
Showing 10 changed files with 1,259 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,21 +99,24 @@ private Queue<TopicUrlDTO> getLetterEntries() throws InterruptedException, Execu
@Override
protected RecursiveTask<Set<Film>> createCrawlerTask() {
try {
boolean processMoreEpisodes = false;

final Queue<TopicUrlDTO> shows = new ConcurrentLinkedQueue<>();

if (Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())) {
shows.addAll(getArchiveEntries());

addShows(shows, getLetterEntries());
processMoreEpisodes = true;
} else {
addShows(shows, getDaysEntries());
processMoreEpisodes = false;
}
addShows(shows, getDaysEntries());

printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
getAndSetMaxCount(shows.size());

return new OrfFilmDetailTask(this, shows);
return new OrfFilmDetailTask(this, shows, processMoreEpisodes);
} catch (final InterruptedException ex) {
LOG.debug("{} crawler interrupted.", getSender().getName(), ex);
Thread.currentThread().interrupt();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package de.mediathekview.mserver.crawler.orf.json;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.base.utils.UrlUtils;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.crawler.orf.OrfConstants;

import java.lang.reflect.Type;
import java.util.Optional;

/**
 * Gson deserializer that reads the {@code url} attribute of a JSON object and wraps it in a
 * {@link CrawlerUrlDTO}.
 *
 * <p>The attribute value is passed through {@link UrlUtils#addDomainIfMissing} together with
 * {@link OrfConstants#URL_BASE} before the DTO is created.
 */
public class OrfMoreEpisodesDeserializer implements JsonDeserializer<CrawlerUrlDTO> {

  private static final String ATTRIBUTE_URL = "url";

  /**
   * Extracts the url from the given JSON element.
   *
   * @param jsonElement the element to read; expected to be a JSON object
   * @param type unused
   * @param jsonDeserializationContext unused
   * @return the DTO holding the url, or {@code null} if the element is not a JSON object or
   *     {@link JsonUtils#getAttributeAsString} yields no value for the {@code url} attribute
   */
  @Override
  public CrawlerUrlDTO deserialize(
      final JsonElement jsonElement,
      final Type type,
      final JsonDeserializationContext jsonDeserializationContext) {

    // getAsJsonObject() throws IllegalStateException for arrays/primitives/null;
    // treat such input like a missing url instead of aborting the caller.
    if (jsonElement == null || !jsonElement.isJsonObject()) {
      return null;
    }

    return JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL)
        .map(url -> new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(url, OrfConstants.URL_BASE)))
        .orElse(null);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package de.mediathekview.mserver.crawler.orf.parser;

import de.mediathekview.mserver.base.HtmlConsts;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.nodes.Document;

/**
 * Parses a "more episodes" HTML document into a list of episode urls.
 *
 * <p>Every element matching {@code article.b-teaser > a.teaser-link} contributes one
 * {@link TopicUrlDTO} built from the given topic and the element's href attribute value.
 */
public class OrfMoreEpisodesParser {
  private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link";

  /**
   * Collects the episode links of the document.
   *
   * @param document the document to search
   * @param topic the topic assigned to every resulting entry
   * @return a new mutable list with one entry per matching link, in document order
   */
  public List<TopicUrlDTO> parse(final Document document, final String topic) {
    final List<TopicUrlDTO> episodes = new ArrayList<>();

    document.select(EPISODES_SELECTOR).stream()
        .map(link -> link.attr(HtmlConsts.ATTRIBUTE_HREF))
        .map(href -> new TopicUrlDTO(topic, href))
        .forEachOrdered(episodes::add);

    return episodes;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@
import de.mediathekview.mlib.daten.GeoLocations;
import de.mediathekview.mlib.daten.Resolution;
import de.mediathekview.mserver.base.utils.HtmlDocumentUtils;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
import de.mediathekview.mserver.crawler.basic.AbstractUrlTask;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.crawler.basic.*;
import de.mediathekview.mserver.crawler.orf.OrfEpisodeInfoDTO;
import de.mediathekview.mserver.crawler.orf.OrfVideoInfoDTO;
import de.mediathekview.mserver.crawler.orf.json.OrfMoreEpisodesDeserializer;
import de.mediathekview.mserver.crawler.orf.parser.OrfMoreEpisodesParser;
import de.mediathekview.mserver.crawler.orf.parser.OrfPlaylistDeserializer;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.lang.reflect.Type;
import java.net.MalformedURLException;
import java.net.URL;
Expand All @@ -29,6 +29,7 @@
import java.time.format.DateTimeParseException;
import java.time.temporal.ChronoUnit;
import java.util.*;
import java.util.concurrent.ConcurrentLinkedQueue;

public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {

Expand All @@ -40,21 +41,25 @@ public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {
private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration";
private static final String DESCRIPTION_SELECTOR = ".description-container .description-text";
private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist";
private static final String MORE_EPISODES_SELECTOR = "div.more-episodes";

private static final String ATTRIBUTE_DATETIME = "datetime";
private static final String ATTRIBUTE_DATA_JSB = "data-jsb";

private static final String PREFIX_AUDIO_DESCRIPTION = "AD |";

private static final DateTimeFormatter DATE_TIME_FORMATTER =
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken<CrawlerUrlDTO>() {}.getType();
private static final Type LIST_EPISODEINFO_TYPE_TOKEN =
new TypeToken<List<OrfEpisodeInfoDTO>>() {}.getType();
private final boolean processMoreEpisodes;

public OrfFilmDetailTask(
final AbstractCrawler aCrawler, final Queue<TopicUrlDTO> aUrlToCrawlDtos) {
final AbstractCrawler aCrawler, final Queue<TopicUrlDTO> aUrlToCrawlDtos, boolean processMoreEpisodes) {
super(aCrawler, aUrlToCrawlDtos);

this.processMoreEpisodes = processMoreEpisodes;
}

private static Optional<LocalDateTime> parseDate(final Document aDocument) {
Expand Down Expand Up @@ -147,12 +152,22 @@ protected void processDocument(final TopicUrlDTO aUrlDto, final Document aDocume
episode.getDuration());
}
}

if (processMoreEpisodes) {
final List<TopicUrlDTO> topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDto.getTopic());
topicUrlDTOS.remove(aUrlDto);
processMoreEpisodes(topicUrlDTOS);
}
}

@Override
protected AbstractUrlTask<Film, TopicUrlDTO> createNewOwnInstance(
final Queue<TopicUrlDTO> aUrlsToCrawl) {
return new OrfFilmDetailTask(crawler, aUrlsToCrawl);
return createNewOwnInstance(aUrlsToCrawl, processMoreEpisodes);
}

/**
 * Creates a new {@link OrfFilmDetailTask} for the same crawler.
 *
 * @param urlsToCrawl the urls the new task should process
 * @param followMoreEpisodes value handed to the new task as its {@code processMoreEpisodes} flag
 * @return the new task
 */
private AbstractUrlTask<Film, TopicUrlDTO> createNewOwnInstance(
    final Queue<TopicUrlDTO> urlsToCrawl, final boolean followMoreEpisodes) {
  return new OrfFilmDetailTask(crawler, urlsToCrawl, followMoreEpisodes);
}

private void createFilm(
Expand Down Expand Up @@ -255,4 +270,37 @@ private List<OrfEpisodeInfoDTO> parseEpisodes(final Document aDocument) {

return new ArrayList<>();
}

/**
 * Determines the further episodes referenced by a film page.
 *
 * <p>Reads the {@code data-jsb} attribute of the {@code div.more-episodes} element, deserializes
 * it into a url, requests that url and parses the response with {@link OrfMoreEpisodesParser}.
 *
 * @param document the film page
 * @param topic the topic assigned to every resulting entry
 * @return the episodes found; an empty list if the attribute is missing, no url could be
 *     deserialized or loading the url failed. The list is mutable because the caller removes
 *     entries from it.
 */
private List<TopicUrlDTO> parseMoreEpisodes(final Document document, final String topic) {
  final Optional<String> json =
      HtmlDocumentUtils.getElementAttributeString(
          MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document);
  if (!json.isPresent()) {
    return new ArrayList<>();
  }

  final Gson gson =
      new GsonBuilder()
          .registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer())
          .create();

  final CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN);
  if (moreEpisodesUrl == null) {
    return new ArrayList<>();
  }

  try {
    final Document moreEpisodesDocument =
        crawler.requestBodyAsHtmlDocument(moreEpisodesUrl.getUrl());
    return new OrfMoreEpisodesParser().parse(moreEpisodesDocument, topic);
  } catch (final IOException e) {
    // Pass the exception as trailing argument so the cause and stack trace are logged.
    LOG.error(
        "OrfFilmDetailTask: loading more episodes url {} failed.", moreEpisodesUrl.getUrl(), e);
    crawler.incrementAndGetErrorCount();
    return new ArrayList<>();
  }
}

/**
 * Crawls the given additional episodes in a separate task and adds its results to
 * {@code taskResults}.
 *
 * <p>The sub task is created with the more-episodes flag set to {@code false}. Nothing happens
 * if the list is {@code null} or empty.
 *
 * @param moreFilms the urls of the additional episodes
 */
private void processMoreEpisodes(final List<TopicUrlDTO> moreFilms) {
  if (moreFilms == null || moreFilms.isEmpty()) {
    return;
  }

  final OrfFilmDetailTask subTask =
      (OrfFilmDetailTask) createNewOwnInstance(new ConcurrentLinkedQueue<>(moreFilms), false);
  subTask.fork();
  taskResults.addAll(subTask.join());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package de.mediathekview.mserver.crawler.orf.json;

import com.google.gson.JsonElement;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.testhelper.JsonFileReader;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

/** Tests for {@link OrfMoreEpisodesDeserializer}. */
class OrfMoreEpisodesDeserializerTest {

  /** The url contained in the JSON fixture is returned as {@link CrawlerUrlDTO}. */
  @Test
  void testDeserialize() {
    final String expectedUrl =
        "https://tvthek.orf.at/lane-plus/other_episodes_of_profile?profileId=13895917&profileSlug=Biester";
    final JsonElement fixture = JsonFileReader.readJson("/orf/orf_film_more_episodes.json");

    final CrawlerUrlDTO result =
        new OrfMoreEpisodesDeserializer().deserialize(fixture, null, null);

    assertNotNull(result);
    assertEquals(expectedUrl, result.getUrl());
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package de.mediathekview.mserver.crawler.orf.parser;

import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.testhelper.FileReader;
import org.hamcrest.MatcherAssert;
import org.hamcrest.Matchers;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

/** Tests for {@link OrfMoreEpisodesParser}. */
class OrfMoreEpisodesParserTest {

  /** All ten episode links of the HTML fixture are returned with the given topic. */
  @Test
  void parseDocumentWithEpisodes() {
    final String html = FileReader.readFile("/orf/orf_film_more_episodes.html");
    final Document fixture = Jsoup.parse(html);

    final List<TopicUrlDTO> episodes = new OrfMoreEpisodesParser().parse(fixture, "Biester");

    final TopicUrlDTO[] expectedEpisodes = {
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-9/14207236"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-8/14207235"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-7/14207234"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-6/14207233"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-5/14207232"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-4/14207231"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-3/14207230"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-2/14207229"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Alle-Folgen-jetzt-Biester-1-10/14207227"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-10/14207252"),
    };

    assertEquals(10, episodes.size());
    MatcherAssert.assertThat(episodes, Matchers.containsInAnyOrder(expectedEpisodes));
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public OrfFilmDetailTaskTestBase() {
}

protected Set<Film> executeTask(OrfCrawler crawler, String aTheme, String aRequestUrl) {
return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl))
return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl), false)
.invoke();
}
}
Loading

0 comments on commit 462eb30

Please sign in to comment.