Skip to content

Commit

Permalink
search more episodes
Browse files Browse the repository at this point in the history
  • Loading branch information
pidoubleyou committed Jan 6, 2024
1 parent 5a91a39 commit 0c3cb99
Show file tree
Hide file tree
Showing 10 changed files with 1,257 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,21 +99,23 @@ private Queue<TopicUrlDTO> getLetterEntries() throws InterruptedException, Execu
@Override
protected RecursiveTask<Set<Film>> createCrawlerTask() {
try {
boolean processMoreEpisodes = false;

final Queue<TopicUrlDTO> shows = new ConcurrentLinkedQueue<>();

if (Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())) {
shows.addAll(getArchiveEntries());

addShows(shows, getLetterEntries());
processMoreEpisodes = true;
}
addShows(shows, getDaysEntries());

printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
getAndSetMaxCount(shows.size());

return new OrfFilmDetailTask(this, shows);
// TODO solve the problem with "Sport aktuell" and similar shows => set "more episodes" per show (topic yes, day no?)
return new OrfFilmDetailTask(this, shows, processMoreEpisodes);
} catch (final InterruptedException ex) {
LOG.debug("{} crawler interrupted.", getSender().getName(), ex);
Thread.currentThread().interrupt();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package de.mediathekview.mserver.crawler.orf.json;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.base.utils.UrlUtils;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.crawler.orf.OrfConstants;

import java.lang.reflect.Type;
import java.util.Optional;

/**
 * Gson deserializer that turns the JSON configuration of the "more episodes" element into a
 * {@link CrawlerUrlDTO}.
 *
 * <p>Only the {@code url} attribute of the JSON object is read. Its value is passed through
 * {@link UrlUtils#addDomainIfMissing} together with {@link OrfConstants#URL_BASE} before it is
 * wrapped in the DTO.
 */
public class OrfMoreEpisodesDeserializer implements JsonDeserializer<CrawlerUrlDTO> {

  private static final String ATTRIBUTE_URL = "url";

  /**
   * Extracts the more-episodes URL from the given JSON element.
   *
   * @param jsonElement the JSON to read; expected to be a JSON object
   * @param type unused
   * @param jsonDeserializationContext unused
   * @return the URL DTO, or {@code null} if the element is not a JSON object or no {@code url}
   *     string could be read from it
   */
  @Override
  public CrawlerUrlDTO deserialize(
      JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {

    // getAsJsonObject() throws IllegalStateException for arrays and primitives; treat such input
    // like a missing url instead of letting the exception escape into the calling task.
    if (jsonElement == null || !jsonElement.isJsonObject()) {
      return null;
    }

    return JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL)
        .map(url -> new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(url, OrfConstants.URL_BASE)))
        .orElse(null);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package de.mediathekview.mserver.crawler.orf.parser;

import de.mediathekview.mserver.base.HtmlConsts;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.nodes.Document;

/** Extracts the episode links from an ORF "more episodes" HTML document. */
public class OrfMoreEpisodesParser {
  private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link";

  /**
   * Collects the link target of every teaser link in the document.
   *
   * @param document the HTML document to search
   * @param topic the topic assigned to every resulting entry
   * @return a new, mutable list with one entry per teaser link that carries a non-empty link
   *     target; empty if nothing matched, never {@code null}
   */
  public List<TopicUrlDTO> parse(final Document document, final String topic) {
    final List<TopicUrlDTO> result = new ArrayList<>();

    document
        .select(EPISODES_SELECTOR)
        .forEach(
            episode -> {
              final String url = episode.attr(HtmlConsts.ATTRIBUTE_HREF);
              // jsoup returns "" for a missing attribute; an entry without a URL cannot be
              // crawled, so it is skipped instead of being handed on.
              if (!url.isEmpty()) {
                result.add(new TopicUrlDTO(topic, url));
              }
            });

    return result;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@
import de.mediathekview.mlib.daten.GeoLocations;
import de.mediathekview.mlib.daten.Resolution;
import de.mediathekview.mserver.base.utils.HtmlDocumentUtils;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
import de.mediathekview.mserver.crawler.basic.AbstractUrlTask;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.crawler.basic.*;
import de.mediathekview.mserver.crawler.orf.OrfEpisodeInfoDTO;
import de.mediathekview.mserver.crawler.orf.OrfVideoInfoDTO;
import de.mediathekview.mserver.crawler.orf.json.OrfMoreEpisodesDeserializer;
import de.mediathekview.mserver.crawler.orf.parser.OrfMoreEpisodesParser;
import de.mediathekview.mserver.crawler.orf.parser.OrfPlaylistDeserializer;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.lang.reflect.Type;
import java.net.MalformedURLException;
import java.net.URL;
Expand All @@ -29,6 +29,7 @@
import java.time.format.DateTimeParseException;
import java.time.temporal.ChronoUnit;
import java.util.*;
import java.util.concurrent.ConcurrentLinkedQueue;

public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {

Expand All @@ -40,21 +41,25 @@ public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {
// CSS selectors used on the film detail page.
private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration";
private static final String DESCRIPTION_SELECTOR = ".description-container .description-text";
private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist";
// Element whose data-jsb attribute holds the JSON describing the "more episodes" list.
private static final String MORE_EPISODES_SELECTOR = "div.more-episodes";

// HTML attribute names read from the selected elements.
private static final String ATTRIBUTE_DATETIME = "datetime";
private static final String ATTRIBUTE_DATA_JSB = "data-jsb";

// NOTE(review): presumably the title prefix marking audio-description variants; usage is not
// visible in this excerpt, confirm before relying on it.
private static final String PREFIX_AUDIO_DESCRIPTION = "AD |";

private static final DateTimeFormatter DATE_TIME_FORMATTER =
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

// Gson target type for which the OrfMoreEpisodesDeserializer is registered.
private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken<CrawlerUrlDTO>() {}.getType();
private static final Type LIST_EPISODEINFO_TYPE_TOKEN =
new TypeToken<List<OrfEpisodeInfoDTO>>() {}.getType();
// When true, processDocument additionally loads and crawls the "more episodes" list of a page.
private final boolean processMoreEpisodes;

/**
 * Creates a detail task for the given queue of film page URLs.
 *
 * @param aCrawler the crawler this task works for; handed to the superclass
 * @param aUrlToCrawlDtos the URLs to process; handed to the superclass
 * @param processMoreEpisodes whether the "more episodes" list of each processed page should be
 *     loaded and crawled as well
 */
public OrfFilmDetailTask(
final AbstractCrawler aCrawler, final Queue<TopicUrlDTO> aUrlToCrawlDtos) {
final AbstractCrawler aCrawler, final Queue<TopicUrlDTO> aUrlToCrawlDtos, boolean processMoreEpisodes) {
super(aCrawler, aUrlToCrawlDtos);

this.processMoreEpisodes = processMoreEpisodes;
}

private static Optional<LocalDateTime> parseDate(final Document aDocument) {
Expand Down Expand Up @@ -147,12 +152,22 @@ protected void processDocument(final TopicUrlDTO aUrlDto, final Document aDocume
episode.getDuration());
}
}

if (processMoreEpisodes) {
final List<TopicUrlDTO> topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDto.getTopic());
topicUrlDTOS.remove(aUrlDto);
processMoreEpisodes(topicUrlDTOS);
}
}

/**
 * Creates another {@code OrfFilmDetailTask} for the given URLs that uses the same
 * more-episodes setting as this task.
 *
 * <p>NOTE(review): presumably invoked by the base class when it splits the URL queue into
 * sub-tasks; the caller is not visible here.
 *
 * @param aUrlsToCrawl the URLs the new task should process
 * @return the new task instance
 */
@Override
protected AbstractUrlTask<Film, TopicUrlDTO> createNewOwnInstance(
    final Queue<TopicUrlDTO> aUrlsToCrawl) {
  // Hand on this task's own flag instead of a hard-coded true: a task that was created with
  // more-episodes handling disabled must not produce instances that switch it back on,
  // otherwise pages found via a more-episodes list would be expanded again.
  return createNewOwnInstance(aUrlsToCrawl, processMoreEpisodes);
}

/**
 * Builds a new {@code OrfFilmDetailTask} for the same crawler.
 *
 * @param urlsToCrawl the URLs the new task should process
 * @param processMoreEpisodes whether the new task should follow "more episodes" lists
 * @return the new task instance
 */
private AbstractUrlTask<Film, TopicUrlDTO> createNewOwnInstance(
    final Queue<TopicUrlDTO> urlsToCrawl, boolean processMoreEpisodes) {
  final OrfFilmDetailTask instance =
      new OrfFilmDetailTask(crawler, urlsToCrawl, processMoreEpisodes);
  return instance;
}

private void createFilm(
Expand Down Expand Up @@ -255,4 +270,37 @@ private List<OrfEpisodeInfoDTO> parseEpisodes(final Document aDocument) {

return new ArrayList<>();
}

/**
 * Loads the further episodes referenced by the "more episodes" element of a film page.
 *
 * <p>The element's {@code data-jsb} attribute is deserialized into the URL of the episode list,
 * that list is requested through the crawler and parsed into one entry per episode.
 *
 * @param document the film detail page
 * @param topic the topic assigned to every returned entry
 * @return a new, mutable list of episode URLs; empty if the page has no more-episodes element,
 *     no URL could be read from it, or loading the list failed
 */
private List<TopicUrlDTO> parseMoreEpisodes(final Document document, final String topic) {
  final Optional<String> json =
      HtmlDocumentUtils.getElementAttributeString(
          MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document);
  if (!json.isPresent()) {
    return new ArrayList<>();
  }

  final Gson gson =
      new GsonBuilder()
          .registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer())
          .create();

  final CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN);
  if (moreEpisodesUrl == null) {
    return new ArrayList<>();
  }

  try {
    final Document moreEpisodesDocument =
        crawler.requestBodyAsHtmlDocument(moreEpisodesUrl.getUrl());
    return new OrfMoreEpisodesParser().parse(moreEpisodesDocument, topic);
  } catch (final IOException e) {
    // The exception is passed as trailing argument so its cause and stack trace end up in the
    // log instead of being lost.
    LOG.error(
        "OrfFilmDetailTask: loading more episodes url {} failed.", moreEpisodesUrl.getUrl(), e);
    crawler.incrementAndGetErrorCount();
    return new ArrayList<>();
  }
}

/**
 * Crawls the given episode pages in a separate task and adds the films it finds to this
 * task's results. The separate task is created with more-episodes handling switched off.
 *
 * @param moreFilms the episode pages to crawl; {@code null} or empty means nothing to do
 */
private void processMoreEpisodes(final List<TopicUrlDTO> moreFilms) {
  if (moreFilms == null || moreFilms.isEmpty()) {
    return;
  }

  final Queue<TopicUrlDTO> pendingEpisodes = new ConcurrentLinkedQueue<>(moreFilms);
  final OrfFilmDetailTask episodesTask =
      (OrfFilmDetailTask) createNewOwnInstance(pendingEpisodes, false);
  episodesTask.fork();
  taskResults.addAll(episodesTask.join());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package de.mediathekview.mserver.crawler.orf.json;

import com.google.gson.JsonElement;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.testhelper.JsonFileReader;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

/** Checks that {@link OrfMoreEpisodesDeserializer} reads the more-episodes URL from a fixture. */
class OrfMoreEpisodesDeserializerTest {

  @Test
  void testDeserialize() {
    final OrfMoreEpisodesDeserializer deserializer = new OrfMoreEpisodesDeserializer();
    final JsonElement fixture = JsonFileReader.readJson("/orf/orf_film_more_episodes.json");

    final CrawlerUrlDTO result = deserializer.deserialize(fixture, null, null);

    assertNotNull(result);
    assertEquals(
        "https://tvthek.orf.at/lane-plus/other_episodes_of_profile?profileId=13895917&profileSlug=Biester",
        result.getUrl());
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package de.mediathekview.mserver.crawler.orf.parser;

import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.testhelper.FileReader;
import org.hamcrest.MatcherAssert;
import org.hamcrest.Matchers;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

/** Checks that {@link OrfMoreEpisodesParser} finds all episode links of a fixture page. */
class OrfMoreEpisodesParserTest {
  private static final String TOPIC = "Biester";

  @Test
  void parseDocumentWithEpisodes() {
    final TopicUrlDTO[] expectedEpisodes = {
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-9/14207236"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-8/14207235"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-7/14207234"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-6/14207233"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-5/14207232"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-4/14207231"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-3/14207230"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-2/14207229"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Alle-Folgen-jetzt-Biester-1-10/14207227"),
      episode("https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-10/14207252"),
    };

    final Document html = Jsoup.parse(FileReader.readFile("/orf/orf_film_more_episodes.html"));

    final List<TopicUrlDTO> parsed = new OrfMoreEpisodesParser().parse(html, TOPIC);

    assertEquals(10, parsed.size());
    MatcherAssert.assertThat(parsed, Matchers.containsInAnyOrder(expectedEpisodes));
  }

  /** Builds the expected entry for an episode URL of the test topic. */
  private static TopicUrlDTO episode(final String url) {
    return new TopicUrlDTO(TOPIC, url);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public OrfFilmDetailTaskTestBase() {
}

/**
 * Runs an {@code OrfFilmDetailTask} on the URL DTO(s) built by
 * {@code createCrawlerUrlDto(aTheme, aRequestUrl)} and returns the result of {@code invoke()}.
 * The task is created with more-episodes processing disabled.
 *
 * @param crawler the crawler the task is created for
 * @param aTheme the theme passed to {@code createCrawlerUrlDto}
 * @param aRequestUrl the request URL passed to {@code createCrawlerUrlDto}
 * @return the films produced by the task
 */
protected Set<Film> executeTask(OrfCrawler crawler, String aTheme, String aRequestUrl) {
return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl))
return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl), false)
.invoke();
}
}
Loading

0 comments on commit 0c3cb99

Please sign in to comment.