From 8c2b920b3548f95146c25e297b9371a0793b30fd Mon Sep 17 00:00:00 2001 From: Conner Bradley Date: Thu, 8 Sep 2022 15:32:58 -0400 Subject: [PATCH 1/2] Fix breaking changes between JSoup 1.14.2 to 1.15.3 JSoup replaced org.jsoup.safety.Whitelist with org.jsoup.safety.Safelist. Purely a naming change, no difference of functionality. This bumps the JSoup version (re: https://github.com/jagrosh/JLyrics/pull/10) and handles the breaking changes. This is needed as the related MusicBot project has recently bumped the JSoup version to 1.15.3 (re: https://github.com/jagrosh/MusicBot/commit/a7be2c4602680cdfc48b7d19cb90cb19b3ba7151) causing runtime errors as it depends on JLyrics, which depends on a different JSoup version. --- pom.xml | 2 +- src/main/java/com/jagrosh/jlyrics/LyricsClient.java | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index c0e895f..8f88d44 100644 --- a/pom.xml +++ b/pom.xml @@ -10,7 +10,7 @@ org.jsoup jsoup - 1.14.2 + 1.15.3 jar diff --git a/src/main/java/com/jagrosh/jlyrics/LyricsClient.java b/src/main/java/com/jagrosh/jlyrics/LyricsClient.java index 319b0d4..a7169df 100644 --- a/src/main/java/com/jagrosh/jlyrics/LyricsClient.java +++ b/src/main/java/com/jagrosh/jlyrics/LyricsClient.java @@ -31,7 +31,7 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Document.OutputSettings; import org.jsoup.nodes.Element; -import org.jsoup.safety.Whitelist; +import org.jsoup.safety.Safelist; /** * @@ -42,7 +42,7 @@ public class LyricsClient private final Config config = ConfigFactory.load(); private final HashMap cache = new HashMap<>(); private final OutputSettings noPrettyPrint = new OutputSettings().prettyPrint(false); - private final Whitelist newlineWhitelist = Whitelist.none().addTags("br", "p"); + private final Safelist newlineSafelist = Safelist.none().addTags("br", "p"); private final Executor executor; private final String defaultSource, userAgent; private final int timeout; @@ -178,6 +178,6 @@ public 
CompletableFuture getLyrics(String search, String source) private String cleanWithNewlines(Element element) { - return Jsoup.clean(Jsoup.clean(element.html(), newlineWhitelist), "", Whitelist.none(), noPrettyPrint); + return Jsoup.clean(Jsoup.clean(element.html(), newlineSafelist), "", Safelist.none(), noPrettyPrint); } } From c7a9b3f0aad14c6b10f68e79b7a9de05bd00b88c Mon Sep 17 00:00:00 2001 From: Conner Bradley Date: Mon, 12 Sep 2022 10:03:03 -0400 Subject: [PATCH 2/2] Fix errors from azLyrics and genius #9 To fix test suite errors related to Genius, the configuration selectors had to be updated. To fix the AZLyrics tests: - AZLyrics generates a token that is embedded on the page - The token does not come from a standard API endpoint; rather, it is inlined in a JS file that then embeds the token on input form elements as a hidden attribute (unfortunately, it cannot be extracted from the initial page returned from the server) To resolve this, I added an optional "token" config section for AZLyrics that fetches the JS file, extracts the token, and then uses it on subsequent requests. 
--- .../com/jagrosh/jlyrics/LyricsClient.java | 130 +++++++++++++----- src/main/resources/reference.conf | 13 +- 2 files changed, 105 insertions(+), 38 deletions(-) diff --git a/src/main/java/com/jagrosh/jlyrics/LyricsClient.java b/src/main/java/com/jagrosh/jlyrics/LyricsClient.java index a7169df..a245118 100644 --- a/src/main/java/com/jagrosh/jlyrics/LyricsClient.java +++ b/src/main/java/com/jagrosh/jlyrics/LyricsClient.java @@ -23,6 +23,9 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executor; import java.util.concurrent.Executors; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import org.json.JSONException; import org.json.JSONObject; import org.json.XML; @@ -122,49 +125,60 @@ public CompletableFuture getLyrics(String search, String source) return CompletableFuture.completedFuture(cache.get(cacheKey)); try { - String searchUrl = String.format(config.getString("lyrics." + source + ".search.url"), search); + CompletableFuture futureToken; boolean jsonSearch = config.getBoolean("lyrics." + source + ".search.json"); String select = config.getString("lyrics." + source + ".search.select"); String titleSelector = config.getString("lyrics." + source + ".parse.title"); String authorSelector = config.getString("lyrics." + source + ".parse.author"); String contentSelector = config.getString("lyrics." + source + ".parse.content"); - return CompletableFuture.supplyAsync(() -> - { - try + + if (config.hasPath("lyrics." + source + ".token")) { + futureToken = getToken(source); + } else { + futureToken = CompletableFuture.completedFuture(""); + } + + return futureToken.thenCompose(token -> { + String searchUrl = String.format(config.getString("lyrics." 
+ source + ".search.url"), search, token); + + return CompletableFuture.supplyAsync(() -> { - Document doc; - Connection connection = Jsoup.connect(searchUrl).userAgent(userAgent).timeout(timeout); - if(jsonSearch) + try { - String body = connection.ignoreContentType(true).execute().body(); - JSONObject json = new JSONObject(body); - doc = Jsoup.parse(XML.toString(json)); + Document doc; + Connection connection = Jsoup.connect(searchUrl).userAgent(userAgent).timeout(timeout); + if(jsonSearch) + { + String body = connection.ignoreContentType(true).execute().body(); + JSONObject json = new JSONObject(body); + doc = Jsoup.parse(XML.toString(json)); + } + else + doc = connection.get(); + + Element urlElement = doc.selectFirst(select); + String url; + if(jsonSearch) + url = urlElement.text(); + else + url = urlElement.attr("abs:href"); + if(url==null || url.isEmpty()) + return null; + doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get(); + Lyrics lyrics = new Lyrics(doc.selectFirst(titleSelector).ownText(), + doc.selectFirst(authorSelector).ownText(), + cleanWithNewlines(doc.selectFirst(contentSelector)), + url, + source); + cache.put(cacheKey, lyrics); + return lyrics; } - else - doc = connection.get(); - - Element urlElement = doc.selectFirst(select); - String url; - if(jsonSearch) - url = urlElement.text(); - else - url = urlElement.attr("abs:href"); - if(url==null || url.isEmpty()) + catch(IOException | NullPointerException | JSONException ex) + { return null; - doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get(); - Lyrics lyrics = new Lyrics(doc.selectFirst(titleSelector).ownText(), - doc.selectFirst(authorSelector).ownText(), - cleanWithNewlines(doc.selectFirst(contentSelector)), - url, - source); - cache.put(cacheKey, lyrics); - return lyrics; - } - catch(IOException | NullPointerException | JSONException ex) - { - return null; - } - }, executor); + } + }, executor); + }); } catch(ConfigException ex) { @@ -175,7 +189,53 @@ public 
CompletableFuture getLyrics(String search, String source) return null; } } - + + private CompletableFuture getToken(String source) { + try { + String tokenUrl = config.getString("lyrics." + source + ".token.url"); + String select = config.getString("lyrics." + source + ".token.select"); + boolean textSearch = config.getBoolean("lyrics." + source + ".token.text"); + + return CompletableFuture.supplyAsync(() -> { + try { + Pattern pattern = null; + + // Optional regex for post-processing + // Helpful if token is not accessible using HTML accessors (e.g, inlined in a JS file) + if (config.hasPath("lyrics." + source + ".token.regex")) { + String regexPattern = config.getString("lyrics." + source + ".token.regex"); + pattern = Pattern.compile(regexPattern); + } + + Connection connection = Jsoup.connect(tokenUrl).userAgent(userAgent).timeout(timeout); + String body; + + if (textSearch) { + body = connection.ignoreContentType(true).execute().body(); + } else { + // HTML -- apply selectors to derive body string + Document doc = connection.get(); + body = doc.selectFirst(select).ownText(); + } + + if (pattern != null) { + Matcher matcher = pattern.matcher(body); + if (matcher.find()) { + return matcher.group(); + } + } + return null; + } catch (IOException | NullPointerException ex) { + return null; + } + }, executor); + } catch (ConfigException ex) { + throw new IllegalArgumentException(String.format("Source '%s' does not exist or is not configured correctly", source)); + } catch (Exception ignored) { + return null; + } + } + private String cleanWithNewlines(Element element) { return Jsoup.clean(Jsoup.clean(element.html(), newlineSafelist), "", Safelist.none(), noPrettyPrint); diff --git a/src/main/resources/reference.conf b/src/main/resources/reference.conf index aef3322..fd961cf 100644 --- a/src/main/resources/reference.conf +++ b/src/main/resources/reference.conf @@ -6,9 +6,16 @@ lyrics A-Z Lyrics { + token + { + url = "https://www.azlyrics.com/geo.js" + text = true + 
select = "" + regex = """(?<=\"value\",\s\").*(?=\")""" + } search { - url = "https://search.azlyrics.com/search.php?q=%s" + url = "https://search.azlyrics.com/search.php?q=%s&x=%s" json = false select = "a[href*=/lyrics/]" } @@ -30,8 +37,8 @@ lyrics } parse { - title = "h1[class^=SongHeader__Title]" - author = "a[class*=SongHeader__Artist]" + title = "h1[class*=__Title] > span" + author = "a[class*=__Artist]" content = "div[class^=Lyrics__Container]" } }