From 305ba3c7c4bdcf6c12c3776848530da8d8ffa63b Mon Sep 17 00:00:00 2001 From: Pierce Thompson Date: Sun, 7 May 2023 18:15:04 -0400 Subject: [PATCH] Rewrite the lyric scraper The old lyric scraper would put each text element on its own line, meaning annotations would cause incorrect line breaks. The new parser only splits lines at line breaks. --- CHANGELOG.md | 7 +++--- src/lyrics.rs | 68 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21f5ea6..8a673de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,10 +8,11 @@ - Added a description meta tag to the home page. - Added logging for internal server errors. - Fixed font scaling on smaller devices. -- Fixed browsers not invalidating static assets between Intellectual versions. +- Fixed browsers not invalidating static assets between versions. - Fixed multiple panics from invalid requests/responses. -- Fixed the Intellectual logo being hard to see in light mode. -- Fixed the lyric parser sometimes returning empty lines. +- Fixed the logo being hard to see in light mode. +- Fixed the lyric parser sometimes creating empty lines. +- Fixed the lyric parser creating new lines where annotations start/end. - Changed default address to `0.0.0.0`. 
diff --git a/src/lyrics.rs b/src/lyrics.rs index 89576c4..52d13e7 100644 --- a/src/lyrics.rs +++ b/src/lyrics.rs @@ -2,7 +2,8 @@ use actix_web::{get, web, Responder, Result}; use askama::Template; use futures::future; use once_cell::sync::Lazy; -use scraper::{Html, Selector}; + +use scraper::{Html, Node, Selector}; use serde::Deserialize; use crate::genius::GeniusSong; @@ -78,38 +79,49 @@ fn get_song_id(document: &Html) -> crate::Result { } fn scrape_lyrics(document: &Html) -> Vec { - let text_iter = document.select(&LYRIC_SELECTOR).flat_map(|x| x.text()); - - let mut verses = Vec::with_capacity(text_iter.size_hint().0); + let mut verses = Vec::new(); + let mut current_verse: Option = None; + let mut new_line = false; - for text in text_iter { - if text.starts_with('[') && text.ends_with(']') { - verses.push(Verse { - title: text.to_string(), - lyrics: Vec::new(), - }); - continue; - } - let trimmed = text.trim(); - if trimmed.is_empty() { - continue; - } - if verses.is_empty() { - verses.push(Verse { - title: String::new(), - lyrics: Vec::new(), - }) - } - let idx = verses.len() - 1; - if let Some(verse) = verses.get_mut(idx) { - verse.lyrics.push(trimmed.to_owned()) + for child in document + .select(&LYRIC_SELECTOR) + .flat_map(|e| e.descendants()) + { + match child.value() { + Node::Element(e) if e.name() == "br" => { + new_line = true; + } + Node::Text(text) => { + let text: &str = text; + let is_title = text.starts_with('[') && text.ends_with(']'); + if is_title { + if let Some(curr) = current_verse { + verses.push(curr); + } + current_verse = Some(Verse { + title: text.to_string(), + lyrics: Vec::new(), + }); + } else if let Some(curr) = current_verse.as_mut() { + let last = curr.lyrics.last_mut(); + if new_line || last.is_none() { + curr.lyrics.push(text.to_owned()); + new_line = false; + } else if let Some(lyric) = last { + lyric.push_str(text); + } + } + } + _ => {} } } - if verses.is_empty() { + if let Some(curr) = current_verse { + verses.push(curr); + } 
else { verses.push(Verse { - title: "This song has no lyrics".to_owned(), - lyrics: Vec::new(), + title: String::new(), + lyrics: vec!["This song has no lyrics.".to_owned()], }) }