Skip to content

Commit

Permalink
Rewrite the lyric scraper
Browse files Browse the repository at this point in the history
The old lyric scraper would put each text element on its own line, meaning annotations would cause incorrect line breaks. The new parser only splits lines at line breaks.
  • Loading branch information
Insprill committed May 7, 2023
1 parent 574ddb8 commit 305ba3c
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 31 deletions.
7 changes: 4 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
- Added a description meta tag to the home page.
- Added logging for internal server errors.
- Fixed font scaling on smaller devices.
- Fixed browsers not invalidating static assets between Intellectual versions.
- Fixed browsers not invalidating static assets between versions.
- Fixed multiple panics from invalid requests/responses.
- Fixed the Intellectual logo being hard to see in light mode.
- Fixed the lyric parser sometimes returning empty lines.
- Fixed the logo being hard to see in light mode.
- Fixed the lyric parser sometimes creating empty lines.
- Fixed the lyric parser creating new lines where annotations start/end.
- Changed default address to `0.0.0.0`.


Expand Down
68 changes: 40 additions & 28 deletions src/lyrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ use actix_web::{get, web, Responder, Result};
use askama::Template;
use futures::future;
use once_cell::sync::Lazy;
use scraper::{Html, Selector};

use scraper::{Html, Node, Selector};
use serde::Deserialize;

use crate::genius::GeniusSong;
Expand Down Expand Up @@ -78,38 +79,49 @@ fn get_song_id(document: &Html) -> crate::Result<u32> {
}

fn scrape_lyrics(document: &Html) -> Vec<Verse> {
let text_iter = document.select(&LYRIC_SELECTOR).flat_map(|x| x.text());

let mut verses = Vec::with_capacity(text_iter.size_hint().0);
let mut verses = Vec::new();
let mut current_verse: Option<Verse> = None;
let mut new_line = false;

for text in text_iter {
if text.starts_with('[') && text.ends_with(']') {
verses.push(Verse {
title: text.to_string(),
lyrics: Vec::new(),
});
continue;
}
let trimmed = text.trim();
if trimmed.is_empty() {
continue;
}
if verses.is_empty() {
verses.push(Verse {
title: String::new(),
lyrics: Vec::new(),
})
}
let idx = verses.len() - 1;
if let Some(verse) = verses.get_mut(idx) {
verse.lyrics.push(trimmed.to_owned())
for child in document
.select(&LYRIC_SELECTOR)
.flat_map(|e| e.descendants())
{
match child.value() {
Node::Element(e) if e.name() == "br" => {
new_line = true;
}
Node::Text(text) => {
let text: &str = text;
let is_title = text.starts_with('[') && text.ends_with(']');
if is_title {
if let Some(curr) = current_verse {
verses.push(curr);
}
current_verse = Some(Verse {
title: text.to_string(),
lyrics: Vec::new(),
});
} else if let Some(curr) = current_verse.as_mut() {
let last = curr.lyrics.last_mut();
if new_line || last.is_none() {
curr.lyrics.push(text.to_owned());
new_line = false;
} else if let Some(lyric) = last {
lyric.push_str(text);
}
}
}
_ => {}
}
}

if verses.is_empty() {
if let Some(curr) = current_verse {
verses.push(curr);
} else {
verses.push(Verse {
title: "This song has no lyrics".to_owned(),
lyrics: Vec::new(),
title: String::new(),
lyrics: vec!["This song has no lyrics.".to_owned()],
})
}

Expand Down

0 comments on commit 305ba3c

Please sign in to comment.