From 5ffbfaa50b20b52b4e876d35f01affc689c3804a Mon Sep 17 00:00:00 2001 From: Whispersilk Date: Sat, 22 Oct 2022 11:38:50 -0400 Subject: [PATCH] Split client into its own file to fix HTTP 429. Get AO3 from single-page view. Allow adding multiple stories at once. Start fixing panics in parsers. --- Cargo.lock | 9 +- Cargo.toml | 1 + src/args.rs | 6 +- src/client.rs | 77 +++++++++++++ src/error.rs | 7 ++ src/main.rs | 49 +++++--- src/parser/ao3.rs | 242 ++++++++++++++++++-------------------- src/parser/katalepsis.rs | 27 ++--- src/parser/mod.rs | 72 ++++++------ src/parser/royalroad.rs | 27 ++--- src/sql.rs | 243 +++++++++++++++++++++------------------ src/structs.rs | 44 ++----- 12 files changed, 429 insertions(+), 375 deletions(-) create mode 100644 src/client.rs diff --git a/Cargo.lock b/Cargo.lock index c69bd09..7c1f336 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -353,6 +353,7 @@ dependencies = [ "reqwest", "rusqlite", "select", + "serde", "tokio", ] @@ -1462,18 +1463,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.143" +version = "1.0.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53e8e5d5b70924f74ff5c6d64d9a5acd91422117c60f48c4e07855238a254553" +checksum = "6df50b7a60a0ad48e1b42eb38373eac8ff785d619fb14db917b4e63d5439361f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.143" +version = "1.0.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3d8e8de557aee63c26b85b947f5e59b690d0454c753f3adeb5cd7835ab88391" +checksum = "a714fd32ba1d66047ce7d53dabd809e9922d538f9047de13cc4cffca47b36205" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index e6289e2..20db962 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,4 +18,5 @@ regex = "1.6.0" reqwest = { version = "0.11", features = ["cookies", "json"] } rusqlite = { version = "0.28.0", features = ["bundled-full"] } select = "0.5" +serde = "1.0.146" tokio = { version = "1", features = ["full"] } diff --git a/src/args.rs b/src/args.rs index f792c8d..d54d487 100644 --- a/src/args.rs +++ b/src/args.rs @@ -9,10 +9,10 @@ pub(crate) struct Args { #[derive(Debug, Subcommand)] pub(crate) enum Commands { - /// Add a story to the archive. + /// Add one or more stories to the archive. Add { - /// The URL of the story to add. - story: String, + /// The URLs of the story or stories to add. + stories: Vec, }, /// Check for updates to stories in the archive. diff --git a/src/client.rs b/src/client.rs new file mode 100644 index 0000000..305d5cd --- /dev/null +++ b/src/client.rs @@ -0,0 +1,77 @@ +use reqwest::{Client, Response, StatusCode}; +use serde::ser::Serialize; + +use std::time::Duration; + +use crate::error::ArchiveError; + +static CLIENT: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); + +pub async fn get(url: &str) -> Result { + let client: &Client = + CLIENT.get_or_init(|| Client::builder().cookie_store(true).build().unwrap()); + let mut response = client.get(url).send().await?; + loop { + match response.status() { + StatusCode::TOO_MANY_REQUESTS => { + let base_url = &url[url.find("://").unwrap() + 3..]; + let base_url = &base_url[0..base_url.find("/").unwrap_or(base_url.len())]; + let time_to_wait: String = response.headers().get("retry-after").map_or_else( + || "60".to_owned(), + |v| { + v.to_str() + .map(|ok| ok.to_owned()) + .unwrap_or("60".to_owned()) + }, + ); + let time_to_wait = u64::from_str_radix(&time_to_wait, 10).expect(&format!( + "retry-after header {} is not a number", + time_to_wait + )); + println!( + "Too many requests to {}. Sleeping for {} seconds.", + base_url, time_to_wait + ); + tokio::time::sleep(Duration::from_secs(time_to_wait)).await; + response = client.get(url).send().await?; + } + _ => break Ok(response), + } + } +} + +pub async fn get_with_query( + url: &str, + query: &T, +) -> Result { + let client: &Client = + CLIENT.get_or_init(|| Client::builder().cookie_store(true).build().unwrap()); + let mut response = client.get(url).query(query).send().await?; + loop { + match response.status() { + StatusCode::TOO_MANY_REQUESTS => { + let base_url = &url[url.find("://").unwrap() + 3..]; + let base_url = &base_url[0..base_url.find("/").unwrap_or(base_url.len())]; + let time_to_wait: String = response.headers().get("retry-after").map_or_else( + || "60".to_owned(), + |v| { + v.to_str() + .map(|ok| ok.to_owned()) + .unwrap_or("60".to_owned()) + }, + ); + let time_to_wait = u64::from_str_radix(&time_to_wait, 10).expect(&format!( + "retry-after header {} is not a number", + time_to_wait + )); + println!( + "Too many requests to {}. Sleeping for {} seconds.", + base_url, time_to_wait + ); + tokio::time::sleep(Duration::from_secs(time_to_wait)).await; + response = client.get(url).query(query).send().await?; + } + _ => break Ok(response), + } + } +} diff --git a/src/error.rs b/src/error.rs index d4cb28f..04e3a82 100644 --- a/src/error.rs +++ b/src/error.rs @@ -4,6 +4,8 @@ use std::{error::Error, fmt}; pub enum ArchiveError { Internal(String), BadSource(String), + NoIdInSource(String, String), + PageError(String), StoryNotExists(String), Io(std::io::Error), Request(reqwest::Error), @@ -16,6 +18,11 @@ impl fmt::Display for ArchiveError { match *self { Self::Internal(ref s) => write!(f, "Internal error: {}", s), Self::BadSource(ref s) => write!(f, "Could not convert URL {} to a story source", s), + Self::NoIdInSource(ref url, ref name) => write!( + f, + "Url {url} maps to source {name} and must contain a story ID, but does not" + ), + Self::PageError(ref s) => write!(f, "{}", s), Self::StoryNotExists(ref s) => write!( f, "Story {} does not exist in the archive. Try adding it first.", diff --git a/src/main.rs b/src/main.rs index 340f640..158adbc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ use self::error::ArchiveError; use self::structs::{Content, StorySource, SOURCES_LIST}; mod args; +mod client; mod error; mod parser; mod sql; @@ -19,7 +20,7 @@ async fn main() -> Result<(), ArchiveError> { let conn = Connection::open("/home/daniel/Documents/Code/fic_archive/test_db.db")?; match args.command { - Commands::Add { story } => add_story(story, &conn).await?, + Commands::Add { stories } => add_stories(stories, &conn).await?, Commands::Update { story, force_refresh, @@ -72,18 +73,40 @@ async fn main() -> Result<(), ArchiveError> { Ok(()) } -async fn add_story(story: String, conn: &Connection) -> Result<(), ArchiveError> { - let source = StorySource::from_url(story.as_str())?; - if sql::story_exists_with_id(conn, story.as_str())? { - println!("Story already exists in the archive. Updating..."); - update_archive(Some(story), false, conn).await +async fn add_stories(stories: Vec, conn: &Connection) -> Result<(), ArchiveError> { + let mut errors: Vec = Vec::new(); + for story in stories.iter() { + match StorySource::from_url(&story) { + Ok(source) => match add_story(source, conn).await { + Ok(_) => (), + Err(err) => errors.push(err), + }, + Err(err) => errors.push(err), + }; + } + errors.into_iter().next().map(|e| Err(e)).unwrap_or(Ok(())) +} + +async fn add_story(source: StorySource, conn: &Connection) -> Result<(), ArchiveError> { + let exists = sql::story_exists_with_id(conn, &source.to_id())?; + let url = source.to_url(); + if exists { + let new_chapters = update_story(source, false, conn).await?; + println!( + "Updated story at {} with {} new chapters.", + url, new_chapters + ); } else { - let parser = source.parser(); - let story = parser.get_story(source).await?; + let story = source.parser().get_story(source).await?; sql::save_story(conn, &story)?; - println!("Saved {} ({} chapters)", story.name, story.num_chapters()); - Ok(()) + println!( + "Added story {} ({} chapter{})", + story.name, + story.num_chapters(), + if story.num_chapters() == 1 { "" } else { "s" } + ); } + Ok(()) } async fn update_archive( @@ -110,8 +133,7 @@ async fn update_story( let existing_story = sql::get_story_by_id(conn, source.to_id().as_str())? .ok_or_else(|| ArchiveError::StoryNotExists(source.to_url()))?; let parser = source.parser(); - let client = parser.get_client(); - let new_skeleton = parser.get_skeleton(&client, source).await?; + let new_skeleton = parser.get_skeleton(source).await?; // Get a list of existing chapters and a list of fetched chapters, then filter to only fetched chapters that aren't saved. let mut existing_chapters: HashSet = @@ -133,7 +155,7 @@ async fn update_story( // If there are any new chapters, fetch the story and save them. let mut added_chapters = 0; if !new_chapters.is_empty() { - let new_story = parser.fill_skeleton(&client, new_skeleton).await?; + let new_story = parser.fill_skeleton(new_skeleton).await?; for chapter in new_chapters.into_iter() { match new_story.find_chapter(chapter) { Some(found) => { @@ -166,6 +188,7 @@ async fn delete_story(search: String, conn: &Connection) -> Result<(), ArchiveEr match matches.len() { 0 => println!("No matching stories found. Please try another search."), // 1 => sql::delete_story_by_id(matches[0])?, + 1 => println!("Got one story back! Id: {}", matches[0]), _ => todo!(), } Ok(()) diff --git a/src/parser/ao3.rs b/src/parser/ao3.rs index 54eeee8..f51ccec 100644 --- a/src/parser/ao3.rs +++ b/src/parser/ao3.rs @@ -3,16 +3,14 @@ use chrono::{ naive::NaiveDate, offset::{FixedOffset, Local, TimeZone}, }; -use futures::future::join; -use futures::future::join_all; use regex::Regex; -use reqwest::Client; use select::{ document::Document, predicate::{self, Predicate}, }; use crate::{ + client::get_with_query, error::ArchiveError, parser::Parser, structs::{Author, Chapter, ChapterText, Content, Story, StorySource}, @@ -25,83 +23,85 @@ pub(crate) struct AO3Parser; #[async_trait] impl Parser for AO3Parser { - fn get_client(&self) -> Client { - Client::builder().cookie_store(true).build().unwrap() - } - - async fn get_skeleton( - &self, - client: &Client, - source: StorySource, - ) -> Result { - let main_page = async { - Ok(client - .get(source.to_url()) - .query(&[("view_adult", "true")]) - .send() - .await? - .text() - .await?) - }; - let navigate = async { - Ok(client - .get(format!("{}/navigate", source.to_url())) - .query(&[("view_adult", "true")]) - .send() - .await? - .text() - .await?) - }; - let (main_result, nav_result) = join(main_page, navigate).await; - if let Err(e) = main_result { - return Err(e); - } else if let Err(e) = nav_result { - return Err(e); - } - let main_page = Document::from_read(main_result.unwrap().as_bytes())?; - let navigate = Document::from_read(nav_result.unwrap().as_bytes())?; + async fn get_skeleton(&self, source: StorySource) -> Result { + let main_page = get_with_query( + &source.to_url(), + &[("view_adult", "true"), ("view_full_work", "true")], + ) + .await? + .text() + .await?; + let navigate = get_with_query( + &format!("{}/navigate", source.to_url()), + &[("view_adult", "true")], + ) + .await? + .text() + .await?; + let main_page = Document::from_read(main_page.as_bytes())?; + let navigate = Document::from_read(navigate.as_bytes())?; let name = main_page .find(predicate::Class("title").and(predicate::Class("heading"))) .next() - .expect("Story did not have a title") + .ok_or(ArchiveError::PageError(format!( + "AO3: Could not find title (.title.heading) for story at {}", + source.to_url(), + )))? .text(); + let author = main_page - .find(predicate::Attr("rel", "author")) + .find(predicate::Attr("rel", "author").and(predicate::Attr("href", ()))) .next() - .expect("Story did not have author"); + .ok_or(ArchiveError::PageError(format!( + "AO3: Could not find author ([rel=\"author\"]) for {} at {}", + name, + source.to_url(), + )))?; let author_url = author .attr("href") - .expect("Author did not have link") - .replace("/users/", ""); - let mut author_url_split = author_url.splitn(2, "/pseuds/"); - let (base_author, pseud) = (author_url_split.next(), author_url_split.next()); + .expect("Author link should have href because of find() conditions"); let author = Author { name: author.text(), id: format!( - "ao3:{}:{}", - base_author.expect("Could not find author"), - pseud.unwrap_or("") + "ao3{}", + author_url + .replace("/users/", "") + .splitn(2, "/pseuds/") + .fold(String::new(), |mut acc, s| { + acc.push(':'); + acc.push_str(s); + acc + }), ), }; + let description = main_page .find(predicate::Class("summary").child(predicate::Class("userstuff"))) .next() .map(|n| n.children().map(|elem| elem.inner_html()).collect()); let url = source.to_url(); let tags = get_tags(&main_page); - let chapters = navigate - .find(predicate::Class("chapter").and(predicate::Class("index"))) - .next() - .expect("Navigation page must have chapter index") - .children() - .filter(|node| node.is(predicate::Name("li"))) - .map(|li| { - let chap = li + + let chapters = main_page + .find(predicate::Attr("id", "chapters").child(predicate::Class("chapter"))) + .map(|chapter| { + let title_h3 = chapter + .descendants() + .find(|n| n.is(predicate::Class("title"))) + .expect("Chapter should have title."); + let href = title_h3 .children() - .find(|c| c.is(predicate::Name("a"))) - .expect("Chapter should have "); - let href = chap.attr("href").expect("Chapterlink should have link"); + .find_map(|n| n.attr("href")) + .expect("Chapter should have link."); + let name = title_h3.text(); + let mut name_pieces = name.splitn(2, ":"); + let (chapter_num, chapter_name) = (name_pieces.next(), name_pieces.next()); + let name = chapter_name + .or(chapter_num) + .expect("Chapter should have a name or number") + .trim() + .to_owned(); let chap_id = CHAPTER_REGEX .1 .get_or_init(|| Regex::new(CHAPTER_REGEX.0).unwrap()) @@ -111,27 +111,61 @@ impl Parser for AO3Parser { .expect("Chapter url must contain id") .as_str(); - let posted_on = li + let posted_on = navigate + .find(predicate::Attr("href", href)) + .next() + .expect("Navigation page should have a link with this chapter's URL") + .parent() + .unwrap() .children() - .find(|c| c.is(predicate::Name("span"))) - .expect("Chapter should have date posted") - .text(); + .find_map(|c| { + if c.is(predicate::Class("datetime")) { + Some(c.text()) + } else { + None + } + }) + .expect("Navigation page should have a datetime span for this chapter"); let posted_on = posted_on.trim(); let timezone = FixedOffset::west(Local::now().offset().utc_minus_local()); + let date_posted = timezone + .from_local_datetime( + &NaiveDate::parse_from_str(&posted_on[1..posted_on.len() - 1], "%F") + .expect("Could not parse datestring to date") + .and_hms(3, 0, 0), + ) + .earliest() + .expect("Could not turn naive to full date"); + + let top_notes = chapter + .children() + .find(|c| c.is(predicate::Attr("id", "notes"))); + let bottom_notes = chapter + .children() + .find(|c| c.is(predicate::Class("end").and(predicate::Class("notes")))); + let chapter_text = chapter.children().find(|c| { + c.is(predicate::Class("userstuff").and(predicate::Attr("role", "article"))) + }); + + let chapter_text = format!( + "{}{}{}", + top_notes.map(|n| n.inner_html()).unwrap_or_default(), + chapter_text + .expect("Chapter has no text area") + .children() + .filter(|node| !node.is(predicate::Attr("id", "work"))) + .map(|node| node.html()) + .collect::(), + bottom_notes.map(|n| n.inner_html()).unwrap_or_default() + ); + Content::Chapter(Chapter { id: format!("{}:{}", source.to_id(), chap_id), - name: chap.text(), + name, description: None, - text: ChapterText::Dehydrated, + text: ChapterText::Hydrated(chapter_text), url: format!("https://archiveofourown.org{}", href), - date_posted: timezone - .from_local_datetime( - &NaiveDate::parse_from_str(&posted_on[1..posted_on.len() - 1], "%F") - .expect("Could not parse datestring to date") - .and_hms(3, 0, 0), - ) - .earliest() - .expect("Could not turn naive to full date"), + date_posted, }) }) .collect(); @@ -147,72 +181,16 @@ impl Parser for AO3Parser { }) } - async fn fill_skeleton( - &self, - client: &Client, - mut skeleton: Story, - ) -> Result { - let hydrate = skeleton - .chapters - .iter_mut() - .filter_map(|con| match con { - Content::Section(_) => None, - Content::Chapter(c) => Some(c), - }) - .map(|chapter| async { - let page = client.get(&chapter.url).send().await?.text().await?; - Ok((chapter, page)) - }); - - let results = join_all(hydrate).await; - if results - .iter() - .any(|res: &Result<(_, _), ArchiveError>| res.is_err()) - { - return Err(ArchiveError::Internal("Oopsie!".to_owned())); - } - - let mut results: Vec<(&mut Chapter, String)> = - results.into_iter().map(|r| r.unwrap()).collect(); - rayon::scope(|s| { - for (chapter, page) in results.iter_mut() { - s.spawn(|_| { - let document = Document::from_read(page.as_bytes()) - .expect("Couldn't read page to a document"); - let top_notes = document.find(predicate::Attr("id", "notes")).next(); - let bottom_notes = document - .find(predicate::Class("end").and(predicate::Class("notes"))) - .next(); - let chapter_text = document - .find(predicate::Class("userstuff").and(predicate::Attr("role", "article"))) - .next(); - - let chapter_text = format!( - "{}{}{}", - top_notes.map(|n| n.inner_html()).unwrap_or_default(), - chapter_text - .expect("Chapter has no text area") - .children() - .filter(|node| !node.is(predicate::Attr("id", "work"))) - .map(|node| node.html()) - .collect::(), - bottom_notes.map(|n| n.inner_html()).unwrap_or_default() - ); - - chapter.text = ChapterText::Hydrated(chapter_text); - }); - } - }); + async fn fill_skeleton(&self, skeleton: Story) -> Result { Ok(skeleton) } async fn get_story(&self, source: StorySource) -> Result { - let client = self.get_client(); - let story = self.get_skeleton(&client, source).await?; - self.fill_skeleton(&client, story).await + self.get_skeleton(source).await } } +/// TODO Support series listings and collections at some point? fn get_tags(document: &Document) -> Vec { document .find( diff --git a/src/parser/katalepsis.rs b/src/parser/katalepsis.rs index 8133424..d161d7e 100644 --- a/src/parser/katalepsis.rs +++ b/src/parser/katalepsis.rs @@ -1,7 +1,6 @@ use async_trait::async_trait; use chrono::{DateTime, FixedOffset, TimeZone}; use futures::future::join_all; -use reqwest::Client; use select::{ document::Document, node::Data::Text, @@ -12,6 +11,7 @@ use select::{ use std::iter; use crate::{ + client::get, error::ArchiveError, parser::Parser, structs::{Author, Chapter, ChapterText, Content, Section, Story, StorySource}, @@ -21,16 +21,8 @@ pub(crate) struct KatalepsisParser; #[async_trait] impl Parser for KatalepsisParser { - fn get_client(&self) -> Client { - Client::new() - } - - async fn get_skeleton( - &self, - client: &Client, - source: StorySource, - ) -> Result { - let main_page = client.get(source.to_url()).send().await?.text().await?; + async fn get_skeleton(&self, source: StorySource) -> Result { + let main_page = get(&source.to_url()).await?.text().await?; let main_page = Document::from_read(main_page.as_bytes())?; let name = "Katalepsis".to_owned(); @@ -125,11 +117,7 @@ impl Parser for KatalepsisParser { }) } - async fn fill_skeleton( - &self, - client: &Client, - mut skeleton: Story, - ) -> Result { + async fn fill_skeleton(&self, mut skeleton: Story) -> Result { let mut chapters: Vec<&mut Chapter> = Vec::with_capacity(skeleton.num_chapters()); for content in skeleton.chapters.iter_mut() { match content { @@ -139,7 +127,7 @@ impl Parser for KatalepsisParser { } let hydrate = chapters.into_iter().map(|chap| async { - let page = client.get(&chap.url).send().await?.text().await?; + let page = get(&chap.url).await?.text().await?; let document = Document::from_read(page.as_bytes())?; let mut cw_empty_owner; @@ -244,9 +232,8 @@ impl Parser for KatalepsisParser { } async fn get_story(&self, source: StorySource) -> Result { - let client = self.get_client(); - let story = self.get_skeleton(&client, source).await?; - self.fill_skeleton(&client, story).await + let story = self.get_skeleton(source).await?; + self.fill_skeleton(story).await } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 5e10ac7..0dada4c 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,11 +1,10 @@ use async_trait::async_trait; -use html2md::parse_html; -use pandoc::{InputFormat, InputKind, OutputFormat, OutputKind, PandocOutput}; -use reqwest::Client; +// use html2md::parse_html; +// use pandoc::{InputFormat, InputKind, OutputFormat, OutputKind, PandocOutput}; use crate::{ error::ArchiveError, - structs::{Story, StorySource, TextFormat}, + structs::{Story, StorySource}, }; pub mod ao3; @@ -14,42 +13,37 @@ pub mod royalroad; #[async_trait] pub trait Parser { - fn get_client(&self) -> Client; - async fn get_skeleton( - &self, - client: &Client, - source: StorySource, - ) -> Result; - async fn fill_skeleton(&self, client: &Client, skeleton: Story) -> Result; + async fn get_skeleton(&self, source: StorySource) -> Result; + async fn fill_skeleton(&self, skeleton: Story) -> Result; async fn get_story(&self, source: StorySource) -> Result; } -fn convert_to_format(html: String, format: TextFormat) -> String { - custom_convert_to_format(html, format, None) -} +// fn convert_to_format(html: String, format: TextFormat) -> String { +// custom_convert_to_format(html, format, None) +// } -fn custom_convert_to_format( - html: String, - format: TextFormat, - custom_behavior: Option String>>, -) -> String { - let initial_text = match format { - TextFormat::Html => html, - TextFormat::Markdown => { - let mut pandoc = pandoc::new(); - pandoc - .set_input_format(InputFormat::Html, Vec::new()) - .set_output_format(OutputFormat::MarkdownStrict, Vec::new()) - .set_input(InputKind::Pipe(html.clone())) - .set_output(OutputKind::Pipe); - match pandoc.execute() { - Ok(PandocOutput::ToBuffer(text)) => text, - _ => parse_html(html.as_str()), - } - } - }; - match custom_behavior { - Some(f) => f(initial_text, format), - None => initial_text, - } -} +// fn custom_convert_to_format( +// html: String, +// format: TextFormat, +// custom_behavior: Option String>>, +// ) -> String { +// let initial_text = match format { +// TextFormat::Html => html, +// TextFormat::Markdown => { +// let mut pandoc = pandoc::new(); +// pandoc +// .set_input_format(InputFormat::Html, Vec::new()) +// .set_output_format(OutputFormat::MarkdownStrict, Vec::new()) +// .set_input(InputKind::Pipe(html.clone())) +// .set_output(OutputKind::Pipe); +// match pandoc.execute() { +// Ok(PandocOutput::ToBuffer(text)) => text, +// _ => parse_html(html.as_str()), +// } +// } +// }; +// match custom_behavior { +// Some(f) => f(initial_text, format), +// None => initial_text, +// } +// } diff --git a/src/parser/royalroad.rs b/src/parser/royalroad.rs index ae167f2..749ce3f 100644 --- a/src/parser/royalroad.rs +++ b/src/parser/royalroad.rs @@ -2,10 +2,10 @@ use async_trait::async_trait; use chrono::DateTime; use futures::future::join_all; use regex::Regex; -use reqwest::Client; use select::{document::Document, predicate, predicate::Predicate}; use crate::{ + client::get, error::ArchiveError, parser::Parser, structs::{Author, Chapter, ChapterText, Content, Story, StorySource}, @@ -18,16 +18,8 @@ pub(crate) struct RoyalRoadParser; #[async_trait] impl Parser for RoyalRoadParser { - fn get_client(&self) -> Client { - Client::new() - } - - async fn get_skeleton( - &self, - client: &Client, - source: StorySource, - ) -> Result { - let main_page = client.get(&source.to_url()).send().await?.text().await?; + async fn get_skeleton(&self, source: StorySource) -> Result { + let main_page = get(&source.to_url()).await?.text().await?; let main_page = Document::from_read(main_page.as_bytes())?; let chapters = main_page .find( @@ -150,11 +142,7 @@ impl Parser for RoyalRoadParser { }) } - async fn fill_skeleton( - &self, - client: &Client, - mut skeleton: Story, - ) -> Result { + async fn fill_skeleton(&self, mut skeleton: Story) -> Result { let hydrate = skeleton .chapters .iter_mut() @@ -163,7 +151,7 @@ impl Parser for RoyalRoadParser { Content::Chapter(c) => Some(c), }) .map(|chapter| async { - let page = client.get(&chapter.url).send().await?.text().await?; + let page = get(&chapter.url).await?.text().await?; Ok((chapter, page)) }); @@ -194,8 +182,7 @@ impl Parser for RoyalRoadParser { } async fn get_story(&self, source: StorySource) -> Result { - let client = self.get_client(); - let story = self.get_skeleton(&client, source).await?; - self.fill_skeleton(&client, story).await + let story = self.get_skeleton(source).await?; + self.fill_skeleton(story).await } } diff --git a/src/sql.rs b/src/sql.rs index f5b479b..710f782 100644 --- a/src/sql.rs +++ b/src/sql.rs @@ -1,6 +1,6 @@ use chrono::DateTime; use rayon::prelude::ParallelSliceMut; -use rusqlite::{Connection, Error, Result}; +use rusqlite::{types::Type, Connection, Error, Result, Row}; use std::ops::{Deref, DerefMut}; use std::sync::Mutex; @@ -24,28 +24,33 @@ pub fn create_tables(conn: &Connection) -> Result<(), Error> { name TEXT NOT NULL )", (), - )?; + ) + .unwrap(); conn.execute( "CREATE TABLE IF NOT EXISTS stories ( id TEXT NOT NULL PRIMARY KEY, name TEXT NOT NULL, description TEXT, url TEXT NOT NULL, - author_id TEXT NOT NULL REFERENCES authors(id) + author_id TEXT NOT NULL, + FOREIGN KEY (author_id) REFERENCES authors(id) )", (), - )?; + ) + .unwrap(); conn.execute( "CREATE TABLE IF NOT EXISTS sections ( id TEXT PRIMARY KEY, name TEXT NOT NULL, description TEXT, url TEXT, - story_id TEXT NOT NULL REFERENCES stories(id), - parent_id TEXT + story_id TEXT NOT NULL, + parent_id TEXT, + FOREIGN KEY (story_id) REFERENCES stories(id) )", (), - )?; + ) + .unwrap(); conn.execute( "CREATE TABLE IF NOT EXISTS chapters ( id TEXT PRIMARY KEY, @@ -54,35 +59,43 @@ pub fn create_tables(conn: &Connection) -> Result<(), Error> { text TEXT NOT NULL, url TEXT NOT NULL, date_posted TEXT NOT NULL, - story_id TEXT NOT NULL REFERENCES stories(id), - section_id TEXT REFERENCES sections(id) + story_id TEXT NOT NULL, + section_id TEXT, + FOREIGN KEY (story_id) REFERENCES stories(id), + FOREIGN KEY (section_id) REFERENCES sections(id) )", (), - )?; + ) + .unwrap(); conn.execute( "CREATE TABLE IF NOT EXISTS tags ( id TEXT PRIMARY KEY, name TEXT NOT NULL )", (), - )?; + ) + .unwrap(); conn.execute( "CREATE TABLE IF NOT EXISTS tag_uses ( - tag_id TEXT NOT NULL REFERENCES tags(id), - story_id TEXT NOT NULL REFERENCES stories(id) + tag_id TEXT NOT NULL, + story_id TEXT NOT NULL, + FOREIGN KEY (tag_id) REFERENCES tags(id), + FOREIGN KEY (story_id) REFERENCES stories(id) )", (), - )?; + ) + .unwrap(); *lock.deref_mut() = true; } Ok(()) } pub fn get_all_stories(conn: &Connection) -> Result, ArchiveError> { - create_tables(conn)?; + create_tables(conn).unwrap(); let mut failed_stories = 0; - let mut stmt = conn.prepare( - "SELECT + let mut stmt = conn + .prepare( + "SELECT stories.name, authors.name, COUNT(chapters.id) AS chapter_count @@ -90,15 +103,17 @@ pub fn get_all_stories(conn: &Connection) -> Result, ArchiveErr INNER JOIN authors ON stories.author_id = authors.id INNER JOIN chapters ON stories.id = chapters.story_id GROUP BY stories.id", - )?; + ) + .unwrap(); let stories: Vec = stmt .query_map([], |row| { Ok(ListedStory { - name: row.get(0)?, - author: row.get(1)?, - chapter_count: row.get(2)?, + name: row.get(0).unwrap(), + author: row.get(1).unwrap(), + chapter_count: row.get(2).unwrap(), }) - })? + }) + .unwrap() .filter_map(|listed| match listed { Ok(story) => Some(story), Err(_) => { @@ -116,8 +131,10 @@ pub fn get_all_stories(conn: &Connection) -> Result, ArchiveErr } pub fn story_exists_with_id(conn: &Connection, id: &str) -> Result { - create_tables(conn)?; - let mut stmt = conn.prepare("SELECT COUNT(1) FROM stories WHERE id = :id")?; + create_tables(conn).unwrap(); + let mut stmt = conn + .prepare("SELECT COUNT(*) FROM stories WHERE id = :id") + .unwrap(); let story_exists = stmt .query_row(&[(":id", id)], |row| match row.get(0) { Ok(0) => Ok(None), @@ -129,91 +146,85 @@ pub fn story_exists_with_id(conn: &Connection, id: &str) -> Result Result, ArchiveError> { - create_tables(conn)?; - let mut stmt = conn.prepare( - "SELECT stories.id + create_tables(conn).unwrap(); + let mut stmt = conn + .prepare( + "SELECT stories.id FROM stories INNER JOIN authors ON stories.author_id = authors.id WHERE stories.name LIKE %:search% OR stories.id = :search OR authors.name LIKE %:search%", - )?; + ) + .unwrap(); let matches = stmt - .query_map(&[(":search", search)], |row| Ok(row.get(0)?))? + .query_map(&[(":search", search)], |row| Ok(row.get(0).unwrap())) + .unwrap() .filter_map(|id| id.ok()) .collect(); Ok(matches) } pub fn get_story_by_id(conn: &Connection, id: &str) -> Result, ArchiveError> { - if !story_exists_with_id(conn, id)? { + if !story_exists_with_id(conn, id).unwrap() { Ok(None) } else { let mut stmt = conn.prepare( "SELECT id, name, description, url, parent_id FROM sections WHERE story_id = :story_id", - )?; + ).unwrap(); let mut sections: Vec<(Option, Section)> = stmt .query_map(&[(":story_id", id)], |row| { - let section_id = row.get::(4)?; - let section_id = match section_id.as_str() { - "NULL" => None, - _ => Some(section_id), - }; Ok(( - section_id, + // ID of parent section, if one exists + match is_null(row, 4) { + true => None, + false => Some(row.get(4)?), + }, Section { - id: row.get(0)?, - name: row.get(1)?, - description: { - let desc: String = row.get(2)?; - match desc.as_str() { - "NULL" => None, - _ => Some(desc), - } + id: row.get(0).unwrap(), + name: row.get(1).unwrap(), + description: match is_null(row, 2) { + true => None, + false => Some(row.get(2)?), }, chapters: Vec::new(), - url: { - let url: String = row.get(2)?; - match url.as_str() { - "NULL" => None, - _ => Some(url), - } + url: match is_null(row, 3) { + true => None, + false => Some(row.get(3)?), }, }, )) - })? + }) + .unwrap() .map(|sec| sec.unwrap()) .collect(); - stmt = conn.prepare( - "SELECT id, name, description, text, url, date_posted, section_id + stmt = conn + .prepare( + "SELECT id, name, description, text, url, date_posted, section_id FROM chapters WHERE story_id = :story_id", - )?; + ) + .unwrap(); let mut chapters: Vec<(Option, Chapter)> = stmt .query_map(&[(":story_id", id)], |row| { - let section_id = row.get::(6)?; - let section_id = match section_id.as_str() { - "NULL" => None, - _ => Some(section_id), - }; - Ok(( - section_id, + // ID of parent section, if one exists + match is_null(row, 6) { + true => None, + false => Some(row.get(6)?), + }, Chapter { - id: row.get(0)?, - name: row.get(1)?, - description: { - let desc: String = row.get(2)?; - match desc.as_str() { - "NULL" => None, - _ => Some(desc), - } + id: row.get(0).unwrap(), + name: row.get(1).unwrap(), + description: match is_null(row, 2) { + true => None, + false => Some(row.get(2)?), }, - text: ChapterText::Hydrated(row.get(3)?), - url: row.get(4)?, + text: ChapterText::Hydrated(row.get(3).unwrap()), + url: row.get(4).unwrap(), date_posted: DateTime::parse_from_rfc3339( - row.get::(5)?.as_str(), + row.get::(5).unwrap().as_str(), ) .unwrap_or_else(|_| { panic!( @@ -223,7 +234,8 @@ pub fn get_story_by_id(conn: &Connection, id: &str) -> Result, Arc }), }, )) - })? + }) + .unwrap() .map(|chap| chap.unwrap()) .collect(); @@ -271,14 +283,17 @@ pub fn get_story_by_id(conn: &Connection, id: &str) -> Result, Arc .collect(); story_chapters.par_sort_unstable_by(|a, b| a.id().cmp(b.id())); - stmt = conn.prepare( - "SELECT tags.name + stmt = conn + .prepare( + "SELECT tags.name FROM tag_uses INNER JOIN tags ON tags.id = tag_uses.tag_id WHERE tag_uses.story_id = :story_id", - )?; + ) + .unwrap(); let story_tags: Vec = stmt - .query_map(&[(":story_id", id)], |row| row.get::(0))? + .query_map(&[(":story_id", id)], |row| row.get::(0)) + .unwrap() .filter(|res| res.is_ok()) .map(|res| res.unwrap()) .collect(); @@ -288,26 +303,27 @@ pub fn get_story_by_id(conn: &Connection, id: &str) -> Result, Arc FROM stories INNER JOIN authors ON stories.author_id = authors.id WHERE stories.id = :story_id", - )?; + ).unwrap(); let mut story = stmt .query_row(&[(":story_id", id)], |row| { - let source = - StorySource::from_url(row.get::(2)?.as_str()).map_err(|e| { + let source = StorySource::from_url(row.get::(2).unwrap().as_str()) + .map_err(|e| { rusqlite::Error::FromSqlConversionFailure( 2, rusqlite::types::Type::Text, Box::new(e), ) - })?; + }) + .unwrap(); Ok(( - row.get::(2)?, + row.get::(2).unwrap(), Story { - name: row.get(0)?, - description: row.get(1)?, - url: row.get(2)?, + name: row.get(0).unwrap(), + description: row.get(1).unwrap(), + url: row.get(2).unwrap(), author: Author { - id: row.get(3)?, - name: row.get(4)?, + id: row.get(3).unwrap(), + name: row.get(4).unwrap(), }, chapters: story_chapters, tags: story_tags, @@ -315,41 +331,46 @@ pub fn get_story_by_id(conn: &Connection, id: &str) -> Result, Arc }, )) }) - .map_err(ArchiveError::from)?; - story.1.source = StorySource::from_url(story.0.as_str())?; + .map_err(ArchiveError::from) + .unwrap(); + story.1.source = StorySource::from_url(story.0.as_str()).unwrap(); Ok(Some(story.1)) } } pub fn save_story(conn: &Connection, story: &Story) -> Result<(), ArchiveError> { - create_tables(conn)?; + create_tables(conn).unwrap(); conn.execute( "INSERT OR IGNORE INTO authors (id, name) VALUES (?1, ?2)", (&story.author.id, &story.author.name), - )?; + ) + .unwrap(); conn.execute( "INSERT INTO stories (id, name, description, url, author_id) VALUES (?1, ?2, ?3, ?4, ?5)", ( &story.source.to_id(), &story.name, - some_or_null(&story.description), + &story.description, &story.url, &story.author.id, ), - )?; + ) + .unwrap(); for content in story.chapters.iter().as_ref() { - save_content(conn, content, &story.source.to_id(), None)?; + save_content(conn, content, &story.source.to_id(), None).unwrap(); } for tag in story.tags.iter().as_ref() { let tag_id = tag.to_lowercase(); conn.execute( "INSERT OR IGNORE INTO tags (id, name) VALUES (?1, ?2)", (&tag_id, &tag), - )?; + ) + .unwrap(); conn.execute( "INSERT OR IGNORE INTO tag_uses (tag_id, story_id) VALUES (?1, ?2)", (&tag_id, &story.source.to_id()), - )?; + ) + .unwrap(); } Ok(()) } @@ -372,14 +393,14 @@ pub fn save_content( ( id, name, - some_or_null(description), - some_or_null(url), + description, + url, story_id, - if parent_id.is_none() { "NULL" } else { parent_id.unwrap() } + parent_id ) - )?; + ).unwrap(); for inner in chapters.iter() { - save_content(conn, inner, story_id, Some(id))?; + save_content(conn, inner, story_id, Some(id)).unwrap(); } } Content::Chapter(Chapter { @@ -394,24 +415,22 @@ pub fn save_content( ( id, name, - some_or_null(description), + description, text.as_str(), url, &date_posted.to_rfc3339(), story_id, - if parent_id.is_none() { "NULL" } else { parent_id.unwrap() } + parent_id ) - )?; + ).expect(format!("Failed to add chapter with values\nid: {}\nname: {}\nurl: {}\ndate_posted: {}\nstory_id: {}\nsection_id {}", id, name, url, date_posted.to_rfc3339(), story_id, "NULL").as_str()); } } Ok(()) } -#[inline] -fn some_or_null(optstr: &Option) -> &str { - if optstr.is_none() { - "NULL" - } else { - optstr.as_ref().unwrap() - } +fn is_null(row: &Row, column: usize) -> bool { + matches!( + row.get::(column), + Err(Error::InvalidColumnType(_, _, Type::Null)) + ) } diff --git a/src/structs.rs b/src/structs.rs index ab1d707..ccd4a05 100644 --- a/src/structs.rs +++ b/src/structs.rs @@ -1,6 +1,6 @@ use chrono::{DateTime, FixedOffset}; use once_cell::sync::OnceCell; -use regex::{Match, Regex}; +use regex::Regex; use crate::error::ArchiveError; use crate::parser::{ @@ -152,13 +152,6 @@ impl Author { } } -#[derive(Debug, Clone, Copy)] -#[allow(dead_code)] -pub enum TextFormat { - Html, - Markdown, -} - #[derive(Debug, Clone)] pub enum StorySource { AO3(String), @@ -193,17 +186,21 @@ impl StorySource { .map(|(src, reg_src)| (src, Regex::new(reg_src).unwrap())) .collect() }); - let maybe_match = regex_map.iter().find(|(_, regex)| regex.is_match(url)); - match maybe_match { + match regex_map.iter().find(|(_, regex)| regex.is_match(url)) { Some((name, regex)) => { let id = regex.captures(url).unwrap().name("id"); - let maybe_error = &format!( - "Url {url} maps to source {name} and must contain a story ID, but does not" - ); Ok(match *name { - "ao3" => Self::AO3(require_story_source_id(id, maybe_error)), + "ao3" => Self::AO3( + id.ok_or(ArchiveError::NoIdInSource(url.to_owned(), name.to_string()))? + .as_str() + .to_owned(), + ), "katalepsis" => Self::Katalepsis, - "rr" => Self::RoyalRoad(require_story_source_id(id, maybe_error)), + "rr" => Self::RoyalRoad( + id.ok_or(ArchiveError::NoIdInSource(url.to_owned(), name.to_string()))? + .as_str() + .to_owned(), + ), _ => panic!("URL matched source {name}, which has not been fully implemented"), }) } @@ -237,20 +234,3 @@ impl StorySource { } } } - -fn require_story_source_id(id_match: Option, errormsg: &str) -> String { - id_match.expect(errormsg).as_str().to_owned() -} - -#[derive(Clone, Debug)] -pub struct StoryBase { - pub title: String, - pub author: Author, - pub chapter_links: Vec, -} - -#[derive(Clone, Debug)] -pub struct ChapterLink { - pub url: String, - pub title: String, -}