From ef8ceaf491f5576a3f0a030609708bed944de2d3 Mon Sep 17 00:00:00 2001
From: Lukas <50193931+lmuenter@users.noreply.github.com>
Date: Wed, 25 Sep 2024 20:53:44 +0200
Subject: [PATCH] Fix tag list (#5)

* refactor and add metadata parsing method

* fix failing tests

---------

Co-authored-by: lukas
---

Notes (review):
    * QuartoReader moves from quarto.py into the new readers.py; quarto.py
      now imports it from .readers.
    * parse_date and the explicit Category/Author wrapping are removed.
      Every front-matter key is lower-cased and passed through
      self.process_metadata by the new _parse_metadata helper. The test
      fixtures in this patch switch to quoted date/category strings and
      add a tags list.
    * read() still calls yaml.load(..., Loader=yaml.FullLoader) on the
      front matter, as the removed code did. NOTE(review): consider
      yaml.safe_load if content files may come from untrusted authors.
    * read() passes maxsplit and flags to re.split positionally
      (2, re.MULTILINE). Python 3.13 deprecates positional use of these
      arguments; a follow-up should spell them maxsplit=/flags=.
    * generate_article_summary receives article_content, which read() has
      already rendered with markdown.markdown, and renders it again after
      stripping fenced code with a regex. Carried over unchanged.
    * QUARTO_EXTENSION = "qmd" is now defined in both quarto.py and
      readers.py.
    * self._source_path is assigned in __init__ but not read anywhere in
      this patch.

 pelican/plugins/lm_pelican_quarto/quarto.py  | 87 +------------------
 pelican/plugins/lm_pelican_quarto/readers.py | 77 ++++++++++++++++
 .../lm_pelican_quarto/tests/test_quarto.py   | 12 +--
 3 files changed, 86 insertions(+), 90 deletions(-)
 create mode 100644 pelican/plugins/lm_pelican_quarto/readers.py

diff --git a/pelican/plugins/lm_pelican_quarto/quarto.py b/pelican/plugins/lm_pelican_quarto/quarto.py
index 7eadbf0..f13a3a6 100644
--- a/pelican/plugins/lm_pelican_quarto/quarto.py
+++ b/pelican/plugins/lm_pelican_quarto/quarto.py
@@ -1,102 +1,19 @@
-from datetime import date, datetime
 import logging
 from pathlib import Path
-import re
 
 from bs4 import BeautifulSoup
-import markdown
-import pytz
-import yaml
 
-from pelican import readers, signals
-from pelican.contents import Author, Category
+from pelican import signals
 from pelican.generators import ArticlesGenerator
 
 from .adapters import Quarto
 from .parsers import QuartoHTML
+from .readers import QuartoReader
 
 logger = logging.getLogger(__name__)
 QUARTO_EXTENSION = "qmd"
 
 
-class QuartoReader(readers.BaseReader):
-    """Read QMD Files using a Pelican Reader."""
-
-    file_extensions = [QUARTO_EXTENSION]
-
-    def read(self, filename):
-        """Read QMD Files."""
-        with open(filename, encoding="utf-8") as file:
-            content = file.read()
-
-        # extract yaml header and content body
-        _, front_matter, markdown_body = re.split(r"^---\s*$", content, 2, re.MULTILINE)
-
-        metadata = yaml.load(front_matter, Loader=yaml.FullLoader)
-
-        # ensure correct datetime format for date
-        metadata["date"] = self.parse_date(metadata["date"])
-
-        if "category" in metadata:
-            metadata["category"] = Category(
-                metadata["category"], settings=self.settings
-            )
-        if "author" in metadata:
-            metadata["author"] = Author(metadata["author"], settings=self.settings)
-
-        article_content = markdown.markdown(markdown_body)
-        metadata["summary"] = self.generate_article_summary(
-            metadata.get("summary"), article_content
-        )
-        return article_content, metadata
-
-    def parse_date(self, date_input):
-        """Ensure date has timezone information."""
-        if isinstance(date_input, datetime):
-            return (
-                date_input if date_input.tzinfo else date_input.replace(tzinfo=pytz.UTC)
-            )
-        if isinstance(date_input, date):
-            return datetime(
-                year=date_input.year,
-                month=date_input.month,
-                day=date_input.day,
-                tzinfo=pytz.UTC,
-            )
-        if isinstance(date_input, str):
-            return datetime.strptime(date_input, "%Y-%m-%d").replace(tzinfo=pytz.UTC)
-        logger.error("Invalid date format or type")
-        return None
-
-    def generate_article_summary(self, existing_summary, content):
-        """Generate a summary if one does not exist."""
-        if existing_summary:
-            return existing_summary
-
-        # strip code blocks
-        content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
-        html_content = markdown.markdown(content_no_code)
-
-        soup = BeautifulSoup(html_content, "html.parser")
-
-        max_paragraphs = self.settings.get("SUMMARY_MAX_PARAGRAPHS", None)
-        if max_paragraphs is not None:
-            paragraphs = soup.find_all("p")[:max_paragraphs]
-            html_content = "".join(str(p) for p in paragraphs)
-            soup = BeautifulSoup(html_content, "html.parser")
-
-        max_length = self.settings.get("SUMMARY_MAX_LENGTH", None)
-        end_suffix = self.settings.get("SUMMARY_END_SUFFIX", "...")
-
-        text_content = soup.get_text()
-        if max_length is not None:
-            words = text_content.split()
-            if len(words) > max_length:
-                text_content = " ".join(words[:max_length]) + end_suffix
-
-        return text_content
-
-
 def setup_quarto_project(pelican_instance):
     """Set up the Quarto project if a .qmd file is found."""
     content_path = Path(pelican_instance.settings["PATH"])
diff --git a/pelican/plugins/lm_pelican_quarto/readers.py b/pelican/plugins/lm_pelican_quarto/readers.py
new file mode 100644
index 0000000..498f62d
--- /dev/null
+++ b/pelican/plugins/lm_pelican_quarto/readers.py
@@ -0,0 +1,77 @@
+import logging
+import re
+
+from bs4 import BeautifulSoup
+import markdown
+import yaml
+
+from pelican import readers
+
+logger = logging.getLogger(__name__)
+QUARTO_EXTENSION = "qmd"
+
+
+class QuartoReader(readers.BaseReader):
+    """Read QMD Files using a Pelican Reader."""
+
+    enabled = True
+    file_extensions = [QUARTO_EXTENSION]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._source_path = None
+
+    def read(self, filename):
+        """Read QMD Files."""
+        with open(filename, encoding="utf-8") as file:
+            content = file.read()
+
+        # extract yaml header and content body
+        _, front_matter, markdown_body = re.split(r"^---\s*$", content, 2, re.MULTILINE)
+
+        metadata = yaml.load(front_matter, Loader=yaml.FullLoader)
+        metadata = self._parse_metadata(metadata)
+
+        article_content = markdown.markdown(markdown_body)
+        metadata["summary"] = self.generate_article_summary(
+            metadata.get("summary"), article_content
+        )
+
+        return article_content, metadata
+
+    def _parse_metadata(self, meta):
+        """Parse and format the metadata from YAML."""
+        output = {}
+        for name, value in meta.items():
+            key = name.lower()
+            output[key] = self.process_metadata(key, value)
+
+        return output
+
+    def generate_article_summary(self, existing_summary, content):
+        """Generate a summary if one does not exist."""
+        if existing_summary:
+            return existing_summary
+
+        # strip code blocks
+        content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
+        html_content = markdown.markdown(content_no_code)
+
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        max_paragraphs = self.settings.get("SUMMARY_MAX_PARAGRAPHS", None)
+        if max_paragraphs is not None:
+            paragraphs = soup.find_all("p")[:max_paragraphs]
+            html_content = "".join(str(p) for p in paragraphs)
+            soup = BeautifulSoup(html_content, "html.parser")
+
+        max_length = self.settings.get("SUMMARY_MAX_LENGTH", None)
+        end_suffix = self.settings.get("SUMMARY_END_SUFFIX", "...")
+
+        text_content = soup.get_text()
+        if max_length is not None:
+            words = text_content.split()
+            if len(words) > max_length:
+                text_content = " ".join(words[:max_length]) + end_suffix
+
+        return text_content
diff --git a/pelican/plugins/lm_pelican_quarto/tests/test_quarto.py b/pelican/plugins/lm_pelican_quarto/tests/test_quarto.py
index a38098e..e6f5cd4 100644
--- a/pelican/plugins/lm_pelican_quarto/tests/test_quarto.py
+++ b/pelican/plugins/lm_pelican_quarto/tests/test_quarto.py
@@ -28,8 +28,9 @@ def create_article(temp_path):
     article_content = """
 ---
 title: testqmd
-date: 2024-06-02
-category: test
+date: "2024-06-02"
+category: "test"
+tags: ["arts", "b"]
 ---
 
 Hi
@@ -58,9 +59,10 @@ def create_nested_article(temp_path):
 
     article_content = """
 ---
-title: testqmd
-date: 2024-06-02
-category: test
+title: "testqmd"
+date: "2024-06-02"
+category: "test"
+tags: ["a", "b"]
 ---
 
 Hi