Skip to content

Commit

Permalink
Fix tag list (#5)
Browse files Browse the repository at this point in the history
* refactor and add metadata parsing method

* fix failing tests

---------

Co-authored-by: lukas <[email protected]>
  • Loading branch information
lmuenter and lukas authored Sep 25, 2024
1 parent 25ceb6c commit ef8ceaf
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 90 deletions.
87 changes: 2 additions & 85 deletions pelican/plugins/lm_pelican_quarto/quarto.py
Original file line number Diff line number Diff line change
@@ -1,102 +1,19 @@
from datetime import date, datetime
import logging
from pathlib import Path
import re

from bs4 import BeautifulSoup
import markdown
import pytz
import yaml

from pelican import readers, signals
from pelican.contents import Author, Category
from pelican import signals
from pelican.generators import ArticlesGenerator

from .adapters import Quarto
from .parsers import QuartoHTML
from .readers import QuartoReader

logger = logging.getLogger(__name__)
QUARTO_EXTENSION = "qmd"


class QuartoReader(readers.BaseReader):
    """Read Quarto (``.qmd``) files using a Pelican reader.

    A source file is expected to start with a YAML front-matter block
    delimited by ``---`` lines, followed by a Markdown body.
    """

    file_extensions = [QUARTO_EXTENSION]

    def read(self, filename):
        """Read a QMD file and return ``(html_content, metadata)``.

        Args:
            filename: Path of the ``.qmd`` file to read (opened as UTF-8).

        Returns:
            A tuple of the body rendered to HTML and the metadata dict.

        Raises:
            ValueError: If the file has no ``---`` delimited front matter.
        """
        with open(filename, encoding="utf-8") as file:
            content = file.read()

        # Extract the YAML header and the content body. ``maxsplit`` and
        # ``flags`` are passed by keyword: passing them positionally is
        # deprecated since Python 3.13.
        parts = re.split(r"^---\s*$", content, maxsplit=2, flags=re.MULTILINE)
        if len(parts) != 3:
            raise ValueError(
                f"{filename}: expected YAML front matter delimited by '---' lines"
            )
        _, front_matter, markdown_body = parts

        # NOTE(review): FullLoader can construct more than plain data types;
        # consider yaml.safe_load if content files may be untrusted.
        # An empty front-matter block loads as None, so fall back to a dict.
        metadata = yaml.load(front_matter, Loader=yaml.FullLoader) or {}

        # Ensure the date (when given) carries timezone information.
        if "date" in metadata:
            metadata["date"] = self.parse_date(metadata["date"])

        if "category" in metadata:
            metadata["category"] = Category(
                metadata["category"], settings=self.settings
            )
        if "author" in metadata:
            metadata["author"] = Author(metadata["author"], settings=self.settings)

        article_content = markdown.markdown(markdown_body)
        # The summary is generated from the Markdown source rather than the
        # rendered HTML: code fences only exist in the source, so stripping
        # them after rendering would never match anything.
        metadata["summary"] = self.generate_article_summary(
            metadata.get("summary"), markdown_body
        )
        return article_content, metadata

    def parse_date(self, date_input):
        """Return *date_input* as a timezone-aware ``datetime``.

        Args:
            date_input: A ``datetime``, a ``date``, or a ``YYYY-MM-DD`` string.

        Returns:
            A ``datetime`` with ``tzinfo`` set (UTC when none was present), or
            ``None`` (after logging an error) for any other input type.

        Raises:
            ValueError: If a string does not match ``%Y-%m-%d``.
        """
        # datetime must be tested first: it is a subclass of date.
        if isinstance(date_input, datetime):
            if date_input.tzinfo:
                return date_input
            return date_input.replace(tzinfo=pytz.UTC)
        if isinstance(date_input, date):
            return datetime(
                year=date_input.year,
                month=date_input.month,
                day=date_input.day,
                tzinfo=pytz.UTC,
            )
        if isinstance(date_input, str):
            return datetime.strptime(date_input, "%Y-%m-%d").replace(tzinfo=pytz.UTC)
        logger.error("Invalid date format or type")
        return None

    def generate_article_summary(self, existing_summary, content):
        """Return *existing_summary*, or a plain-text summary of *content*.

        Args:
            existing_summary: Summary from the front matter; returned as-is
                when truthy.
            content: Article body as Markdown source.

        Returns:
            The summary text. It is limited to ``SUMMARY_MAX_PARAGRAPHS``
            paragraphs and ``SUMMARY_MAX_LENGTH`` words when those settings
            are set, with ``SUMMARY_END_SUFFIX`` appended on truncation.
        """
        if existing_summary:
            return existing_summary

        # Strip fenced code blocks before rendering.
        content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
        html_content = markdown.markdown(content_no_code)

        soup = BeautifulSoup(html_content, "html.parser")

        max_paragraphs = self.settings.get("SUMMARY_MAX_PARAGRAPHS", None)
        if max_paragraphs is not None:
            # Keep only the leading <p> elements and re-parse them.
            paragraphs = soup.find_all("p")[:max_paragraphs]
            html_content = "".join(str(p) for p in paragraphs)
            soup = BeautifulSoup(html_content, "html.parser")

        max_length = self.settings.get("SUMMARY_MAX_LENGTH", None)
        end_suffix = self.settings.get("SUMMARY_END_SUFFIX", "...")

        text_content = soup.get_text()
        if max_length is not None:
            # The length limit is counted in whitespace-separated words.
            words = text_content.split()
            if len(words) > max_length:
                text_content = " ".join(words[:max_length]) + end_suffix

        return text_content


def setup_quarto_project(pelican_instance):
"""Set up the Quarto project if a .qmd file is found."""
content_path = Path(pelican_instance.settings["PATH"])
Expand Down
77 changes: 77 additions & 0 deletions pelican/plugins/lm_pelican_quarto/readers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import logging
import re

from bs4 import BeautifulSoup
import markdown
import yaml

from pelican import readers

logger = logging.getLogger(__name__)
QUARTO_EXTENSION = "qmd"


class QuartoReader(readers.BaseReader):
    """Read Quarto (``.qmd``) files using a Pelican reader.

    A source file is expected to start with a YAML front-matter block
    delimited by ``---`` lines, followed by a Markdown body.
    """

    enabled = True
    file_extensions = [QUARTO_EXTENSION]

    def __init__(self, *args, **kwargs):
        """Initialise the base reader and reset the tracked source path."""
        super().__init__(*args, **kwargs)
        self._source_path = None

    def read(self, filename):
        """Read a QMD file and return ``(html_content, metadata)``.

        Args:
            filename: Path of the ``.qmd`` file to read (opened as UTF-8).

        Returns:
            A tuple of the body rendered to HTML and the processed metadata.

        Raises:
            ValueError: If the file has no ``---`` delimited front matter.
        """
        with open(filename, encoding="utf-8") as file:
            content = file.read()

        # Extract the YAML header and the content body. ``maxsplit`` and
        # ``flags`` are passed by keyword: passing them positionally is
        # deprecated since Python 3.13.
        parts = re.split(r"^---\s*$", content, maxsplit=2, flags=re.MULTILINE)
        if len(parts) != 3:
            raise ValueError(
                f"{filename}: expected YAML front matter delimited by '---' lines"
            )
        _, front_matter, markdown_body = parts

        # NOTE(review): FullLoader can construct more than plain data types;
        # consider yaml.safe_load if content files may be untrusted.
        metadata = yaml.load(front_matter, Loader=yaml.FullLoader)
        metadata = self._parse_metadata(metadata)

        article_content = markdown.markdown(markdown_body)
        # The summary is generated from the Markdown source rather than the
        # rendered HTML: code fences only exist in the source, so stripping
        # them after rendering would never match anything.
        metadata["summary"] = self.generate_article_summary(
            metadata.get("summary"), markdown_body
        )

        return article_content, metadata

    def _parse_metadata(self, meta):
        """Parse and format the metadata loaded from the YAML front matter.

        Keys are lower-cased and each value is passed through
        ``self.process_metadata`` (not defined in this class; presumably
        inherited from the Pelican base reader).

        Args:
            meta: Mapping loaded from the front matter, or ``None`` when the
                front-matter block is empty.

        Returns:
            A new dict of processed metadata (empty when *meta* is ``None``).
        """
        output = {}
        # An empty front-matter block loads as None rather than a mapping.
        for name, value in (meta or {}).items():
            # YAML keys are not necessarily strings (e.g. ``2024: x``).
            key = str(name).lower()
            output[key] = self.process_metadata(key, value)

        return output

    def generate_article_summary(self, existing_summary, content):
        """Return *existing_summary*, or a plain-text summary of *content*.

        Args:
            existing_summary: Summary from the front matter; returned as-is
                when truthy.
            content: Article body as Markdown source.

        Returns:
            The summary text. It is limited to ``SUMMARY_MAX_PARAGRAPHS``
            paragraphs and ``SUMMARY_MAX_LENGTH`` words when those settings
            are set, with ``SUMMARY_END_SUFFIX`` appended on truncation.
        """
        if existing_summary:
            return existing_summary

        # Strip fenced code blocks before rendering.
        content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
        html_content = markdown.markdown(content_no_code)

        soup = BeautifulSoup(html_content, "html.parser")

        max_paragraphs = self.settings.get("SUMMARY_MAX_PARAGRAPHS", None)
        if max_paragraphs is not None:
            # Keep only the leading <p> elements and re-parse them.
            paragraphs = soup.find_all("p")[:max_paragraphs]
            html_content = "".join(str(p) for p in paragraphs)
            soup = BeautifulSoup(html_content, "html.parser")

        max_length = self.settings.get("SUMMARY_MAX_LENGTH", None)
        end_suffix = self.settings.get("SUMMARY_END_SUFFIX", "...")

        text_content = soup.get_text()
        if max_length is not None:
            # The length limit is counted in whitespace-separated words.
            words = text_content.split()
            if len(words) > max_length:
                text_content = " ".join(words[:max_length]) + end_suffix

        return text_content
12 changes: 7 additions & 5 deletions pelican/plugins/lm_pelican_quarto/tests/test_quarto.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ def create_article(temp_path):
article_content = """
---
title: testqmd
date: 2024-06-02
category: test
date: "2024-06-02"
category: "test"
tags: ["arts", "b"]
---
Hi
Expand Down Expand Up @@ -58,9 +59,10 @@ def create_nested_article(temp_path):

article_content = """
---
title: testqmd
date: 2024-06-02
category: test
title: "testqmd"
date: "2024-06-02"
category: "test"
tags: ["a", "b"]
---
Hi
Expand Down

0 comments on commit ef8ceaf

Please sign in to comment.