Skip to content

Commit

Permalink
Add scraper code.
Browse files Browse the repository at this point in the history
Should have added this earlier, whoops.
  • Loading branch information
emmiegit committed Jan 13, 2024
1 parent 9f04bbd commit 4a94f40
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 0 deletions.
1 change: 1 addition & 0 deletions yellowstone/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pugsql

from .config import Config, getenv
from .jobs import site_home_raw
from .s3 import S3
from .wikidot import Wikidot

Expand Down
23 changes: 23 additions & 0 deletions yellowstone/scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
Utilities to assist with scraping.
"""

import re

import requests

from .exceptions import ScrapingError


def download_html(url: str, *, timeout: float = 30.0) -> str:
    """
    Fetch a page with an HTTP GET and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server to connect or send data
            before giving up, so a stalled server cannot hang the caller.

    Returns:
        The response body, decoded to ``str`` by requests.

    Raises:
        requests.HTTPError: If the server responds with a 4xx or 5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Response.text is a property, not a method; calling it would raise
    # "TypeError: 'str' object is not callable".
    return r.text


def regex_extract(source: str, body: str, regex: re.Pattern) -> re.Match:
    """
    Search ``body`` with a compiled pattern, failing loudly on a miss.

    Args:
        source: Label for where ``body`` came from; used only in the
            error message.
        body: Text to search.
        regex: Compiled pattern, applied with ``search`` so it may match
            anywhere in ``body``.

    Returns:
        The first match found, which is never ``None``.

    Raises:
        ScrapingError: If the pattern does not occur in ``body``.
    """
    found = regex.search(body)
    if found is not None:
        return found

    raise ScrapingError(f"Pattern {regex.pattern} failed for {source}")

0 comments on commit 4a94f40

Please sign in to comment.