Skip to content

Commit

Permalink
feat(cl_back_scrape_oral_arguments): support optional backscraping args
Browse files Browse the repository at this point in the history
- Abstract into a function the backscraper `parser.add_argument` calls,
to prevent duplication
- Add dynamic backscraping support to oral argument scrapers
- Support days_interval optional argument, as expected in freelawproject/juriscraper#1108
  • Loading branch information
grossir committed Oct 11, 2024
1 parent 9f2dcd7 commit aa6d3f6
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 13 deletions.
40 changes: 28 additions & 12 deletions cl/scrapers/management/commands/cl_back_scrape_opinions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,34 @@
from cl.scrapers.management.commands import cl_scrape_opinions


def add_backscraper_arguments(parser) -> None:
    """Adds backscraper specific optional arguments to the parser

    Shared by the opinion and oral argument backscraper commands so the
    options are defined in a single place.

    :param parser: the argparse-compatible parser to extend with the
        --backscrape-start, --backscrape-end and --days-interval options
    :return: None
    """
    parser.add_argument(
        "--backscrape-start",
        dest="backscrape_start",
        # Kept as a free-form string: each scraper parses it itself,
        # since the value may represent a year, a string, a date, etc.
        help="Starting value for backscraper iterable creation. "
        "Each scraper handles the parsing of the argument, "
        "since the value may represent a year, a string, a date, etc.",
    )
    parser.add_argument(
        "--backscrape-end",
        dest="backscrape_end",
        help="End value for backscraper iterable creation.",
    )
    parser.add_argument(
        "--days-interval",
        help="Days between each (start, end) date pair in "
        "the back_scrape_iterable. Useful to shorten the ranges "
        "when there are too many opinions in a range, and the source "
        "imposes a limit of returned documents",
        type=int,
    )


class Command(cl_scrape_opinions.Command):
def add_arguments(self, parser):
    """Register CLI options: the parent scrape command's options plus
    the shared backscraping options (--backscrape-start,
    --backscrape-end, --days-interval).

    The backscrape options are registered only via the shared helper;
    defining them inline here as well would register the same option
    strings twice and make argparse raise a conflict error.

    :param parser: the command's argument parser
    :return: None
    """
    super().add_arguments(parser)
    add_backscraper_arguments(parser)

def parse_and_scrape_site(
self,
Expand All @@ -35,6 +48,8 @@ def parse_and_scrape_site(
which is parsed and used by a scraper as start value for the
range to be backscraped
- backscrape_end: end value for backscraper range
- days_interval: days between each (start, end) date pairs in the
Site.back_scrape_iterable
:return: None
"""
Expand All @@ -44,6 +59,7 @@ def parse_and_scrape_site(
mod.Site(
backscrape_start=options.get("backscrape_start"),
backscrape_end=options.get("backscrape_end"),
days_interval=options.get("days_interval"),
).back_scrape_iterable,
mod,
):
Expand Down
16 changes: 15 additions & 1 deletion cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,27 @@
from juriscraper.lib.importer import site_yielder

from cl.scrapers.management.commands import cl_scrape_oral_arguments
from cl.scrapers.management.commands.cl_back_scrape_opinions import (
add_backscraper_arguments,
)


class Command(cl_scrape_oral_arguments.Command):
    """Backscrape oral arguments, forwarding the optional backscraping
    arguments (start, end, days interval) to each scraper Site."""

    def add_arguments(self, parser):
        """Register the parent command's options plus the shared
        backscraping options via the common helper."""
        super().add_arguments(parser)
        add_backscraper_arguments(parser)

    def parse_and_scrape_site(self, mod, options: dict):
        """Parse the site and scrape it using the backscraper

        :param mod: the juriscraper Site module
        :param options: command line optional arguments; the
            backscrape_start, backscrape_end and days_interval values
            are forwarded to the Site constructor, which uses them to
            build its back_scrape_iterable
        :return: None
        """
        # e.g. "juriscraper.oral_args.united_states.federal.ca9_u" -> "ca9"
        court_str = mod.__name__.split(".")[-1].split("_")[0]
        logger.info(f'Using court_str: "{court_str}"')

        for site in site_yielder(
            mod.Site(
                backscrape_start=options.get("backscrape_start"),
                backscrape_end=options.get("backscrape_end"),
                days_interval=options.get("days_interval"),
            ).back_scrape_iterable,
            mod,
        ):
            site.parse()
            self.scrape_court(site, full_crawl=True, backscrape=True)

0 comments on commit aa6d3f6

Please sign in to comment.