diff --git a/osm/_utils.py b/osm/_utils.py
index 647d3d72..b4d9b2cf 100644
--- a/osm/_utils.py
+++ b/osm/_utils.py
@@ -87,7 +87,6 @@ def compose_up():
     print("Waiting for containers to be ready...")
     docker.compose.up(detach=True, wait=True, pull="always")
     print("Containers ready!")
-    sleep(5)
 
 
 def compose_down():
diff --git a/osm/cli.py b/osm/cli.py
index 3109a71f..2917d4a4 100644
--- a/osm/cli.py
+++ b/osm/cli.py
@@ -93,7 +93,7 @@ def main():
                 ),
             ),
         )
-        pipeline.run()
+        pipeline.run(user_managed_compose=args.user_managed_compose)
     finally:
         if not args.user_managed_compose:
             compose_down()
diff --git a/osm/pipeline/core.py b/osm/pipeline/core.py
index 7f2d43d6..0a9cd063 100644
--- a/osm/pipeline/core.py
+++ b/osm/pipeline/core.py
@@ -99,9 +99,9 @@ def __init__(
         self.xml_path = xml_path
         self.metrics_path = metrics_path
 
-    def run(self):
+    def run(self, user_managed_compose: bool = False):
         for parser in self.parsers:
-            parsed_data = parser.run(self.file_data)
+            parsed_data = parser.run(self.file_data, user_managed_compose=user_managed_compose)
             if isinstance(parsed_data, bytes):
                 self.savers.save_file(parsed_data, self.xml_path)
             for extractor in self.extractors:
diff --git a/osm/pipeline/parsers.py b/osm/pipeline/parsers.py
index b06a3977..73f236c3 100644
--- a/osm/pipeline/parsers.py
+++ b/osm/pipeline/parsers.py
@@ -1,5 +1,7 @@
+import time
+
 import requests
 
 from osm.schemas.custom_fields import LongBytes
 
 from .core import Component
@@ -28,13 +30,52 @@ class PMCParser(NoopParser):
 
 
 class ScienceBeamParser(Component):
-    def _run(self, data: bytes) -> str:
+    def _run(self, data: bytes, user_managed_compose: bool = False) -> bytes:
+        """Convert a PDF to TEI XML by posting it to the ScienceBeam service.
+
+        Connection-level failures are retried, because the service may still
+        be starting up when the first request is sent.
+
+        Args:
+            data: Raw bytes of the PDF to parse.
+            user_managed_compose: True when the containers are managed by the
+                user rather than by this tool; the initial wait is then skipped.
+
+        Returns:
+            The body of the HTTP 200 response.
+
+        Raises:
+            requests.exceptions.RequestException: If every attempt fails to
+                reach the service, or the service answers with a status other
+                than 200.
+        """
         self.sample = LongBytes(data)
         headers = {"Accept": "application/tei+xml", "Content-Type": "application/pdf"}
-        files = {'file': ('input.pdf', io.BytesIO(data), 'application/pdf')}
-
-        response = requests.post(SCIENCEBEAM_URL, files=files, headers=headers)
-        if response.status_code == 200:
-            return response.content
-        else:
-            response.raise_for_status()
+        max_attempts = 5
+        retry_delay = 10  # seconds
+        if not user_managed_compose:
+            # Containers started by this tool may still be warming up.
+            time.sleep(retry_delay)
+        for attempt in range(1, max_attempts + 1):
+            # requests reads the stream to the end while building the multipart
+            # body, so every attempt needs a fresh one or retries upload nothing.
+            files = {"file": ("input.pdf", io.BytesIO(data), "application/pdf")}
+            try:
+                response = requests.post(SCIENCEBEAM_URL, files=files, headers=headers)
+            except requests.exceptions.RequestException:
+                if attempt == max_attempts:
+                    raise  # out of retries: surface the error, never return None
+                print(
+                    f"Attempt {attempt} for parsing the file failed. This can happen "
+                    f"while the container is starting up. Retrying in {retry_delay} seconds."
+                )
+                time.sleep(retry_delay)
+                continue
+            if response.status_code == 200:
+                return response.content
+            # raise_for_status() only raises for 4xx/5xx responses.
+            response.raise_for_status()
+            raise requests.exceptions.HTTPError(
+                f"Unexpected status {response.status_code} from ScienceBeam",
+                response=response,
+            )