Skip to content

Commit

Permalink
retry parsing
Browse files Browse the repository at this point in the history
The ScienceBeam container starts up slowly.
This change can help by retrying, but sometimes the container
dies as a result of a failed request.
  • Loading branch information
leej3 committed Aug 27, 2024
1 parent c1abcf7 commit b33f705
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 12 deletions.
1 change: 0 additions & 1 deletion osm/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ def compose_up():
print("Waiting for containers to be ready...")
docker.compose.up(detach=True, wait=True, pull="always")
print("Containers ready!")
sleep(5)


def compose_down():
Expand Down
2 changes: 1 addition & 1 deletion osm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def main():
),
),
)
pipeline.run()
pipeline.run(user_managed_compose=args.user_managed_compose)
finally:
if not args.user_managed_compose:
compose_down()
Expand Down
4 changes: 2 additions & 2 deletions osm/pipeline/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ def __init__(
self.xml_path = xml_path
self.metrics_path = metrics_path

def run(self):
def run(self,user_managed_compose:bool=False):
for parser in self.parsers:
parsed_data = parser.run(self.file_data)
parsed_data = parser.run(self.file_data,user_managed_compose=user_managed_compose)
if isinstance(parsed_data, bytes):
self.savers.save_file(parsed_data, self.xml_path)
for extractor in self.extractors:
Expand Down
22 changes: 14 additions & 8 deletions osm/pipeline/parsers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import requests

import time
from osm.schemas.custom_fields import LongBytes

from .core import Component
Expand Down Expand Up @@ -28,13 +28,19 @@ class PMCParser(NoopParser):


class ScienceBeamParser(Component):
    """Parser component that posts a PDF to the ScienceBeam service."""

    def _run(self, data: bytes, user_managed_compose: bool = False) -> bytes:
        """Upload ``data`` to ``SCIENCEBEAM_URL`` and return the response body.

        The request is attempted up to five times, because connection-level
        failures can happen while the container is still starting up.

        Args:
            data: Raw bytes of the PDF to parse.
            user_managed_compose: When False, pause before every attempt to
                give the container time to come up. When True, the first
                attempt is sent immediately and only retries are delayed.

        Returns:
            The body of the HTTP 200 response (TEI XML is requested via the
            ``Accept`` header).

        Raises:
            requests.exceptions.RequestException: If the final attempt fails
                at the connection level, or the service answers with an
                error status or any other non-200 status.
        """
        max_attempts = 5
        startup_delay = 10  # seconds to wait before each attempt
        retry_delay = 5  # seconds between retries when the user runs compose

        self.sample = LongBytes(data)
        # NOTE(review): an explicit Content-Type is sent together with a
        # multipart ``files=`` upload; confirm the service expects this.
        headers = {"Accept": "application/tei+xml", "Content-Type": "application/pdf"}

        for attempt in range(1, max_attempts + 1):
            if not user_managed_compose:
                time.sleep(startup_delay)
            try:
                # Pass the raw bytes rather than a shared BytesIO: a stream is
                # consumed by the first request, so a retry would upload an
                # empty file.
                response = requests.post(
                    SCIENCEBEAM_URL,
                    files={"file": ("input.pdf", data, "application/pdf")},
                    headers=headers,
                )
            except requests.exceptions.RequestException:
                if attempt == max_attempts:
                    # Out of retries: surface the failure instead of
                    # silently returning None.
                    raise
                wait = retry_delay if user_managed_compose else startup_delay
                print(
                    f"Attempt {attempt} for parsing the file failed. "
                    "This can happen while the container is starting up. "
                    f"Retrying in {wait} seconds."
                )
                if user_managed_compose:
                    time.sleep(retry_delay)
                continue

            if response.status_code == 200:
                return response.content
            # 4xx/5xx responses raise here and are not retried.
            response.raise_for_status()
            # Any other non-200 status is still not a usable parse result.
            raise requests.exceptions.HTTPError(
                f"Unexpected status code {response.status_code} from ScienceBeam",
                response=response,
            )

0 comments on commit b33f705

Please sign in to comment.