Skip to content

Commit

Permalink
Merge pull request #48 from nimh-dsst/fix-intermittent-parser-error
Browse files (browse the repository at this point in the history)
retry parsing
  • Loading branch information
leej3 authored Aug 27, 2024
2 parents c1abcf7 + b33f705 commit 1c62f2d
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 12 deletions.
1 change: 0 additions & 1 deletion osm/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ def compose_up():
print("Waiting for containers to be ready...")
docker.compose.up(detach=True, wait=True, pull="always")
print("Containers ready!")
sleep(5)


def compose_down():
Expand Down
2 changes: 1 addition & 1 deletion osm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def main():
),
),
)
pipeline.run()
pipeline.run(user_managed_compose=args.user_managed_compose)
finally:
if not args.user_managed_compose:
compose_down()
Expand Down
4 changes: 2 additions & 2 deletions osm/pipeline/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ def __init__(
self.xml_path = xml_path
self.metrics_path = metrics_path

def run(self):
def run(self,user_managed_compose:bool=False):
for parser in self.parsers:
parsed_data = parser.run(self.file_data)
parsed_data = parser.run(self.file_data,user_managed_compose=user_managed_compose)
if isinstance(parsed_data, bytes):
self.savers.save_file(parsed_data, self.xml_path)
for extractor in self.extractors:
Expand Down
22 changes: 14 additions & 8 deletions osm/pipeline/parsers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import requests

import time
from osm.schemas.custom_fields import LongBytes

from .core import Component
Expand Down Expand Up @@ -28,13 +28,19 @@ class PMCParser(NoopParser):


class ScienceBeamParser(Component):
def _run(self, data: bytes) -> str:
def _run(self, data: bytes,user_managed_compose=False) -> str:
self.sample = LongBytes(data)
headers = {"Accept": "application/tei+xml", "Content-Type": "application/pdf"}
files = {'file': ('input.pdf', io.BytesIO(data), 'application/pdf')}

response = requests.post(SCIENCEBEAM_URL, files=files, headers=headers)
if response.status_code == 200:
return response.content
else:
response.raise_for_status()
for attempt in range(5):
try:
if not user_managed_compose:
time.sleep(10)
response = requests.post(SCIENCEBEAM_URL, files=files, headers=headers)
except requests.exceptions.RequestException as e:
print(f"Attempt {attempt + 1} for parsing the file failed. This can happen while the container is starting up. Retrying in 5 seconds.")
continue
if response.status_code == 200:
return response.content
else:
response.raise_for_status()

0 comments on commit 1c62f2d

Please sign in to comment.