Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

retry parsing #48

Merged
merged 1 commit into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion osm/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ def compose_up():
print("Waiting for containers to be ready...")
docker.compose.up(detach=True, wait=True, pull="always")
print("Containers ready!")
sleep(5)


def compose_down():
Expand Down
2 changes: 1 addition & 1 deletion osm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def main():
),
),
)
pipeline.run()
pipeline.run(user_managed_compose=args.user_managed_compose)
finally:
if not args.user_managed_compose:
compose_down()
Expand Down
4 changes: 2 additions & 2 deletions osm/pipeline/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ def __init__(
self.xml_path = xml_path
self.metrics_path = metrics_path

def run(self):
def run(self,user_managed_compose:bool=False):
for parser in self.parsers:
parsed_data = parser.run(self.file_data)
parsed_data = parser.run(self.file_data,user_managed_compose=user_managed_compose)
if isinstance(parsed_data, bytes):
self.savers.save_file(parsed_data, self.xml_path)
for extractor in self.extractors:
Expand Down
22 changes: 14 additions & 8 deletions osm/pipeline/parsers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import requests

import time
from osm.schemas.custom_fields import LongBytes

from .core import Component
Expand Down Expand Up @@ -28,13 +28,19 @@ class PMCParser(NoopParser):


class ScienceBeamParser(Component):
    """Parser component that converts a PDF to TEI XML via the ScienceBeam service."""

    def _run(self, data: bytes, user_managed_compose=False) -> bytes:
        """Post the PDF bytes to ``SCIENCEBEAM_URL`` and return the response body.

        The request is attempted up to five times, because the service may
        still be starting when the first request is sent.

        Args:
            data: Raw bytes of the PDF to convert.
            user_managed_compose: When False, wait 10 seconds before every
                attempt to give the container time to come up. When True,
                no wait is inserted.

        Returns:
            The body of the HTTP 200 response.

        Raises:
            requests.exceptions.RequestException: If the final attempt fails
                at the transport level.
            requests.exceptions.HTTPError: If the service answers with an
                error status, or never answers with status 200.
        """
        self.sample = LongBytes(data)
        # NOTE(review): an explicit Content-Type is sent together with
        # ``files=``; confirm this is what the ScienceBeam endpoint expects.
        headers = {"Accept": "application/tei+xml", "Content-Type": "application/pdf"}

        max_attempts = 5
        startup_wait = 0 if user_managed_compose else 10
        if startup_wait:
            retry_note = f"Retrying in {startup_wait} seconds."
        else:
            retry_note = "Retrying now."

        response = None
        for attempt in range(1, max_attempts + 1):
            if startup_wait:
                time.sleep(startup_wait)
            # Build a fresh stream per attempt: requests consumes the
            # file object while encoding the multipart body, so reusing one
            # stream would upload an empty file on every retry.
            files = {'file': ('input.pdf', io.BytesIO(data), 'application/pdf')}
            try:
                response = requests.post(SCIENCEBEAM_URL, files=files, headers=headers)
            except requests.exceptions.RequestException:
                if attempt == max_attempts:
                    # Out of retries: surface the failure instead of
                    # silently returning None.
                    raise
                print(
                    f"Attempt {attempt} for parsing the file failed. "
                    "This can happen while the container is starting up. "
                    f"{retry_note}"
                )
                continue
            if response.status_code == 200:
                return response.content
            # Error statuses (4xx/5xx) raise immediately; any other non-200
            # status falls through and is retried.
            response.raise_for_status()

        # Every attempt returned a non-200 status that is not an HTTP error.
        raise requests.exceptions.HTTPError(
            f"ScienceBeam returned unexpected status {response.status_code} "
            f"after {max_attempts} attempts.",
            response=response,
        )
Loading