diff --git a/osm/_utils.py b/osm/_utils.py index b4d9b2cf..d94c680b 100644 --- a/osm/_utils.py +++ b/osm/_utils.py @@ -5,7 +5,6 @@ import time import types from pathlib import Path -from time import sleep import pandas as pd import requests diff --git a/osm/pipeline/core.py b/osm/pipeline/core.py index 0a9cd063..16e38ba9 100644 --- a/osm/pipeline/core.py +++ b/osm/pipeline/core.py @@ -15,7 +15,7 @@ def __init__(self, version: str = "0.0.1"): self._orm_model = None @abstractmethod - def _run(self, data: bytes|dict, **kwargs) -> Any: + def _run(self, data: bytes | dict, **kwargs) -> Any: """Abstract method that subclasses must implement.""" pass @@ -99,9 +99,11 @@ def __init__( self.xml_path = xml_path self.metrics_path = metrics_path - def run(self,user_managed_compose:bool=False): + def run(self, user_managed_compose: bool = False): for parser in self.parsers: - parsed_data = parser.run(self.file_data,user_managed_compose=user_managed_compose) + parsed_data = parser.run( + self.file_data, user_managed_compose=user_managed_compose + ) if isinstance(parsed_data, bytes): self.savers.save_file(parsed_data, self.xml_path) for extractor in self.extractors: diff --git a/osm/pipeline/parsers.py b/osm/pipeline/parsers.py index 73f236c3..133bb4bc 100644 --- a/osm/pipeline/parsers.py +++ b/osm/pipeline/parsers.py @@ -1,9 +1,11 @@ -import requests +import io import time + +import requests + from osm.schemas.custom_fields import LongBytes from .core import Component -import io SCIENCEBEAM_URL = "http://localhost:8070/api/convert" @@ -28,17 +30,19 @@ class PMCParser(NoopParser): class ScienceBeamParser(Component): - def _run(self, data: bytes,user_managed_compose=False) -> str: + def _run(self, data: bytes, user_managed_compose=False) -> str: self.sample = LongBytes(data) headers = {"Accept": "application/tei+xml", "Content-Type": "application/pdf"} - files = {'file': ('input.pdf', io.BytesIO(data), 'application/pdf')} + files = {"file": ("input.pdf", io.BytesIO(data), "application/pdf")} for attempt in range(5): try: if not user_managed_compose: time.sleep(10) response = requests.post(SCIENCEBEAM_URL, files=files, headers=headers) - except requests.exceptions.RequestException as e: - print(f"Attempt {attempt + 1} for parsing the file failed. This can happen while the container is starting up. Retrying in 5 seconds.") + except requests.exceptions.RequestException: + print( + f"Attempt {attempt + 1} for parsing the file failed. This can happen while the container is starting up. Retrying in 5 seconds." + ) continue if response.status_code == 200: return response.content