Skip to content

Commit

Permalink
feat: Implement PDF to XML conversion using ScienceBeam Parser
Browse files Browse the repository at this point in the history
  • Loading branch information
gitstart-nimhdsst committed Jun 12, 2024
1 parent f7600d9 commit 20b331a
Show file tree
Hide file tree
Showing 11 changed files with 81 additions and 26 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,5 @@ __pycache__/
dist/
build/
.tox/
venv/
.vscode/settings.json
.DS_Store
Empty file added commands/converters/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions commands/converters/converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod

from pydantic import FilePath, BaseModel


class Converter(ABC, BaseModel):
    """Abstract base class for converters that turn a PDF file into a string."""

    @abstractmethod
    def convert(self, pdf_path: FilePath) -> str:
        """Convert the PDF located at ``pdf_path`` and return the result.

        Args:
            pdf_path: Path to the PDF file to convert.
        Returns:
            The converted content as a string.

        Subclasses must override this method; the base version does nothing.
        """
26 changes: 26 additions & 0 deletions commands/converters/pdf_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from pydantic import FilePath
import requests

from commands.converters.converter import Converter


class PDFConverter(Converter):
    """Converter that uploads a PDF to a ScienceBeam Parser HTTP endpoint."""

    # Endpoint the PDF is POSTed to.
    sciencebeam_url: str = 'http://localhost:8080/api/convert'
    # Seconds to wait on the service before giving up, so an unresponsive
    # server cannot block the caller forever.
    timeout: float = 300.0

    def convert(self, pdf_path: FilePath) -> str:
        """Convert a PDF file to XML using ScienceBeam Parser.

        Args:
            pdf_path: Path to the PDF file
        Returns:
            XML content as a string
        Raises:
            requests.RequestException: If the request fails, times out, or
                the service answers with anything other than HTTP 200.
        """
        headers = {'Accept': 'application/tei+xml'}
        # The file must stay open while the request is sent, so the POST
        # happens inside the ``with`` block.
        with open(pdf_path, 'rb') as pdf_file:
            response = requests.post(
                self.sciencebeam_url,
                files={'file': pdf_file},
                headers=headers,
                timeout=self.timeout,
            )

        # Raises requests.HTTPError for 4xx/5xx responses.
        response.raise_for_status()
        # Other non-200 statuses do not trip raise_for_status(); raise
        # explicitly rather than returning None to the caller.
        if response.status_code != 200:
            raise requests.HTTPError(
                f'Unexpected status code {response.status_code} '
                f'from {self.sciencebeam_url}',
                response=response,
            )
        return response.text
21 changes: 18 additions & 3 deletions commands/file_processing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import click
import requests
from pydantic import ValidationError

from commands.converters.pdf_converter import PDFConverter
from logs.logger import logger


@click.command()
@click.argument('file_path', type=click.Path(exists=True))
@click.argument('file_id', type=str)
Expand All @@ -9,11 +14,21 @@ def pdf_xml(file_path, file_id):
to XML
Args:
file_paht (file path): First parameter
file_path (file path): First parameter
file_id (string): Second parameter
Returns:
Creates an XML file in the directory xmls_sciencebeam
"""
try:
    converter = PDFConverter()
    xml_content = converter.convert(file_path)
    # Save the converted XML contents, named after the given file_id.
    with open(f'docs/examples/sciencebeam_xml_outputs/{file_id}.xml', 'w', encoding='utf-8') as xml_file:
        xml_file.write(xml_content)
    # Report success only once the XML file has actually been written.
    logger.info(f'Converted: {file_path} with ID: {file_id} to XML')
except ValidationError as error:
    # The exception is passed as a %-style argument: without a placeholder
    # in the message, a stdlib-style logger fails to format the record.
    logger.error("Validation error: %s", error)
except requests.RequestException as error:
    logger.error("Request error: %s", error)
Binary file modified docs/examples/pdf_inputs/test_sample.pdf
Binary file not shown.
Binary file removed example_pdf_inputs/test.pdf
Binary file not shown.
Binary file removed example_pdf_inputs/test_sample.pdf
Binary file not shown.
4 changes: 3 additions & 1 deletion osm/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import click
from commands.file_processing import pdf_xml


@click.group()
def cli():
    """Main command group."""
    # Intentionally empty: the group only acts as a container for the
    # subcommands registered on it.


# Register the subcommands on the main group
cli.add_command(pdf_xml)

if __name__ == '__main__':
    # Invoke the group exactly once when run as a script.
    cli()
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
click==8.1.7
tox==4.15.0
pydantic==2.7.3
pytest==8.2.1
rich==13.7.1
requests==2.32.3
44 changes: 23 additions & 21 deletions tests/test_file_processing.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,33 @@
import pytest
import unittest
from click.testing import CliRunner
import os
import logging

from commands.file_processing import pdf_xml
from osm.cli import cli

class TestFileProcessing(unittest.TestCase):
    """End-to-end test of the ``pdf-xml`` command through the main CLI group.

    NOTE(review): the command posts the PDF to the converter's configured
    ScienceBeam URL, so this test presumably needs that service running.
    """

    def setUp(self):
        # Sample input PDF and the ID used to name the generated XML file.
        self.pdfs_folder = 'docs/examples/pdf_inputs'
        self.file = 'test_sample.pdf'
        self.file_id = 'test_file_id'
        self.output_file = f'docs/examples/sciencebeam_xml_outputs/{self.file_id}.xml'

    def tearDown(self):
        # Remove the generated XML file
        if os.path.exists(self.output_file):
            os.remove(self.output_file)

    def test_pdf_xml_command(self):
        """The command exits cleanly and writes the XML output file."""
        runner = CliRunner()
        pdf_path = f'{self.pdfs_folder}/{self.file}'
        result = runner.invoke(cli, ['pdf-xml', pdf_path, self.file_id])

        # Check that the command executed successfully
        self.assertEqual(result.exit_code, 0)
        self.assertTrue(os.path.exists(self.output_file))


if __name__ == '__main__':
    unittest.main()

0 comments on commit 20b331a

Please sign in to comment.