Skip to content

Commit

Permalink
Add rpy2 dependency, refactor process_pdfs method, and implement unit tests for OddpubWrapper
Browse files: browse the repository at this point in the history
  • Loading branch information
quang-ng committed Dec 10, 2024
1 parent da63428 commit e08aff9
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 4 deletions.
6 changes: 2 additions & 4 deletions dsst_etl/oddpub_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def _cleanup_output_folder(self, output_folder: str) -> None:
logger.error(f"Error cleaning up output folder: {str(e)}")
raise

def process_pdfs(self, pdf_folder: str, output_folder: str) -> OddpubMetrics:
def process_pdfs(self, pdf_folder: str) -> OddpubMetrics:
"""
Process PDFs through the complete ODDPub workflow and store results in database.
Expand All @@ -118,6 +118,7 @@ def process_pdfs(self, pdf_folder: str, output_folder: str) -> OddpubMetrics:
"""
try:
# Create output directory if it doesn't exist
output_folder = "oddpub_output"
Path(output_folder).mkdir(parents=True, exist_ok=True)

# Execute the workflow
Expand All @@ -130,9 +131,6 @@ def process_pdfs(self, pdf_folder: str, output_folder: str) -> OddpubMetrics:
self.db.commit()
logger.info("Successfully stored results in database")

# Cleanup
self._cleanup_output_folder(output_folder)

return result
except Exception as e:
logger.error(f"Error in PDF processing workflow: {str(e)}")
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies = [
"pdf2doi",
"tqdm",
"pypdf",
"rpy2"
]

[project.optional-dependencies]
Expand Down
34 changes: 34 additions & 0 deletions tests/test_oddpub_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest
from unittest.mock import patch, MagicMock
from sqlalchemy import inspect
from dsst_etl.oddpub_wrapper import OddpubWrapper
from dsst_etl.models import OddpubMetrics
from dsst_etl.db import get_db_session, init_db
from dsst_etl import get_db_engine


class TestOddpubWrapper(unittest.TestCase):
    """Exercise OddpubWrapper.process_pdfs against the test database."""

    def setUp(self):
        """Build the test engine, run init_db on it, and create the wrapper."""
        self.engine = get_db_engine(is_test=True)
        init_db(self.engine)
        self.session = get_db_session(self.engine)
        self.wrapper = OddpubWrapper(self.session)

    def tearDown(self):
        """Close the session and dispose of the engine after each test."""
        self.session.close()
        self.engine.dispose()

    def test_oddpub_wrapper(self):
        """Processing a PDF folder should leave an OddpubMetrics row behind."""
        # process_pdfs takes only the PDF folder; passing a second positional
        # argument (the former output_folder) raises TypeError.
        self.wrapper.process_pdfs("test_pdfs")

        # Check if the OddpubMetrics table exists
        inspector = inspect(self.session.bind)
        self.assertIn("oddpub_metrics", inspector.get_table_names())

        # Check if the data was inserted correctly
        result = self.session.query(OddpubMetrics).first()
        self.assertIsNotNone(result)

# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit e08aff9

Please sign in to comment.