Skip to content

Commit

Permalink
Refactor PDFUploader tests for clarity and add new test cases for upl…
Browse files Browse the repository at this point in the history
…oad scenarios
  • Loading branch information
quang-ng committed Jan 9, 2025
1 parent eab1578 commit 63ebb76
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 34 deletions.
2 changes: 1 addition & 1 deletion dsst_etl/upload_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def run_uploader(

if not pdf_files:
logger.warning(f"No PDF files found in {pdf_directory_path}")
return
return [], []

with open(metadata_json_file_path) as f:
metadata = json.load(f)
Expand Down
116 changes: 83 additions & 33 deletions tests/test_pdf_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,67 +14,117 @@ def setUp(self, mock_boto_client):
super().setUp()
self.mock_s3_client = MagicMock()
mock_boto_client.return_value = self.mock_s3_client
# Initialize PDFUploader with the session
self.uploader = PDFUploader(self.session)
self.base_dir = Path(__file__).resolve().parent
self.pdf_paths = f"{self.base_dir}/pdf-test"
self.metadata_json_file_path = f"{self.base_dir}/pdf-test/metadata.json"

def test_run_uploader(self):
# Test that the uploader runs without errors
def test_run_uploader_successful_uploads(self):
"""Test that the uploader runs and uploads files successfully."""
self.mock_s3_client.upload_file.return_value = None

base_dir = Path(__file__).resolve().parent

pdf_paths = f"{base_dir}/pdf-test"
metadata_json_file_path = f"{base_dir}/pdf-test/metadata.json"

susccessful_uploads, failed_uploads = self.uploader.run_uploader(
pdf_directory_path=pdf_paths,
metadata_json_file_path=metadata_json_file_path,
successful_uploads, failed_uploads = self.uploader.run_uploader(
pdf_directory_path=self.pdf_paths,
metadata_json_file_path=self.metadata_json_file_path,
comment="Test upload comment",
)

self.assertEqual(len(susccessful_uploads), 2)
self.assertEqual(len(successful_uploads), 2)
self.assertEqual(len(failed_uploads), 0)

self.mock_s3_client.upload_file.assert_called()

self.assertEqual(self.session.query(Identifier).count(), 2)
def test_identifiers_created(self):
"""Test that identifiers are created correctly."""
self.test_run_uploader_successful_uploads()

identifiers = self.session.query(Identifier).all()
self.assertEqual("10.1038/s41586-023-06521-1" in [identifier.doi for identifier in identifiers], True)
self.assertEqual("10.1038/s41586-023-06521-2" in [identifier.doi for identifier in identifiers], True)

self.assertEqual(34567890 in [identifier.pmid for identifier in identifiers], True)
self.assertEqual(45678901 in [identifier.pmid for identifier in identifiers], True)

self.assertEqual("PMC9876543" in [identifier.pmcid for identifier in identifiers], True)
self.assertEqual("PMC1234567" in [identifier.pmcid for identifier in identifiers], True)

self.assertEqual(len(identifiers), 2)
self.assertIn("10.1038/s41586-023-06521-1", [identifier.doi for identifier in identifiers])
self.assertIn("10.1038/s41586-023-06521-2", [identifier.doi for identifier in identifiers])
self.assertIn(34567890, [identifier.pmid for identifier in identifiers])
self.assertIn(45678901, [identifier.pmid for identifier in identifiers])
self.assertIn("PMC9876543", [identifier.pmcid for identifier in identifiers])
self.assertIn("PMC1234567", [identifier.pmcid for identifier in identifiers])

def test_documents_created(self):
"""Test that documents are created correctly."""
self.test_run_uploader_successful_uploads()

documents = self.session.query(Documents).all()
self.assertEqual(len(documents), 2)
self.assertIn("s3://osm-pdf-uploads/pdfs/test2.pdf", [document.s3uri for document in documents])
self.assertIn("s3://osm-pdf-uploads/pdfs/test1.pdf", [document.s3uri for document in documents])

self.assertEqual("s3://osm-pdf-uploads/pdfs/test2.pdf" in [document.s3uri for document in documents], True)
self.assertEqual("s3://osm-pdf-uploads/pdfs/test1.pdf" in [document.s3uri for document in documents], True)
def test_provenance_created(self):
"""Test that provenance is created correctly."""
self.test_run_uploader_successful_uploads()

provenance = self.session.query(Provenance).first()
self.assertEqual(provenance.comment, "Test upload comment")
self.assertEqual(provenance.pipeline_name, "Document Upload")


def test_documents_linked_to_provenance(self):
"""Test that documents are linked to the correct provenance."""
self.test_run_uploader_successful_uploads()

documents = self.session.query(Documents).all()
provenance = self.session.query(Provenance).first()
self.assertEqual(documents[0].provenance_id, provenance.id)
self.assertEqual(documents[1].provenance_id, provenance.id)

def test_works_created(self):
"""Test that works are created correctly."""
self.test_run_uploader_successful_uploads()

works = self.session.query(Works).all()
documents = self.session.query(Documents).all()
self.assertEqual(len(works), 2)
self.assertEqual(works[0].initial_document_id in [document.id for document in documents], True)
self.assertEqual(works[1].initial_document_id in [document.id for document in documents], True)
self.assertIn(works[0].initial_document_id, [document.id for document in documents])

self.assertEqual(works[0].primary_document_id in [document.id for document in documents], True)
self.assertEqual(works[1].primary_document_id in [document.id for document in documents], True)
def test_empty_pdf_directory(self):
"""Test that the uploader handles an empty PDF directory gracefully."""
self.mock_s3_client.upload_file.return_value = None

empty_pdf_dir = f"{self.base_dir}/empty-pdf-test"
successful_uploads, failed_uploads = self.uploader.run_uploader(
pdf_directory_path=empty_pdf_dir,
metadata_json_file_path=self.metadata_json_file_path,
comment="Test upload comment",
)

self.assertEqual(len(successful_uploads), 0)
self.assertEqual(len(failed_uploads), 0)

def test_invalid_metadata_json(self):
"""Test that the uploader handles invalid metadata JSON files."""
self.mock_s3_client.upload_file.return_value = None

self.assertEqual(works[0].provenance_id, provenance.id)
self.assertEqual(works[1].provenance_id, provenance.id)
invalid_metadata_json_file_path = f"{self.base_dir}/pdf-test/invalid_metadata.json"
with self.assertRaises(FileNotFoundError):
self.uploader.run_uploader(
pdf_directory_path=self.pdf_paths,
metadata_json_file_path=invalid_metadata_json_file_path,
comment="Test upload comment",
)

def test_partial_upload_failures(self):
"""Test that the uploader correctly reports partial upload failures."""
def mock_upload_file(Bucket, Key, Filename):
if "test1.pdf" in Filename:
raise Exception("Upload failed for test1.pdf")
return None

self.mock_s3_client.upload_file.side_effect = mock_upload_file

successful_uploads, failed_uploads = self.uploader.run_uploader(
pdf_directory_path=self.pdf_paths,
metadata_json_file_path=self.metadata_json_file_path,
comment="Test upload comment",
)

self.assertEqual(len(successful_uploads), 1)
self.assertEqual(len(failed_uploads), 1)


if __name__ == "__main__":
unittest.main()
unittest.main()

0 comments on commit 63ebb76

Please sign in to comment.