Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate work for document #9

Merged
merged 35 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
ea2e1ae
Generate work for document
quang-ng Nov 19, 2024
a28107a
Add handle error
quang-ng Nov 19, 2024
4e1bb32
fix pre-commit hook
quang-ng Nov 19, 2024
b349dcc
make fix for missing variable
leej3 Nov 19, 2024
8217eed
use uv sync
leej3 Nov 19, 2024
fa113ea
add some fixes for the tests
leej3 Nov 19, 2024
82c846b
add unit-test for uploader
quang-ng Nov 21, 2024
e016e85
Add S3_BUCKET_NAME in mock env file
quang-ng Nov 21, 2024
5e4729c
Fix test created docs
quang-ng Nov 21, 2024
70c19c6
Add more docs
quang-ng Nov 21, 2024
bdccdfb
Create provenance
quang-ng Nov 21, 2024
b7c2702
add more test assert
quang-ng Nov 21, 2024
1874d8b
add more exception handle
quang-ng Nov 21, 2024
7c8d225
Remove os with pathlib
quang-ng Nov 21, 2024
c76d218
Run test with coverage
quang-ng Nov 21, 2024
e47c7df
fix cmd
quang-ng Nov 21, 2024
7762f2a
update ignore file
quang-ng Nov 21, 2024
8173ba6
fix test
quang-ng Nov 21, 2024
eb0a755
initall dot-env
quang-ng Nov 21, 2024
350bc70
update sqlalchemy
quang-ng Nov 21, 2024
782d6af
fix run unit-test by uv
quang-ng Nov 21, 2024
0ea3f78
add coverage
quang-ng Nov 21, 2024
a5bd63d
add coverage dependencies
quang-ng Nov 21, 2024
0e1b9c0
add ci
quang-ng Nov 21, 2024
1568924
add config file
quang-ng Nov 21, 2024
7f3323e
update config file
quang-ng Nov 21, 2024
b74545c
Changed test PDFs to public domain, Irish Poetry
joshlawrimore Nov 22, 2024
6fa3ae0
Fix some warning
quang-ng Nov 23, 2024
4c7c03e
Add unit-test database
quang-ng Nov 23, 2024
66f7357
Add check table
quang-ng Nov 23, 2024
6e43d1d
add more logs
quang-ng Nov 23, 2024
d4158e3
Add logs show
quang-ng Nov 23, 2024
3e20bd1
print tlb
quang-ng Nov 23, 2024
a084c57
Add more logs
quang-ng Nov 24, 2024
c7525b7
Add logs and remove print
quang-ng Nov 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,25 @@ jobs:
- name: Install dependencies
run: |
pip install uv
uv pip install --system -r <(uv pip compile --all-extras pyproject.toml)
uv sync --all-extras
pip install .[ci]
quang-ng marked this conversation as resolved.
Show resolved Hide resolved

- name: Start Docker stack
run: |
cp .mockenv .env
docker compose -f .docker/postgres-compose.yaml up -d

- name: Run tests
run: pytest tests
- name: Run tests with coverage
run: |
coverage run -m pytest tests
coverage report
coverage xml

- name: Stop Docker stack
if: always()
run: docker compose -f compose.yaml -f compose.development.override.yaml down
run: docker compose -f .docker/postgres-compose.yaml down

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: coverage.xml
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ _version.py
pdfs/*
test-pdf/*.pdf
dsst_etl.egg-info/*
uv.lock
uv.lock
.coverage
3 changes: 3 additions & 0 deletions .mockenv
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_DB=pdx

# requests will be rejected without user-agent
USER_AGENT=

# metapub; get key from https://pubmed.ncbi.nlm.nih.gov for faster requests
NCBI_API_KEY=

S3_BUCKET_NAME=osm-pdf-uploads
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ docker compose -f .docker/postgres-compose.yaml down -v
pre-commit install

# run the pre-commit hooks on all files
pre-commit run -all
pre-commit run --all-files

# run the tests
pytest
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""update non-null of document in works table

Revision ID: 4a908d10b459
Revises: 360c65a62392
Create Date: 2024-11-21 16:40:56.966690

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '4a908d10b459'
down_revision: Union[str, None] = '360c65a62392'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Mark the document foreign-key columns of ``works`` as nullable.

    ``initial_document_id`` is altered first, then ``primary_document_id``;
    both are declared as existing INTEGER columns.
    """
    # Order matches the auto-generated Alembic commands this loop replaces.
    for column_name in ("initial_document_id", "primary_document_id"):
        op.alter_column(
            "works",
            column_name,
            existing_type=sa.INTEGER(),
            nullable=True,
        )


def downgrade() -> None:
    """Mark the document foreign-key columns of ``works`` as non-nullable again.

    Reverses ``upgrade`` in the opposite order: ``primary_document_id`` first,
    then ``initial_document_id``.
    """
    # NOTE(review): presumably this fails if rows with NULL in either column
    # exist at downgrade time - confirm before running against real data.
    for column_name in ("primary_document_id", "initial_document_id"):
        op.alter_column(
            "works",
            column_name,
            existing_type=sa.INTEGER(),
            nullable=False,
        )
6 changes: 6 additions & 0 deletions dsst_etl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
DSST ETL Package
"""

import logging
import os

from dotenv import load_dotenv
Expand All @@ -10,6 +11,9 @@

load_dotenv()

logger = logging.getLogger(__name__)


def get_db_url():
database_url = (
"postgresql://"
Expand All @@ -20,9 +24,11 @@ def get_db_url():
)
return database_url


def get_db_engine():
    """Return an engine created from the URL that ``get_db_url`` builds."""
    database_url = get_db_url()
    return create_engine(database_url)


engine = get_db_engine()
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Expand Down
4 changes: 2 additions & 2 deletions dsst_etl/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def get_compute_context_id():


def get_bucket_name():
bucket_name = os.getenv('S3_BUCKET_NAME')
bucket_name = os.getenv("S3_BUCKET_NAME")
quang-ng marked this conversation as resolved.
Show resolved Hide resolved
if not bucket_name:
raise ValueError("S3_BUCKET_NAME environment variable is not set")
return bucket_name
return bucket_name
8 changes: 4 additions & 4 deletions dsst_etl/db.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import os
from sqlalchemy.orm import sessionmaker
from sqlalchemy_utils import create_database, database_exists

from dsst_etl import get_db_engine
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from .models import Base



def get_db_session():
engine = get_db_engine()
Session = sessionmaker(bind=engine)
Expand All @@ -16,4 +14,6 @@ def get_db_session():

def init_db():
    """Create the database if it does not exist, then create all ``Base`` tables."""
    db_engine = get_db_engine()
    database_missing = not database_exists(db_engine.url)
    if database_missing:
        create_database(db_engine.url)
    # Emits CREATE TABLE for every model registered on Base's metadata.
    Base.metadata.create_all(db_engine)
13 changes: 0 additions & 13 deletions dsst_etl/extract.py

This file was deleted.

11 changes: 0 additions & 11 deletions dsst_etl/load.py

This file was deleted.

17 changes: 4 additions & 13 deletions dsst_etl/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
from datetime import datetime

from sqlalchemy import Column, DateTime, ForeignKey, Integer, LargeBinary, String, Text
from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy.sql import func

Base = declarative_base()
Expand All @@ -14,12 +11,8 @@ class Works(Base):
id = Column(Integer, primary_key=True)
created_at = Column(DateTime, default=func.now())
modified_at = Column(DateTime, default=func.now(), onupdate=func.now())
initial_document_id = Column(
Integer, ForeignKey("documents.id"), nullable=False
)
primary_document_id = Column(
Integer, ForeignKey("documents.id"), nullable=False
)
initial_document_id = Column(Integer, ForeignKey("documents.id"), nullable=True)
primary_document_id = Column(Integer, ForeignKey("documents.id"), nullable=True)
provenance_id = Column(Integer, ForeignKey("provenance.id"))

# Relationships
Expand All @@ -30,7 +23,7 @@ class Works(Base):

class Documents(Base):
__tablename__ = "documents"

id = Column(Integer, primary_key=True)
hash_data = Column(String, nullable=False, unique=True)
created_at = Column(DateTime, default=func.now())
Expand All @@ -42,7 +35,6 @@ class Documents(Base):
# provenance = relationship("Provenance")



class Provenance(Base):
__tablename__ = "provenance"

Expand All @@ -52,4 +44,3 @@ class Provenance(Base):
compute = Column(Text)
personnel = Column(Text)
comment = Column(Text)

13 changes: 0 additions & 13 deletions dsst_etl/transform.py

This file was deleted.

Loading
Loading