Skip to content

Commit

Permalink
feat: add ocr radar dump
Browse files Browse the repository at this point in the history
  • Loading branch information
d116626 committed Jun 7, 2024
1 parent f3c3705 commit 222ab2e
Show file tree
Hide file tree
Showing 12 changed files with 4,425 additions and 8 deletions.
32 changes: 27 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,36 @@
# Build arguments
ARG PYTHON_VERSION=3.10-slim
ARG PYTHON_VERSION=3.10-slim-buster

# Get Oracle Instant Client
FROM curlimages/curl:7.81.0 as curl-step
ARG ORACLE_INSTANT_CLIENT_URL=https://download.oracle.com/otn_software/linux/instantclient/215000/instantclient-basic-linux.x64-21.5.0.0.0dbru.zip
RUN curl -sSLo /tmp/instantclient.zip $ORACLE_INSTANT_CLIENT_URL

# Unzip Oracle Instant Client
FROM ubuntu:18.04 as unzip-step
COPY --from=curl-step /tmp/instantclient.zip /tmp/instantclient.zip
RUN apt-get update && \

Check failure on line 12 in Dockerfile

View workflow job for this annotation

GitHub Actions / Lint

DL3008 warning: Pin versions in apt get install. Instead of `apt-get install <package>` use `apt-get install <package>=<version>`
apt-get install --no-install-recommends -y unzip && \
rm -rf /var/lib/apt/lists/* && \
unzip /tmp/instantclient.zip -d /tmp

# Start Python image
FROM python:${PYTHON_VERSION}

# Install git
# Install a few dependencies and setup oracle instant client
WORKDIR /opt/oracle
COPY --from=unzip-step /tmp/instantclient_21_5 /opt/oracle/instantclient_21_5
RUN apt-get update && \

Check failure on line 23 in Dockerfile

View workflow job for this annotation

GitHub Actions / Lint

DL3008 warning: Pin versions in apt get install. Instead of `apt-get install <package>` use `apt-get install <package>=<version>`

Check failure on line 23 in Dockerfile

View workflow job for this annotation

GitHub Actions / Lint

DL4006 warning: Set the SHELL option -o pipefail before RUN with a pipe in it. If you are using /bin/sh in an alpine image or if your shell is symlinked to busybox then consider explicitly setting your SHELL to /bin/ash, or disable this check
apt-get install -y git && \
apt-get install --no-install-recommends -y git curl gnupg2 libaio1 && \
curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \
echo "deb [arch=amd64,arm64,armhf] https://packages.microsoft.com/debian/12/prod bookworm main" > /etc/apt/sources.list.d/mssql-release.list && \
apt-get update && \
ACCEPT_EULA=Y apt-get install --no-install-recommends -y ffmpeg libsm6 libxext6 msodbcsql17 openssl unixodbc-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
rm -rf /var/lib/apt/lists/* && \
sh -c "echo /opt/oracle/instantclient_21_5 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
ldconfig
COPY ./openssl.cnf /etc/ssl/openssl.cnf

# Setting environment with prefect version
ARG PREFECT_VERSION=1.4.1
Expand All @@ -23,4 +45,4 @@ RUN python3 -m pip install --no-cache-dir -U "pip>=21.2.4" "prefect==$PREFECT_VE
# Install requirements
WORKDIR /app
COPY . .
RUN python3 -m pip install --prefer-binary --no-cache-dir -U .
RUN python3 -m pip install --prefer-binary --no-cache-dir -U .
2 changes: 1 addition & 1 deletion pipelines/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class constants(Enum):
######################################
# Agent labels
######################################
# EXAMPLE_AGENT_LABEL = "example_agent"
RJ_CETRIO_AGENT_LABEL = "rj-cetrio"

######################################
# Other constants
Expand Down
3 changes: 2 additions & 1 deletion pipelines/flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
"""
Imports all flows for every project so we can register all of them.
"""
from pipelines.exemplo import * # noqa
from pipelines.ocr_radar import * # noqa
from pipelines.templates import * # noqa
2 changes: 2 additions & 0 deletions pipelines/ocr_radar/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
from pipelines.ocr_radar.dump_db_radar.flows import * # noqa: F401, F403
Empty file.
45 changes: 45 additions & 0 deletions pipelines/ocr_radar/dump_db_radar/flows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
"""
Database dumping flows for cetrio project.
"""

from copy import deepcopy

from prefect.run_configs import KubernetesRun
from prefect.storage import GCS
from prefeitura_rio.pipelines_templates.dump_db.flows import flow as dump_sql_flow
from prefeitura_rio.pipelines_utils.prefect import set_default_parameters
from prefeitura_rio.pipelines_utils.state_handlers import (
handler_initialize_sentry,
handler_inject_bd_credentials,
)

from pipelines.constants import constants
from pipelines.ocr_radar.dump_db_radar.schedules import (
ocr_radar_monthly_update_schedule,
)

# Work on a deep copy of the imported dump_db template flow, so the attribute
# assignments below are made on this copy rather than on `dump_sql_flow`.
dump_sql_ocr_radar_flow = deepcopy(dump_sql_flow)
dump_sql_ocr_radar_flow.name = "CETRIO: ocr radar - Ingerir tabelas de banco SQL"
dump_sql_ocr_radar_flow.state_handlers = [
    handler_inject_bd_credentials,
    handler_initialize_sentry,
]
# Flow code is stored in the GCS bucket named in the project constants and run
# on Kubernetes with the project image, restricted to the CETRIO agent label.
dump_sql_ocr_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value)
dump_sql_ocr_radar_flow.run_config = KubernetesRun(
    image=constants.DOCKER_IMAGE.value,
    labels=[constants.RJ_CETRIO_AGENT_LABEL.value],
)

# Default parameter values for the CETRIO OCR-radar dump flow.
# NOTE(review): "db_database" and "dataset_id" differ from the values passed to
# generate_dump_db_schedules in the schedules module ("DBOCR_2024" and
# "ocr_radar") -- confirm which pair is intended.
ocr_radar_default_parameters = dict(
    db_database="DWOCR_Staging",
    db_host="10.39.64.50",
    db_port="1433",
    db_type="sql_server",
    infisical_secret_path="/db-ocr-radar",
    dataset_id="recursos_humanos_ocr_radar",
)
# Hand the flow and `ocr_radar_default_parameters` to set_default_parameters;
# its return value is rebound to the same module-level name.
dump_sql_ocr_radar_flow = set_default_parameters(
dump_sql_ocr_radar_flow, default_parameters=ocr_radar_default_parameters
)

# Attach the schedule imported from pipelines.ocr_radar.dump_db_radar.schedules.
dump_sql_ocr_radar_flow.schedule = ocr_radar_monthly_update_schedule
51 changes: 51 additions & 0 deletions pipelines/ocr_radar/dump_db_radar/schedules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
# flake8: noqa: E501
"""
Schedules for the database dump pipeline
"""

from datetime import datetime, timedelta

import pytz
from prefect.schedules import Schedule
from prefeitura_rio.pipelines_utils.io import untuple_clocks as untuple
from prefeitura_rio.pipelines_utils.prefect import generate_dump_db_schedules

from pipelines.constants import constants

#####################################
#
# ocr_radar Schedules
#
#####################################

# Settings for each query to dump, passed as `table_parameters` when the
# clocks are generated. The single "equipamento" entry selects every column
# of [DBOCR_2024].[dbo].[Equipamento].
ocr_radar_queries = {
    "equipamento": dict(
        materialize_after_dump=True,
        biglake_table=True,
        materialization_mode="prod",
        dump_mode="overwrite",
        # Spelled out line by line; the text starts and ends with a newline.
        execute_query=(
            "\n"
            "SELECT\n"
            "*\n"
            "FROM [DBOCR_2024].[dbo].[Equipamento]\n"
        ),
    ),
}

# Generate the schedule clocks from `ocr_radar_queries`.
#
# The start date is made timezone-aware with pytz's `localize()` instead of
# `tzinfo=`: passing a pytz timezone straight to the datetime constructor
# attaches the zone's historical local-mean-time offset (-03:06 for
# America/Sao_Paulo) rather than the -03:00 offset in force on that date,
# which shifts every scheduled run by a few minutes.
# NOTE(review): the schedule built below is named "monthly" but the interval
# is 100 days -- confirm which one is intended.
ocr_radar_clocks = generate_dump_db_schedules(
    interval=timedelta(days=100),
    start_date=pytz.timezone("America/Sao_Paulo").localize(datetime(2022, 11, 9, 22, 30)),
    labels=[
        constants.RJ_CETRIO_AGENT_LABEL.value,
    ],
    db_database="DBOCR_2024",
    db_host="10.39.64.50",
    db_port="1433",
    db_type="sql_server",
    dataset_id="ocr_radar",
    infisical_secret_path="/db-ocr-radar",
    table_parameters=ocr_radar_queries,
)

# Wrap the generated clocks (passed through `untuple`) in a Prefect Schedule.
ocr_radar_monthly_update_schedule = Schedule(clocks=untuple(ocr_radar_clocks))
2 changes: 2 additions & 0 deletions pipelines/templates/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
from pipelines.templates.run_dbt_model.flows import * # noqa: F401, F403
Empty file.
36 changes: 36 additions & 0 deletions pipelines/templates/run_dbt_model/flows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
"""
MATERIALIZA MODELOS DO DBT.
"""

from copy import deepcopy

from prefect.run_configs import KubernetesRun
from prefect.storage import GCS
from prefeitura_rio.pipelines_templates.run_dbt_model.flows import (
templates__run_dbt_model__flow,
)
from prefeitura_rio.pipelines_utils.prefect import set_default_parameters
from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials

from pipelines.constants import constants

# Work on a deep copy of the imported run_dbt_model template flow, so the
# attribute assignments below are made on this copy rather than on the import.
# NOTE(review): the name says "smas" while the agent label is the CETRIO one,
# and no `.name` is assigned to the copy -- confirm both are intended.
templates__run_dbt_model_smas__flow = deepcopy(templates__run_dbt_model__flow)
templates__run_dbt_model_smas__flow.state_handlers = [
    handler_inject_bd_credentials,
]
# Flow code is stored in the GCS bucket named in the project constants and run
# on Kubernetes with the project image, restricted to the CETRIO agent label.
templates__run_dbt_model_smas__flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value)
templates__run_dbt_model_smas__flow.run_config = KubernetesRun(
    image=constants.DOCKER_IMAGE.value,
    labels=[constants.RJ_CETRIO_AGENT_LABEL.value],
)

# Default parameter values for the dbt materialization flow; each value is
# simply the parameter's own name.
# NOTE(review): these look like placeholders -- confirm before relying on them.
templates_run_dbt_model_smas_default_parameters = dict(
    dataset_id="dataset_id",
    table_id="table_id",
)
# Hand the flow and its default-parameter dict to set_default_parameters;
# its return value is rebound to the same module-level name.
templates__run_dbt_model_smas__flow = set_default_parameters(
templates__run_dbt_model_smas__flow,
default_parameters=templates_run_dbt_model_smas_default_parameters,
)
Loading

0 comments on commit 222ab2e

Please sign in to comment.