Skip to content

Commit

Permalink
Merge branch 'main' into fix_ncm_dir
Browse files Browse the repository at this point in the history
  • Loading branch information
tricktx authored Aug 22, 2024
2 parents 31291a4 + a768106 commit 644d437
Show file tree
Hide file tree
Showing 4 changed files with 302 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{{
    config(
        alias="auxilio_emergencial",
        schema="br_cgu_beneficios_cidadao",
        materialized="table",
        partition_by={
            "field": "ano",
            "data_type": "int64",
            "range": {"start": 2020, "end": 2021, "interval": 1},
        },
        cluster_by=["sigla_uf", "id_municipio"],
    )
}}

-- Typed view of the staging `auxilio_emergencial` table: every column is
-- safe_cast to its final type, so unparsable values become NULL instead of
-- failing the query.
-- The staging `mes` column is split on '-': the first piece becomes `ano`
-- and the second becomes `mes` (presumably a 'YYYY-MM' string -- confirm
-- against the staging data).
-- NOTE(review): BigQuery integer-range partitioning treats `end` as
-- exclusive, so with start=2020/end=2021 only ano = 2020 gets its own
-- partition -- confirm this matches the years present in the data.
select
    safe_cast(split(mes, '-')[offset(0)] as int64) as ano,
    safe_cast(split(mes, '-')[offset(1)] as int64) as mes,
    safe_cast(sigla_uf as string) as sigla_uf,
    safe_cast(id_municipio as string) as id_municipio,
    safe_cast(nis_beneficiario as string) as nis_beneficiario,
    safe_cast(cpf_beneficiario as string) as cpf_beneficiario,
    safe_cast(nome_beneficiario as string) as nome_beneficiario,
    safe_cast(nis_responsavel as string) as nis_responsavel,
    safe_cast(cpf_responsavel as string) as cpf_responsavel,
    safe_cast(nome_responsavel as string) as nome_responsavel,
    safe_cast(enquadramento as string) as enquadramento,
    safe_cast(parcela as string) as parcela,
    safe_cast(observacao as string) as observacao,
    safe_cast(valor_beneficio as float64) as valor_beneficio,
from `basedosdados-staging.br_cgu_beneficios_cidadao_staging.auxilio_emergencial` as t
69 changes: 69 additions & 0 deletions models/br_cgu_beneficios_cidadao/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -207,3 +207,72 @@ models:
- name: valor_parcela
description: Valor da parcela do benefício
tests: [not_null]
- name: br_cgu_beneficios_cidadao__auxilio_emergencial
description: Microdados do auxílio emergencial de 2020 do Ministério da Cidadania
a nível de indivíduo.
tests:
- dbt_utils.unique_combination_of_columns:
combination_of_columns:
- ano
- mes
- id_municipio
- nis_beneficiario
- cpf_beneficiario
- nome_beneficiario
- nome_responsavel
- enquadramento
- observacao
- parcela
- not_null_proportion_multiple_columns:
at_least: 0.02 # A coluna "observacao" tem poucos valores preenchidos.
columns:
- name: ano
description: Ano
tests:
- relationships:
to: ref('br_bd_diretorios_data_tempo__ano')
field: ano.ano
- name: mes
description: Mês
tests:
- relationships:
to: ref('br_bd_diretorios_data_tempo__mes')
field: mes.mes
- name: sigla_uf
description: Sigla da Unidade da Federação
tests:
- custom_relationships:
to: ref('br_bd_diretorios_brasil__uf')
field: sigla
ignore_values: BR
- name: id_municipio
description: ID Município - IBGE 7 Dígitos
tests:
- relationships:
to: ref('br_bd_diretorios_brasil__municipio')
field: id_municipio
- name: nis_beneficiario
description: Número de Identificação Social (NIS), caso possua
- name: cpf_beneficiario
description: Número no Cadastro de Pessoas Físicas (CPF), caso possua
- name: nome_beneficiario
description: Nome do beneficiário
- name: nis_responsavel
description: Número de Identificação Social (NIS) do responsável pelo beneficiário,
caso possua
- name: cpf_responsavel
description: Número no Cadastro de Pessoas Físicas (CPF) do responsável pelo beneficiário,
caso possua
- name: nome_responsavel
description: Nome do responsável pelo beneficiário, caso possua
- name: enquadramento
description: Identifica se o beneficiário é do grupo Bolsa Família, Inscrito
no Cadastro Único (CadÚnico) ou Não Inscrito no Cadastro Único (ExtraCad)
- name: parcela
description: Número sequencial da parcela disponibilizada
- name: observacao
description: Indica alterações na parcela disponibilizada como, por exemplo,
se foi devolvida ou está retida
- name: valor_beneficio
description: Valor do Benefício
tests: [not_null]
194 changes: 194 additions & 0 deletions models/br_inep_saeb/code/extend_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# Script para alterar o formato da cobertura temporal do dicionario saeb
# O formato será expandido, cada linha será um ano
import basedosdados as bd
import re
import itertools
import pandas as pd
import os

# Working directories of this dataset, relative to the repository root.
ROOT = os.path.join("models", "br_inep_saeb")
INPUT, OUTPUT = (os.path.join(ROOT, folder) for folder in ("input", "output"))

# The output folder may not exist yet; create it up front.
os.makedirs(OUTPUT, exist_ok=True)

# Raw dictionary exported from staging.
df = pd.read_csv(
    os.path.join(INPUT, "staging_br_inep_saeb_dicionario_dicionario.csv")
)

# Keep only rows whose temporal coverage is neither "1" nor "D"
# (presumably placeholders that cannot be expanded into years -- confirm).
df = df.loc[df["cobertura_temporal"].ne("1") & df["cobertura_temporal"].ne("D")]


def parse_temporal_coverage(temporal_coverage: str) -> list[dict[str, int]]:
    """Parse a temporal-coverage string into one dict per comma-separated item.

    Supported item formats:

    * ``(y)``     -> ``{"temporal_unit": y}``
    * ``xxxx``    -> ``{"single_year": xxxx}`` (exactly four digits)
    * ``x(y)``    -> ``{"start_year": x, "temporal_unit": y}``
    * ``x(y)z``   -> ``{"start_year": x, "temporal_unit": y, "end_year": z}``

    Args:
        temporal_coverage: Coverage string, e.g. ``"2007, 2013(2)2017"``.
            Whitespace around each item is ignored.

    Returns:
        A list with one parsed dict per item, in input order.

    Raises:
        ValueError: If an item matches none of the supported formats.
    """

    def parse_common(value: str) -> dict[str, int]:
        # "(y)": only the interval is given. Read the whole number between
        # the parentheses so that multi-digit intervals are not truncated.
        # Anything after the closing parenthesis is ignored.
        if value.startswith("("):
            unit_only = re.match(r"\((\d+)\)", value)
            if unit_only is None:
                raise ValueError(f"Failed to parse {temporal_coverage=}")
            return dict(temporal_unit=int(unit_only.group(1)))

        # Single year, e.g. "2007".
        if re.fullmatch(r"\d{4}", value):
            return dict(single_year=int(value))

        # "x(y)" or "x(y)z", e.g. "2005(2)" or "2013(2)2017".
        ranged = re.fullmatch(r"(\d+)\((\d+)\)(\d+)?", value)
        if ranged is not None:
            start, unit, end = ranged.groups()
            if end is None:
                return dict(start_year=int(start), temporal_unit=int(unit))
            return dict(
                start_year=int(start),
                temporal_unit=int(unit),
                end_year=int(end),
            )

        raise ValueError(f"Failed to parse {temporal_coverage=}")

    # str.split returns the whole string as a single item when there is no
    # comma, so one code path covers both the single and the multi-item case.
    return [parse_common(item.strip()) for item in temporal_coverage.split(",")]


def build_date_range(
    temporal_coverage: dict[str, int], start_year: int, latest_year: int
) -> list[int]:
    """Expand one parsed temporal-coverage dict into an explicit list of years.

    Example:
        ``{'start_year': 2013, 'temporal_unit': 2, 'end_year': 2017}``
        expands to ``[2013, 2015, 2017]``.

    Args:
        temporal_coverage: Dict with some of the keys ``start_year``,
            ``temporal_unit``, ``end_year`` and ``single_year``.
        start_year: First year to use when the dict has no ``start_year``.
        latest_year: Last year to use when the dict has no ``end_year``.

    Returns:
        The covered years in ascending order. The upper bound is inclusive
        and is never exceeded, even when it is not aligned with the step.

    Raises:
        ValueError: If the dict has neither ``temporal_unit`` nor
            ``single_year``.
    """
    if "temporal_unit" in temporal_coverage:
        step = temporal_coverage["temporal_unit"]
        # An end year is only honoured together with an explicit start year;
        # otherwise the range runs up to the table's latest year.
        if "start_year" in temporal_coverage:
            first = temporal_coverage["start_year"]
            last = temporal_coverage.get("end_year", latest_year)
        else:
            first = start_year
            last = latest_year
        # `last + 1` makes the bound inclusive without overshooting it:
        # `last + step` would emit a year past `last` whenever
        # (last - first) is not a multiple of step.
        return list(range(first, last + 1, step))

    if "single_year" in temporal_coverage:
        return [temporal_coverage["single_year"]]

    raise ValueError(f"Unsupported temporal coverage: {temporal_coverage!r}")


# One dictionary slice per table, keyed by table id. The dictionary spells one
# table id as "aluno_ef_2_ano"; the key is corrected to "aluno_ef_2ano".
dfs = {
    table_id.replace("aluno_ef_2_ano", "aluno_ef_2ano"): df_by_table
    for table_id, df_by_table in df.groupby("id_tabela")
}

# Client for the staging metadata API.
backend = bd.Backend(
    graphql_url="https://staging.backend.basedosdados.org/api/v1/graphql"
)


def transform_df(table_id: str, df: pd.DataFrame) -> pd.DataFrame:
    """Add the expanded list of covered years to one table's dictionary rows.

    The table's first registered datetime range is fetched through the
    module-level ``backend`` and used as the default start/latest year when a
    row's ``cobertura_temporal`` does not spell them out.

    Args:
        table_id: Table id inside the ``br_inep_saeb`` dataset.
        df: Dictionary rows of that table; must have a ``cobertura_temporal``
            column.

    Returns:
        A copy of ``df`` with a new ``temporal_coverage_parsed`` column that
        holds, for each row, the list of years it covers.

    Raises:
        Exception: If the backend does not know the table or has no datetime
            range registered for it.
    """
    table_slug = backend._get_table_id_from_name(
        gcp_dataset_id="br_inep_saeb", gcp_table_id=table_id
    )
    if not isinstance(table_slug, str):
        raise Exception(f"Not found slug for {table_id=}")

    coverage_query = """
        query($table_id: ID) {
            allTable(id: $table_id) {
                edges {
                    node {
                        name,
                        coverages {
                            edges {
                                node {
                                    datetimeRanges {
                                        edges {
                                            node {
                                                id,
                                                startYear,
                                                endYear
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    """
    response = backend._execute_query(
        query=coverage_query,
        variables={"table_id": table_slug},
    )
    simplified = backend._simplify_graphql_response(response)

    # Only the first datetime range of the first coverage is used.
    try:
        payload = simplified["allTable"][0]["coverages"][0]["datetimeRanges"][0]
    except (KeyError, IndexError) as err:
        raise Exception(f"No datetime range found for {table_id=}") from err

    latest_year = payload["endYear"]
    start_year = payload["startYear"]

    def expand_years(coverage: str) -> list[int]:
        # A coverage string may hold several comma-separated items; flatten
        # the years of all of them into a single list.
        return list(
            itertools.chain.from_iterable(
                build_date_range(
                    item, start_year=start_year, latest_year=latest_year
                )
                for item in parse_temporal_coverage(coverage)
            )
        )

    d = df.copy()
    d["temporal_coverage_parsed"] = d["cobertura_temporal"].apply(expand_years)
    return d


# Expand the temporal coverage of every table's dictionary slice.
new_dict = {
    table_id: transform_df(table_id, df_by_table)
    for (table_id, df_by_table) in dfs.items()
}

OUTPUT_FILE = os.path.join(OUTPUT, "dicionario.csv")

# Stack all tables, then turn each list of years into one row per year. The
# exploded column takes over the original "cobertura_temporal" name.
dict_output = (
    pd.concat(new_dict.values())
    .drop(columns=["cobertura_temporal"])
    .explode("temporal_coverage_parsed")
    .rename(columns={"temporal_coverage_parsed": "cobertura_temporal"}, errors="raise")
)

# The rows still carry the misspelled table id; only the dict keys of `dfs`
# were corrected.
dict_output["id_tabela"] = dict_output["id_tabela"].replace(
    {"aluno_ef_2_ano": "aluno_ef_2ano"}
)

dict_output.to_csv(OUTPUT_FILE, index=False)

# Upload the expanded dictionary, replacing any existing table and stored data.
tb = bd.Table(dataset_id="br_inep_saeb", table_id="dicionario")

tb.create(
    OUTPUT_FILE,
    if_table_exists="replace",
    if_storage_data_exists="replace",
)
6 changes: 5 additions & 1 deletion models/br_me_caged/br_me_caged__dicionario.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
{{ config(alias="dicionario", schema="br_me_caged") }}
select
safe_cast(id_tabela as string) id_tabela,
safe_cast(
replace(
id_tabela, "microdados_movimentacoes", "microdados_movimentacao"
) as string
) id_tabela,
safe_cast(nome_coluna as string) nome_coluna,
safe_cast(chave as string) chave,
safe_cast(cobertura_temporal as string) cobertura_temporal,
Expand Down

0 comments on commit 644d437

Please sign in to comment.