Merge branch 'main' into fix_ncm_dir

basedosdados · Aug 22, 2024 · 644d437 · 644d437
2 parents 31291a4 + a768106
commit 644d437
Show file tree

Hide file tree

Showing 4 changed files with 302 additions and 1 deletion.
diff --git a/models/br_cgu_beneficios_cidadao/br_cgu_beneficios_cidadao__auxilio_emergencial.sql b/models/br_cgu_beneficios_cidadao/br_cgu_beneficios_cidadao__auxilio_emergencial.sql
@@ -0,0 +1,34 @@
{{
    config(
        alias="auxilio_emergencial",
        schema="br_cgu_beneficios_cidadao",
        materialized="table",
        partition_by={
            "field": "ano",
            "data_type": "int64",
            "range": {
                "start": 2020,
                "end": 2021,
                "interval": 1,
            },
        },
        cluster_by=["sigla_uf", "id_municipio"],
    )
}}

-- Typed view of the staging table for the emergency aid (auxilio emergencial)
-- microdata: every column is cast explicitly with safe_cast.
--
-- The staging column `mes` is split on '-' into `ano` (first piece) and `mes`
-- (second piece); presumably it is formatted as 'YYYY-MM' -- confirm against
-- the staging data. safe_offset is used so that a value without the expected
-- pieces yields NULL instead of aborting the whole query, in line with the
-- safe_cast calls around it.
--
-- NOTE(review): BigQuery integer-range partitioning treats `end` as exclusive,
-- so with end = 2021 only ano = 2020 gets a dedicated partition. Confirm that
-- this is intended if rows for 2021 are expected.
select
    safe_cast(split(mes, '-')[safe_offset(0)] as int64) as ano,
    safe_cast(split(mes, '-')[safe_offset(1)] as int64) as mes,
    safe_cast(sigla_uf as string) as sigla_uf,
    safe_cast(id_municipio as string) as id_municipio,
    safe_cast(nis_beneficiario as string) as nis_beneficiario,
    safe_cast(cpf_beneficiario as string) as cpf_beneficiario,
    safe_cast(nome_beneficiario as string) as nome_beneficiario,
    safe_cast(nis_responsavel as string) as nis_responsavel,
    safe_cast(cpf_responsavel as string) as cpf_responsavel,
    safe_cast(nome_responsavel as string) as nome_responsavel,
    safe_cast(enquadramento as string) as enquadramento,
    safe_cast(parcela as string) as parcela,
    safe_cast(observacao as string) as observacao,
    safe_cast(valor_beneficio as float64) as valor_beneficio,
from `basedosdados-staging.br_cgu_beneficios_cidadao_staging.auxilio_emergencial` as t
diff --git a/models/br_cgu_beneficios_cidadao/schema.yml b/models/br_cgu_beneficios_cidadao/schema.yml
@@ -207,3 +207,72 @@ models:
  - name: valor_parcela
  description: Valor da parcela do benefício
  tests: [not_null]
+ - name: br_cgu_beneficios_cidadao__auxilio_emergencial
+ description: Microdados do auxílio emergencial de 2020 do Ministério da Cidadania
+ a nível de indivíduo.
+ tests:
+ - dbt_utils.unique_combination_of_columns:
+ combination_of_columns:
+ - ano
+ - mes
+ - id_municipio
+ - nis_beneficiario
+ - cpf_beneficiario
+ - nome_beneficiario
+ - nome_responsavel
+ - enquadramento
+ - observacao
+ - parcela
+ - not_null_proportion_multiple_columns:
+ at_least: 0.02 # A coluna "observacao" tem poucos valores preenchidos.
+ columns:
+ - name: ano
+ description: Ano
+ tests:
+ - relationships:
+ to: ref('br_bd_diretorios_data_tempo__ano')
+ field: ano.ano
+ - name: mes
+ description: Mês
+ tests:
+ - relationships:
+ to: ref('br_bd_diretorios_data_tempo__mes')
+ field: mes.mes
+ - name: sigla_uf
+ description: Sigla da Unidade da Federação
+ tests:
+ - custom_relationships:
+ to: ref('br_bd_diretorios_brasil__uf')
+ field: sigla
+ ignore_values: BR
+ - name: id_municipio
+ description: ID Município - IBGE 7 Dígitos
+ tests:
+ - relationships:
+ to: ref('br_bd_diretorios_brasil__municipio')
+ field: id_municipio
+ - name: nis_beneficiario
+ description: Número de Identificação Social (NIS), caso possua
+ - name: cpf_beneficiario
+ description: Número no Cadastro de Pessoas Físicas (CPF), caso possua
+ - name: nome_beneficiario
+ description: Nome do beneficiário
+ - name: nis_responsavel
+ description: Número de Identificação Social (NIS) do responsável pelo beneficiário,
+ caso possua
+ - name: cpf_responsavel
+ description: Número no Cadastro de Pessoas Físicas (CPF) do responsável pelo beneficiário,
+ caso possua
+ - name: nome_responsavel
+ description: Nome do responsável pelo beneficiário, caso possua
+ - name: enquadramento
+ description: Identifica se o beneficiário é do grupo Bolsa Família, Inscrito
+ no Cadastro Único (CadÚnico) ou Não Inscrito no Cadastro Único (ExtraCad)
+ - name: parcela
+ description: Número sequencial da parcela disponibilizada
+ - name: observacao
+ description: Indica alterações na parcela disponibilizada como, por exemplo,
+ se foi devolvida ou está retida
+ - name: valor_beneficio
+ description: Valor do Benefício
+ tests: [not_null]
diff --git a/models/br_inep_saeb/code/extend_dict.py b/models/br_inep_saeb/code/extend_dict.py
@@ -0,0 +1,194 @@
+# Script para alterar o formato da cobertura temporal do dicionario saeb
+# O formato será expandido, cada linha será um ano
+import basedosdados as bd
+import re
+import itertools
+import pandas as pd
+import os
+
# Working folders for this dataset: the raw dictionary is read from INPUT
# and the expanded dictionary is written to OUTPUT.
ROOT = os.path.join("models", "br_inep_saeb")
INPUT = os.path.join(ROOT, "input")
OUTPUT = os.path.join(ROOT, "output")

# Make sure the output folder exists before anything is written to it.
os.makedirs(OUTPUT, exist_ok=True)

df = pd.read_csv(
    os.path.join(INPUT, "staging_br_inep_saeb_dicionario_dicionario.csv")
)

# Keep only the rows whose temporal coverage is neither "1" nor "D".
_coverage = df["cobertura_temporal"]
df = df.loc[_coverage.ne("1") & _coverage.ne("D")]
+
+
def parse_temporal_coverage(temporal_coverage: str) -> list[dict[str, int]]:
    """Parse a temporal-coverage expression into a list of dicts.

    The expression is a comma-separated list of tokens; each token takes one
    of the following forms:

    * ``(y)``     -> ``{"temporal_unit": y}``
    * ``xxxx``    -> ``{"single_year": xxxx}`` (any four-character token)
    * ``x(y)``    -> ``{"start_year": x, "temporal_unit": y}``
    * ``x(y)z``   -> ``{"start_year": x, "temporal_unit": y, "end_year": z}``

    Args:
        temporal_coverage: Raw expression, e.g. ``"2005, 2013(2)2017"``.

    Returns:
        One dict per comma-separated token, in the original order.

    Raises:
        ValueError: if a four-character token is not an integer.
        Exception: if a token matches none of the supported forms.
    """

    def parse_common(value: str) -> dict[str, int]:
        # (y): step only. The whole parenthesised number is read, so
        # multi-digit steps such as "(10)" are supported.
        unit_only = re.match(r"\((\d+)\)", value)
        if unit_only is not None:
            return dict(temporal_unit=int(unit_only.group(1)))

        # single year
        if len(value) == 4:
            return dict(single_year=int(value))

        # x(y) or x(y)z
        if "(" in value:
            pattern_temporal_unit = r"\((\d+)\)"
            # Split around the step and drop empty strings.
            parts: list[str] = [
                i for i in re.split(pattern_temporal_unit, value) if len(i) > 0
            ]

            # x(y), e.g. 2005(2)
            if len(parts) == 2:
                return dict(start_year=int(parts[0]), temporal_unit=int(parts[1]))

            # x(y)z, e.g. 2013(2)2017
            if len(parts) == 3:
                return dict(
                    start_year=int(parts[0]),
                    temporal_unit=int(parts[1]),
                    end_year=int(parts[2]),
                )

        # Anything else (empty token, unbalanced parenthesis, several steps...)
        # is reported together with the full original expression.
        raise Exception(f"Failed to parse {temporal_coverage=}")

    if "," in temporal_coverage:
        return [parse_common(i.strip()) for i in temporal_coverage.split(",")]
    return [parse_common(temporal_coverage)]
+
+
+# Examples:
+# {'start_year': 2013, 'temporal_unit': 2, 'end_year': 2017}
def build_date_range(
    temporal_coverage: dict[str, int], start_year: int, latest_year: int
) -> list[int]:
    """Expand one parsed temporal-coverage dict into an explicit list of years.

    Args:
        temporal_coverage: Dict with some of the keys ``start_year``,
            ``temporal_unit``, ``end_year`` and ``single_year``.
        start_year: First year to use when the dict has no ``start_year``.
        latest_year: Last year to use when the dict has no ``start_year`` /
            ``end_year`` pair.

    Returns:
        The years from the first to the last year (inclusive) stepping by
        ``temporal_unit``, or a one-element list for ``single_year``.

    Raises:
        ValueError: if the dict has neither ``temporal_unit`` nor
            ``single_year``.
    """
    if "temporal_unit" in temporal_coverage:
        step = temporal_coverage["temporal_unit"]
        if "start_year" in temporal_coverage:
            first = temporal_coverage["start_year"]
            last = temporal_coverage.get("end_year", latest_year)
        else:
            # Without an explicit start, the caller-supplied bounds are used.
            first, last = start_year, latest_year
        # `last + 1` makes the end inclusive without ever going past it;
        # `last + step` would add an extra year whenever (last - first) is
        # not a multiple of the step.
        return list(range(first, last + 1, step))

    if "single_year" in temporal_coverage:
        return [temporal_coverage["single_year"]]

    raise ValueError(f"Unsupported temporal coverage: {temporal_coverage!r}")
+
+
# One dataframe per table id. The dictionary spells one table id as
# "aluno_ef_2_ano"; the key is rewritten to "aluno_ef_2ano".
dfs = {
    table_id.replace("aluno_ef_2_ano", "aluno_ef_2ano"): df_by_table
    for table_id, df_by_table in df.groupby("id_tabela")
}

# Client pointed at the staging GraphQL endpoint.
backend = bd.Backend(
    graphql_url="https://staging.backend.basedosdados.org/api/v1/graphql",
)
+
+
def transform_df(table_id: str, df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``temporal_coverage_parsed`` column.

    The table is looked up in the backend by its name in the ``br_inep_saeb``
    dataset, and the ``startYear`` / ``endYear`` of its first datetime range
    are used as the default bounds when each ``cobertura_temporal`` value is
    expanded into a list of years.

    Args:
        table_id: Table name inside the ``br_inep_saeb`` dataset.
        df: Dictionary rows for that table; must have a
            ``cobertura_temporal`` column.

    Returns:
        A copy of ``df`` with the extra ``temporal_coverage_parsed`` column
        holding a list of years per row.

    Raises:
        Exception: if the backend does not return a string id for the table.
    """
    d = df.copy()
    table_slug = backend._get_table_id_from_name(
        gcp_dataset_id="br_inep_saeb", gcp_table_id=table_id
    )
    if not isinstance(table_slug, str):
        raise Exception(f"Not found slug for {table_id=}")

    response = backend._execute_query(
        query="""
        query($table_id: ID) {
            allTable(id: $table_id) {
                edges {
                    node {
                        name,
                        coverages {
                            edges {
                                node {
                                    datetimeRanges {
                                        edges {
                                            node {
                                                id,
                                                startYear,
                                                endYear
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        """,
        variables={"table_id": table_slug},
    )

    # Only the first table, its first coverage and that coverage's first
    # datetime range are used.
    payload = backend._simplify_graphql_response(response)["allTable"][0]["coverages"][
        0
    ]["datetimeRanges"][0]

    latest_year = payload["endYear"]
    start_year = payload["startYear"]

    # Each raw value may hold several comma-separated pieces; every piece is
    # expanded into years and the results are flattened into one list.
    d["temporal_coverage_parsed"] = d["cobertura_temporal"].apply(
        lambda x: list(
            itertools.chain(
                *[  # type: ignore
                    build_date_range(i, start_year=start_year, latest_year=latest_year)
                    for i in parse_temporal_coverage(x)
                ]
            )
        )
    )
    return d
+
+
# Expand the temporal coverage of every table's dictionary rows.
new_dict = {
    table_id: transform_df(table_id, df_by_table)
    for (table_id, df_by_table) in dfs.items()
}

OUTPUT_FILE = os.path.join(OUTPUT, "dicionario.csv")

# One row per (original row, year): the parsed list replaces the raw
# expression and is exploded so that each year gets its own line.
dict_output = (
    pd.concat(new_dict.values())
    .drop(columns=["cobertura_temporal"])
    .explode("temporal_coverage_parsed")
    .rename(columns={"temporal_coverage_parsed": "cobertura_temporal"}, errors="raise")
)

# The grouping keys were corrected earlier, but the column values inside the
# frames still carry the misspelled table id, so it is fixed here as well.
dict_output["id_tabela"] = dict_output["id_tabela"].replace(
    {"aluno_ef_2_ano": "aluno_ef_2ano"}
)

dict_output.to_csv(OUTPUT_FILE, index=False)

# Upload the expanded dictionary, replacing any existing table and stored data.
tb = bd.Table(dataset_id="br_inep_saeb", table_id="dicionario")

tb.create(
    OUTPUT_FILE,
    if_table_exists="replace",
    if_storage_data_exists="replace",
)
diff --git a/models/br_me_caged/br_me_caged__dicionario.sql b/models/br_me_caged/br_me_caged__dicionario.sql
@@ -1,6 +1,10 @@
 {{ config(alias="dicionario", schema="br_me_caged") }}
 select
- safe_cast(id_tabela as string) id_tabela,
+ safe_cast(
+ replace(
+ id_tabela, "microdados_movimentacoes", "microdados_movimentacao"
+ ) as string
+ ) id_tabela,
  safe_cast(nome_coluna as string) nome_coluna,
  safe_cast(chave as string) chave,
  safe_cast(cobertura_temporal as string) cobertura_temporal,