diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledAntesDepois.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledAntesDepois.jpg new file mode 100644 index 0000000..4a9eb67 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledAntesDepois.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledBeforeVsAfter_regionaltrend.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledBeforeVsAfter_regionaltrend.jpg new file mode 100644 index 0000000..9fab4f6 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledBeforeVsAfter_regionaltrend.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledDistance.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledDistance.jpg new file mode 100644 index 0000000..b4b7bc8 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledDistance.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledHeatmap.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledHeatmap.jpg new file mode 100644 index 0000000..53d9800 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledHeatmap.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledNMDS.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledNMDS.jpg new file mode 100644 index 0000000..ba301b9 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledNMDS.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_Scaledbiplot.jpg b/vale/DENGBR/Figuras/DENGBR_Scaledbiplot.jpg new file mode 100644 index 0000000..0099996 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_Scaledbiplot.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledbyYear.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledbyYear.jpg new file mode 100644 index 0000000..3bf612d Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledbyYear.jpg differ diff --git a/vale/DENGBR/index.html.html b/vale/DENGBR/index.html.html new file mode 100644 index 0000000..7e26f71 --- /dev/null +++ b/vale/DENGBR/index.html.html @@ -0,0 +1,1052 @@ + + fase2 + + + + + + + + + + + + + +
+ + +

Base de dados secundários

+ +
+

Transmissão por vetores

+

Arboviroses

+

Dengue

+
Download dos dados
+

Os arquivos dos anos 2014 ao 2023 foram obtidos do SINAN através do seu sítio de FTP localizado em: ftp://ftp.datasus.gov.br/dissemin/publicos/SINAN/DADOS/ e identificados pelo prefixo DENGBR.

+
Preparação dos dados
+

O seguinte script cria o conjunto de variáveis a ser utilizada para a exploração, e os salva em um arquivo .Rdata que pode ser posteriormente carregado com a função load().

+
#install.packages("devtools")
+#library(devtools)
+#install_github("danicat/read.dbc")
+library(read.dbc) # ler dbc
+library(tidyverse) #contem vários pacotes para diversas funções
+library(foreign) #ler DBF
+
+
+########################################################################################################
+########################################################################################################
+setwd("/data/gabriel/Trabalhos/Vale/DENG/")                                                #############
+metadata = read.table("/data/gabriel/Trabalhos/Vale/metadata.txt", header = T, sep = "\t") #############
+city_codes <- metadata$CODMUN6                                                             #############
+disease_prefix = "DENGBR"                                                                  #############
+########################################################################################################
+########################################################################################################
+
+# Define folder path and initialize the list
+folder_path <- getwd()
+years <- 14:23
+
+# Create a table with disease incidence per 10k population for each year
+year_range <- as.character(2014:2023)
+
+# Find existing year columns in the dataset
+existing_years <- intersect(year_range, colnames(final_table))
+
+# Dynamically create the incidence table
+incidence_table <- final_table %>%
+  # Select necessary columns along with the existing years
+  select(NMMUNICIPIO, NMREG, AFETADO, CENSO2022, all_of(existing_years)) %>%
+  # Calculate incidence per 10k population for each available year
+  mutate(across(all_of(existing_years), ~ (.x / CENSO2022) * 10000, .names = "inc_{col}")) %>%
+  # Select relevant columns for the final table
+  select(NMMUNICIPIO, NMREG, AFETADO, starts_with("inc_"))
+
+# Rename columns for readability
+incidence_table <- incidence_table %>%
+  rename_with(~ sub("inc_", "", .), starts_with("inc_"))
+
+# View the resulting table and scale
+matrix = incidence_table[,4:ncol(incidence_table)]
+matrix_scaled_by_row <- t(scale(t(matrix)))  # Transpose, scale rows, then transpose back
+
+incidence_table = cbind(incidence_table[,1:3], matrix_scaled_by_row)
+
+# Assuming incidence_table is your data frame from previous steps
+
+# Reshape the data from wide to long format for plotting
+incidence_long <- incidence_table %>%
+  pivot_longer(
+    cols = all_of(existing_years), # Use only the columns that exist
+    names_to = "Year",             # Name of the new column for years
+    values_to = "Incidence"        # Name of the new column for incidence values
+  )
+
+save(incidence_long, incidence_table, final_table, file = paste0(disease_prefix,"_variables.Rdata"))
+
Análises descritivas
+
Boxplots de dispersão
+

Criar boxplots que mostrem a dispersão do Z-score ao longos dos anos, e separando o períodos em antes e depois do rompimento da barragem.

+
library(tidyverse)
+library(ggplot2)
+
+########################################################################################################
+########################################################################################################
+setwd("/data/gabriel/Trabalhos/Vale/DENG/")                                                #############
+metadata = read.table("/data/gabriel/Trabalhos/Vale/metadata.txt", header = T, sep = "\t") #############
+city_codes <- metadata$CODMUN6                                                             #############
+disease_prefix = "DENGBR"                                                                  #############
+########################################################################################################
+########################################################################################################
+
+# Load previously processed data
+load(paste0(disease_prefix,"_variables.Rdata"))
+
+
+# Create boxplot with incidence per year
+pdf(file = paste0(disease_prefix, "_ScaledbyYear.pdf"), width = 10, height = 10)
+
+ggplot(incidence_long, aes(x = Year, y = Incidence, fill = AFETADO)) +
+  geom_boxplot(outlier.shape = NA, width = 0.5) +  # Adjust box width
+  geom_point(position = position_jitterdodge(jitter.width = 0.15, dodge.width = 0.5), shape = 21, size = 2, stroke = 0.5, alpha = 0.5) +  # Add points with jitter dodge
+  facet_wrap(~ NMREG, ncol = 2) +
+  scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+  labs(
+    title = "Distribuição da Incidência de Doença por Região de Saúde",
+    x = "Ano",
+    y = "Incidência por 10000 Habitates",
+    fill = "Município Afetado"
+  ) +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+dev.off()
+
+
+# Categorize the years as "Before" and "After"
+incidence_long <- incidence_long %>%
+  mutate(
+    Period = case_when(
+      Year %in% 2014:2018 ~ "Antes",
+      Year %in% 2019:2023 ~ "Depois"
+    )
+  ) %>%
+  filter(!is.na(Period))  # Remove rows outside of the specified range
+
+# Create the plot showing distribution per city before and after
+
+pdf(file = paste0(disease_prefix, "_ScaledAntesDepois.pdf"), width = 10, height = 10)
+
+ggplot(incidence_long, aes(x = NMMUNICIPIO, y = Incidence, fill = AFETADO)) +
+  geom_boxplot(aes(group = interaction(NMMUNICIPIO, Period), fill = AFETADO), 
+               position = position_dodge(width = 0.8), width = 0.45, outlier.shape = NA) +  # Adjust box width and dodge
+  geom_point(aes(shape = Period, fill = AFETADO, group = Period), 
+             position = position_dodge(width = 0.8), 
+             size = 2, stroke = 0.5, alpha = 0.5) +  # Map Period to shape for distinction
+  facet_wrap(~ NMREG, ncol = 2, scales = "free_x") +
+  scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+  scale_shape_manual(values = c("Antes" = 21, "Depois" = 22)) +  # Set shapes for periods
+  labs(
+    title = "Comparação da Incidência de Doenças antes e depois do evento",
+    x = "Minucípio",
+    y = "Incidência por 10000 Habitantes",
+    fill = "Município Afetado",
+    shape = "Período"
+  ) +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+dev.off()
+

By Year
+By Moment

+
Busca por padrões
+

A busca por padrões envolve o cálculo de distâncias para criar agrupamentos. A distância utilizada foi a correlação de Pearson, uma vez que o objetivo é identificar cidades que seguem o mesmo padrão da incidência ao longo dos anos.
+O índice de Calinski-Harabasz foi utilizado para definir o número ótimo de cluster. Quando o número era maior que 10 (indicando falta de padrão), optamos para escolher visualmente. |
+Neste caso, o número de agrupamentos foi 5.

+
library(tidyverse)
+library(ggplot2)
+library(pheatmap)
+library(vegan)
+library(ComplexHeatmap)
+library(ggrepel)
+
+########################################################################################################
+########################################################################################################
+setwd("/data/gabriel/Trabalhos/Vale/DENG/")                                                #############
+metadata = read.table("/data/gabriel/Trabalhos/Vale/metadata.txt", header = T, sep = "\t") #############
+city_codes <- metadata$CODMUN6                                                             #############
+disease_prefix = "DENGBR"                                                                  #############
+########################################################################################################
+########################################################################################################
+
+# Load previously processed data
+load(paste0(disease_prefix,"_variables.Rdata"))
+
+matrix = data.frame(incidence_table[,4:ncol(incidence_table)], row.names = incidence_table$NMMUNICIPIO)
+row_anot = data.frame(incidence_table[,2:3], row.names = incidence_table$NMMUNICIPIO)
+
+# Calculate the distance matrix
+data.dist <- as.dist(1 - cor(t(matrix)))
+
+# Perform hierarchical clustering
+hc <- hclust(data.dist, method = "complete")
+
+# Calculate the CH Index for different numbers of clusters
+ch_index <- numeric(10)
+for (k in 2:10) {
+  clustering <- cutree(hc, k = k)
+  ch_index[k] <- cluster.stats(d = as.dist(data.dist), clustering)$ch
+}
+
+# Find the optimal number of clusters
+optimal_clusters <- which.max(ch_index)
+optimal_clusters <- 5
+
+# Plot the CH Index
+plot(2:10, ch_index[2:10], type = "b", xlab = "Number of Clusters", ylab = "CH Index",
+     main = "Optimal Number of Clusters using CH Index")
+abline(v = optimal_clusters, col = "red", lty = 2)
+
+annotation_colors <- list(
+  AFETADO = c("SIM" = "#1b9e77", "NAO" = "#d95f02"),  # Colors for AFETADO
+  NMREG = c("BETIM" = "#66c2a5",           # Greenish
+            "CURVELO" = "#fc8d62",         # Orange
+            "PARA DE MINAS/NOVA SERRANA" = "#8da0cb",  # Blue
+            "PATOS DE MINAS" = "#e78ac3",  # Pink
+            "SETE LAGOAS" = "#a6d854"),     # Light green
+  Cluster = c("1" = "#FFDB6D", "2" = "#C4961A", "3" = "#D16103", "4" = "#52854C", "5" = "#293352")
+)
+
+row_anot = data.frame(row_anot, Cluster = as.factor(cutree(hc, k = optimal_clusters)))
+
+

A partir dos padrões, geramos 2 heatmaps:

+ +
# Plot the heatmap
+pdf(file = paste0(disease_prefix, "_ScaledHeatmap.pdf"), width = 12, height = 10)
+pheatmap(
+  mat = as.matrix(matrix[rowSums(matrix != 0) > 0, ]),
+  scale = "row",
+  annotation_row = row_anot[rowSums(matrix != 0) > 0, ],
+  annotation_colors = annotation_colors,
+  clustering_distance_rows = "correlation",
+  cluster_cols = F,
+  cutree_rows = optimal_clusters,
+  clustering_method = "complete",
+  show_rownames = TRUE,
+  show_colnames = TRUE,
+  main = "Heatmap of Disease Incidence"
+)
+dev.off()
+
+# Create a color mapping for annotations
+annotation_colors <- list(
+  AFETADO = c("SIM" = "#1b9e77", "NAO" = "#d95f02"),
+  NMREG = c("BETIM" = "#66c2a5", 
+            "CURVELO" = "#fc8d62", 
+            "PARA DE MINAS/NOVA SERRANA" = "#8da0cb", 
+            "PATOS DE MINAS" = "#e78ac3", 
+            "SETE LAGOAS" = "#a6d854")
+)
+
+# Define row annotations
+row_annotation <- rowAnnotation(
+  AFETADO = row_anot$AFETADO,
+  NMREG = row_anot$NMREG,
+  col = annotation_colors
+)
+
+# Create the heatmap object with row annotations
+ht <- Heatmap(
+  as.matrix(data.dist),
+  name = "Distance",
+  row_title = "Cities",
+  column_title = "Cities",
+  row_split = 5,
+  column_split = 5,
+  right_annotation = row_annotation
+)
+
+# Draw the heatmap with legends at the bottom
+pdf(file = paste0(disease_prefix, "_ScaledDistance.pdf"), width = 13, height = 14)
+draw(ht, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
+dev.off()
+

Heatmap
+Distance

+

Um teste exato de Fisher foi usado para avaliar se havia maior prevalência de municípios afetados, ou de uma determinada região em um cluster. Isso poderia explicar se os municípios se agrupam mais por suas propriedades regionais, ou pelo fato de terem sido afetados.

+
# Initialize results for affected prevalence
+affected_results <- list()
+
+for (cluster_id in unique(row_anot$Cluster)) {
+  # Cities inside the cluster
+  inside_cluster <- row_anot[row_anot$Cluster == cluster_id, ]
+  # Cities outside the cluster
+  outside_cluster <- row_anot[row_anot$Cluster != cluster_id, ]
+  
+  # Contingency table for AFETADO
+  affected_table <- matrix(
+    c(
+      sum(inside_cluster$AFETADO == "SIM"),
+      sum(inside_cluster$AFETADO == "NAO"),
+      sum(outside_cluster$AFETADO == "SIM"),
+      sum(outside_cluster$AFETADO == "NAO")
+    ),
+    nrow = 2,
+    dimnames = list(
+      c("Inside Cluster", "Outside Cluster"),
+      c("Affected", "Not Affected")
+    )
+  )
+  
+  # Run Fisher's test
+  fisher_affected <- fisher.test(affected_table)
+  
+  # Save results
+  affected_results[[paste0("Cluster_", cluster_id)]] <- list(
+    pval = fisher_affected$p.value,
+    affected_ratio_inside = affected_table[1, 1] / sum(affected_table[1, ]),
+    affected_ratio_outside = affected_table[2, 1] / sum(affected_table[2, ])
+  )
+}
+
+# Initialize results for regional prevalence
+region_results <- list()
+
+for (cluster_id in unique(row_anot$Cluster)) {
+  for (region in unique(row_anot$NMREG)) {
+    # Cities inside the cluster
+    inside_cluster <- row_anot[row_anot$Cluster == cluster_id, ]
+    # Cities outside the cluster
+    outside_cluster <- row_anot[row_anot$Cluster != cluster_id, ]
+    
+    # Contingency table for NMREG
+    region_table <- matrix(
+      c(
+        sum(inside_cluster$NMREG == region),
+        nrow(inside_cluster) - sum(inside_cluster$NMREG == region),
+        sum(outside_cluster$NMREG == region),
+        nrow(outside_cluster) - sum(outside_cluster$NMREG == region)
+      ),
+      nrow = 2,
+      dimnames = list(
+        c("Inside Cluster", "Outside Cluster"),
+        c("In Region", "Not In Region")
+      )
+    )
+    
+    # Run Fisher's test
+    fisher_region <- fisher.test(region_table)
+    
+    # Save results
+    region_results[[paste0("Cluster_", cluster_id, "_Region_", region)]] <- list(
+      pval = fisher_region$p.value,
+      region_ratio_inside = region_table[1, 1] / sum(region_table[1, ]),
+      region_ratio_outside = region_table[2, 1] / sum(region_table[2, ])
+    )
+  }
+}
+
+
+#Compile
+affected_results_df <- do.call(rbind, lapply(names(affected_results), function(name) {
+  data.frame(
+    Cluster = name,
+    pval = affected_results[[name]]$pval,
+    ratio_inside = affected_results[[name]]$affected_ratio_inside,
+    ratio_outside = affected_results[[name]]$affected_ratio_outside
+  )
+}))
+region_results_df <- do.call(rbind, lapply(names(region_results), function(name) {
+  parts <- strsplit(name, "_")[[1]]  # Split name into cluster and region
+  data.frame(
+    Cluster = parts[2],
+    Region = parts[4],
+    pval = region_results[[name]]$pval,
+    ratio_inside = region_results[[name]]$region_ratio_inside,
+    ratio_outside = region_results[[name]]$region_ratio_outside
+  )
+}))
+
+kable(subset(affected_results_df, subset = pval < 0.05), format = "markdown")
+kable(subset(region_results_df, subset = pval < 0.05), format = "markdown")
+

Os clusters 1 e 2 estão superrepresentados por municípios das regiões de Betim e Patos de Minas, respectivamente.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ClusterRegionpvalratio_insideratio_outside
11BETIM0.03161800.53846150.2037037
92PATOS DE MINAS0.02065850.36363640.0714286
+

A importância de cada variável em explicar a variância entre os municípios foi medida através de um teste de PERMANOVA e foi visualizado em uma análise de Escalonamento Multidimensional não métrico (NMDS).

+
### PERMANOVA
+
+# Ensure the annotation variables are factors
+row_anot$AFETADO <- factor(row_anot$AFETADO)
+row_anot$NMREG <- factor(row_anot$NMREG)
+
+# Perform the PERMANOVA test
+permanova_result <- adonis2(
+  as.dist(data.dist) ~ AFETADO + NMREG + AFETADO:NMREG,
+  data = row_anot,
+  permutations = 199,
+  method = "euclidean", by = "terms"
+)
+
+# Display the results
+kable(permanova_result)
+
+# Extract PERMANOVA results using rownames
+permanova_text <- paste0(
+  "PERMANOVA Results:\n",
+  "AFETADO: R² = ", round(permanova_result["AFETADO", "R2"] * 100, 2), "%, p = ", signif(permanova_result["AFETADO", "Pr(>F)"], 3), "\n",
+  "NMREG: R² = ", round(permanova_result["NMREG", "R2"] * 100, 2), "%, p = ", signif(permanova_result["NMREG", "Pr(>F)"], 3), "\n",
+  "Interaction (AFETADO:NMREG): R² = ", round(permanova_result["AFETADO:NMREG", "R2"] * 100, 2), "%, p = ", signif(permanova_result["AFETADO:NMREG", "Pr(>F)"], 3)
+)
+
+# Perform NMDS
+nmds_result <- metaMDS(as.dist(data.dist), k = 2, trymax = 100)
+# Calculate variance explained by each NMDS axis
+distance_matrix <- as.matrix(data.dist)
+eigenvalues <- cmdscale(distance_matrix, k = 2, eig = TRUE)$eig
+variance_explained_axis1 <- round(100 * eigenvalues[1] / sum(abs(eigenvalues)), 2)
+variance_explained_axis2 <- round(100 * eigenvalues[2] / sum(abs(eigenvalues)), 2)
+# Extract coordinates and add annotations
+nmds_data <- data.frame(
+  NMDS1 = nmds_result$points[, 1],
+  NMDS2 = nmds_result$points[, 2],
+  NMREG = row_anot$NMREG,
+  AFETADO = row_anot$AFETADO
+)
+# Define region colors
+region_colors <- c(
+  "BETIM" = "#66c2a5",
+  "CURVELO" = "#fc8d62",
+  "PARA DE MINAS/NOVA SERRANA" = "#8da0cb",
+  "PATOS DE MINAS" = "#e78ac3",
+  "SETE LAGOAS" = "#a6d854"
+)
+
+
+# Plot with ggplot2
+pdf(file = paste0(disease_prefix, "_ScaledNMDS.pdf"), width = 12, height = 10)
+
+ggplot(nmds_data, aes(x = NMDS1, y = NMDS2, fill = AFETADO)) +
+  geom_point(size = 4, alpha = 0.5, shape = 21, stroke = 0.5, color = "black") +
+  stat_ellipse(aes(group = NMREG, color = NMREG), size = 1, alpha = 0.8) +  # Explicit group aesthetic for ellipses
+  scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+  scale_color_manual(values = region_colors) +
+  labs(
+    title = "NMDS of Disease Incidence",
+    x = paste0("NMDS 1 (", variance_explained_axis1, "% explained)"),
+    y = paste0("NMDS 2 (", variance_explained_axis2, "% explained)"),
+    fill = "Affected (AFETADO)",
+    color = "Region (NMREG)"
+  ) +
+  annotate(
+    "text", x = Inf, y = Inf, label = permanova_text, hjust = 1.1, vjust = 1.2,
+    size = 4, color = "black", fontface = "italic"
+  ) +
+  theme_minimal() +
+  theme(
+    legend.position = "right",
+    legend.box = "vertical"
+  )
+
+dev.off()
+

O PERMANOVA aponta que a variável AFETADO explica -1,32% da variabilidade observada entre as amostras. Esse valor negativo significa que a variável representa mais um ruído do que uma solução para os padrões. Por outro lado, a variável NMREG, que representa a região, responde por 32,92% da variabilidade (coluna R2), com um valor de p de 0,005 (coluna Pr(>F))

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DfSumOfSqsR2FPr(>F)
AFETADO1-0.1221172-0.0132911-1.2653511.000
NMREG43.02496680.32923347.8360100.005
AFETADO:NMREG40.78407420.08533762.0311010.045
Residual575.50098580.5987201NANA
Total669.18790971.0000000NANA
+

NMDS

+
Análise de componentes principais (PCA)
+

A análise de componentes principais avalia qual o ano tem mais influência sobre o a distribuição dos municípios ao longo dos eixos X e Y.

+
# Perform PCA
+pca_result_row_scaled <- prcomp(matrix, scale. = F)
+
+# Extract PCA scores for cities
+pca_scores_row_scaled <- data.frame(
+  pca_result_row_scaled$x, 
+  AFETADO = row_anot$AFETADO, 
+  NMREG = row_anot$NMREG, 
+  CityName = rownames(matrix)  # Add city names for labeling
+)
+
+# Extract PCA loadings for years
+pca_loadings_row_scaled <- data.frame(
+  pca_result_row_scaled$rotation, 
+  Variable = colnames(matrix)
+)
+
+# PCA Biplot with ellipses for NMREG
+
+pdf(file = paste0(disease_prefix, "_Scaledbiplot.pdf"), width = 12, height = 10)
+
+ggplot() +
+  # Plot cities as points, styled by AFETADO
+  geom_point(data = pca_scores_row_scaled, aes(x = PC1, y = PC2, fill = AFETADO), 
+             size = 4, alpha = 0.5, shape = 21, stroke = 0.5, color = "black") +
+  # Add city names with ggrepel
+  geom_text_repel(data = pca_scores_row_scaled, aes(x = PC1, y = PC2, label = CityName),
+                  size = 3, box.padding = 0.3, point.padding = 0.2, segment.color = "grey50") +
+  # Add ellipses for NMREG
+  stat_ellipse(data = pca_scores_row_scaled, aes(x = PC1, y = PC2, group = NMREG, color = NMREG), 
+               size = 1, alpha = 0.8) +
+  # Add arrows for years
+  geom_segment(data = pca_loadings_row_scaled, aes(x = 0, y = 0, xend = PC1 * 5, yend = PC2 * 5), 
+               arrow = arrow(length = unit(0.2, "cm")), color = "blue", size = 0.5) +
+  # Label arrows with year names
+  geom_text(data = pca_loadings_row_scaled, aes(x = PC1 * 5, y = PC2 * 5, label = Variable), 
+            color = "blue", size = 3, vjust = -0.5) +
+  # Define color scales
+  scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+  scale_color_manual(values = region_colors) +
+  # Add labels and themes
+  labs(
+    title = "PCA Biplot of Disease Incidence (Row-Scaled, 2014-2023)",
+    x = paste0("PC1 (", round(summary(pca_result_row_scaled)$importance[2, 1] * 100, 1), "%)"),
+    y = paste0("PC2 (", round(summary(pca_result_row_scaled)$importance[2, 2] * 100, 1), "%)"),
+    fill = "Affected (AFETADO)",
+    color = "Region (NMREG)"
+  ) +
+  theme_minimal() +
+  theme(
+    legend.position = "right",
+    legend.box = "vertical"
+  )
+
+dev.off()
+

PCA

+
Construção dos modelos
+

A proposta foi de construir um modelo que explique a dinâmica da incidência 5 antes do rompimento da barragem (2014-2018) e comparar com um modelo que explique a dinâmica após o rompimento (2019-2023).
+Avaliamos então a importância de cada variável ou combinação de variáveis com um teste anova.

+
library(tidyverse)
+library(ggplot2)
+library(knitr)
+library(car)
+
+########################################################################################################
+########################################################################################################
+setwd("/data/gabriel/Trabalhos/Vale/DENG/")                                                #############
+metadata = read.table("/data/gabriel/Trabalhos/Vale/metadata.txt", header = T, sep = "\t") #############
+city_codes <- metadata$CODMUN6                                                             #############
+disease_prefix = "DENGBR"                                                                  #############
+########################################################################################################
+########################################################################################################
+
+# Load previously processed data
+load(paste0(disease_prefix,"_variables.Rdata"))
+
+########################################################################################################
+############################## Compare slopes ##########################################################
+########################################################################################################
+# Create a period variable
+incidence_long$Period <- ifelse(incidence_long$Year < 2019, "Before", "After")
+incidence_long$Period <- as.factor(incidence_long$Period)
+
+
+#################### #################### #################### #################### #################### 
+#################### Regional Model.      #################### #################### #################### 
+#################### #################### #################### #################### #################### 
+model_before_regional <- glm(
+  Incidence ~ Year * AFETADO * NMREG,
+  data = subset(incidence_long, Period == "Before"),
+  family = gaussian
+)
+
+# Summary and ANOVA for "Before"
+kable(summary(model_before_regional)$coefficients)
+kable(subset(summary(model_before_regional)$coefficients, subset = summary(model_before_regional)$coefficients[,4]<0.05))
+anova_before_regional <- Anova(model_before_regional, type = "III")
+kable(anova_before_regional)
+write.table(anova_before_regional, file = paste0(disease_prefix, "before_regional_anova.txt"), sep = "\t", quote = F)
+
+
+model_after_regional <- glm(
+  Incidence ~ Year * AFETADO * NMREG,
+  data = subset(incidence_long, Period == "After"),
+  family = gaussian
+)
+
+# Summary and ANOVA for "After"
+kable(summary(model_after_regional)$coefficients)
+kable(subset(summary(model_after_regional)$coefficients, subset = summary(model_after_regional)$coefficients[,4]<0.05))
+anova_after_regional <- Anova(model_after_regional, type = "III")
+kable(anova_after_regional)
+write.table(anova_after_regional, file = paste0(disease_prefix, "after_regional_anova.txt"), sep = "\t", quote = F)
+
+# Predictions for "Before"
+before_predictions_regional <- expand.grid(
+  Year = as.factor(seq(min(subset(incidence_long, Period == "Before")$Year), 2018, by = 1)),
+  AFETADO = c("SIM", "NAO"),
+  NMREG = unique(incidence_long$NMREG)
+)
+before_predictions_regional$Predicted_Incidence <- predict(model_before_regional, newdata = before_predictions_regional)
+before_predictions_regional$Period <- "Before"
+
+# Predictions for "After"
+after_predictions_regional <- expand.grid(
+  Year = as.factor(seq(2019, max(subset(incidence_long, Period == "After")$Year), by = 1)),
+  AFETADO = c("SIM", "NAO"),
+  NMREG = unique(incidence_long$NMREG)
+)
+after_predictions_regional$Predicted_Incidence <- predict(model_after_regional, newdata = after_predictions_regional)
+after_predictions_regional$Period <- "After"
+
+# Combine predictions
+combined_predictions_regional <- rbind(before_predictions_regional, after_predictions_regional)
+
+pdf(file = paste0(disease_prefix, "_ScaledBeforeVsAfter_regionaltrend.pdf"), width = 10, height = 8)
+
+ggplot(combined_predictions_regional, aes(x = Year, y = Predicted_Incidence, fill = AFETADO, color = AFETADO, linetype = Period, group = interaction(AFETADO, Period))) +
+  geom_line(size = 1.2) +
+  geom_point(size = 2, shape = 21, stroke = 0.5, color = "black") +
+  scale_color_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+  scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+  scale_linetype_manual(values = c("Before" = "dotted", "After" = "solid")) +
+  facet_wrap(~ NMREG, scales = "free_y") +  # Separate plots by region
+  labs(
+    title = "Regional Trends in Disease Incidence Before and After 2019",
+    x = "Year",
+    y = "Predicted Incidence",
+    color = "Affected (AFETADO)",
+    linetype = "Period"
+  ) +
+  theme_minimal() +
+  theme(
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    legend.position = "bottom"
+  )
+
+dev.off()
+

Para o modelo antes do rompimento, temos uma grande importância da variável Ano, e de sua combinação do a região.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LR ChisqDfPr(>Chisq)
Year39.302837840.0000001
AFETADO0.045407810.8312560
NMREG3.363066440.4990023
Year:AFETADO1.266219240.8670803
Year:NMREG79.9783356160.0000000
AFETADO:NMREG7.712432140.1026986
Year:AFETADO:NMREG28.3729996160.0285234
+

Detalhadamente, vemos que alguns anos tiveram grandes impactos em regiões específicas.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
EstimateStd. Errort valuePr(>|t|)
(Intercept)-0.65426160.2640693-2.4776130.0138061
Year20151.06469530.37345042.8509690.0046771
Year20161.89379040.37345045.0710630.0000007
Year2015:NMREGCURVELO-0.98174910.4573814-2.1464560.0326807
Year2015:NMREGPARA DE MINAS/NOVA SERRANA-1.26777730.4821223-2.6295760.0090134
Year2015:NMREGPATOS DE MINAS-1.11850410.4418724-2.5312830.0119025
Year2016:NMREGPATOS DE MINAS-1.73060260.4418724-3.9165210.0001125
AFETADOSIM:NMREGCURVELO1.08181400.47808782.2627940.0244010
Year2015:AFETADOSIM:NMREGCURVELO-1.56871320.6761182-2.3201760.0210376
Year2016:AFETADOSIM:NMREGCURVELO-1.98497590.6761182-2.9358410.0035975
Year2016:AFETADOSIM:NMREGPATOS DE MINAS2.24462260.90282692.4862160.0134839
+

No modelo após o rompimento, observamos importância das variáveis tempo, região, e suas interações com os municípios afetados.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LR ChisqDfPr(>Chisq)
Year28.73567640.0000088
AFETADO6.25719410.0123690
NMREG13.65098940.0084964
Year:AFETADO9.95642240.0411682
Year:NMREG24.066263160.0880672
AFETADO:NMREG8.78503940.0667029
Year:AFETADO:NMREG38.761046160.0011774
+

Detalhadamente temos diversas combinações com potencial preditivo.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
EstimateStd. Errort valuePr(>|t|)
(Intercept)1.01362200.30481463.3253720.0009987
Year2020-1.46241080.4310729-3.3924910.0007907
Year2021-1.59218670.4310729-3.6935440.0002650
Year2022-1.69429630.4310729-3.9304170.0001065
AFETADOSIM0.91638080.36634152.5014380.0129300
Year2020:AFETADOSIM-1.02116170.5180852-1.9710310.0496878
Year2021:AFETADOSIM-1.03289130.5180852-1.9936710.0471409
Year2023:AFETADOSIM-1.60307280.5180852-3.0942260.0021693
AFETADOSIM:NMREGSETE LAGOAS-1.10478460.4433741-2.4917670.0132795
Year2020:AFETADOSIM:NMREGPARA DE MINAS/NOVA SERRANA2.80435660.87403813.2085060.0014865
Year2022:AFETADOSIM:NMREGSETE LAGOAS1.26259890.62702562.0136320.0449879
Year2023:AFETADOSIM:NMREGSETE LAGOAS2.34637740.62702563.7420760.0002207
+

Visualmente observamos que o padrão está mais associado à característica regional e o momento em que foi atingido pela epidemia de dengue: anos 2016, 2019 ou 2023, o que corrobora o PCA mostrado.
+Modelo

+ +
+ + + + + + + + + + \ No newline at end of file