diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledAntesDepois.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledAntesDepois.jpg new file mode 100644 index 0000000..4a9eb67 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledAntesDepois.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledBeforeVsAfter_regionaltrend.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledBeforeVsAfter_regionaltrend.jpg new file mode 100644 index 0000000..9fab4f6 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledBeforeVsAfter_regionaltrend.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledDistance.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledDistance.jpg new file mode 100644 index 0000000..b4b7bc8 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledDistance.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledHeatmap.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledHeatmap.jpg new file mode 100644 index 0000000..53d9800 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledHeatmap.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledNMDS.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledNMDS.jpg new file mode 100644 index 0000000..ba301b9 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledNMDS.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_Scaledbiplot.jpg b/vale/DENGBR/Figuras/DENGBR_Scaledbiplot.jpg new file mode 100644 index 0000000..0099996 Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_Scaledbiplot.jpg differ diff --git a/vale/DENGBR/Figuras/DENGBR_ScaledbyYear.jpg b/vale/DENGBR/Figuras/DENGBR_ScaledbyYear.jpg new file mode 100644 index 0000000..3bf612d Binary files /dev/null and b/vale/DENGBR/Figuras/DENGBR_ScaledbyYear.jpg differ diff --git a/vale/DENGBR/index.html.html b/vale/DENGBR/index.html.html new file mode 100644 index 0000000..7e26f71 --- /dev/null +++ b/vale/DENGBR/index.html.html @@ -0,0 +1,1052 @@ +
+Os arquivos dos anos 2014 ao 2023 foram obtidos do SINAN através do seu sítio de FTP localizado em: ftp://ftp.datasus.gov.br/dissemin/publicos/SINAN/DADOS/ e identificados pelo prefixo DENGBR.
+O seguinte script cria o conjunto de variáveis a ser utilizada para a exploração, e os salva em um arquivo .Rdata
que pode ser posteriormente carregado com a função load()
.
#install.packages("devtools")
+#library(devtools)
+#install_github("danicat/read.dbc")
+library(read.dbc) # ler dbc
+library(tidyverse) #contem vários pacotes para diversas funções
+library(foreign) #ler DBF
+
+
+########################################################################################################
+########################################################################################################
+setwd("/data/gabriel/Trabalhos/Vale/DENG/") #############
+metadata = read.table("/data/gabriel/Trabalhos/Vale/metadata.txt", header = T, sep = "\t") #############
+city_codes <- metadata$CODMUN6 #############
+disease_prefix = "DENGBR" #############
+########################################################################################################
+########################################################################################################
+
+# Define folder path and initialize the list
+folder_path <- getwd()
+years <- 14:23
+
+# Create a table with disease incidence per 10k population for each year
+year_range <- as.character(2014:2023)
+
+# Find existing year columns in the dataset
+existing_years <- intersect(year_range, colnames(final_table))
+
+# Dynamically create the incidence table
+incidence_table <- final_table %>%
+ # Select necessary columns along with the existing years
+ select(NMMUNICIPIO, NMREG, AFETADO, CENSO2022, all_of(existing_years)) %>%
+ # Calculate incidence per 10k population for each available year
+ mutate(across(all_of(existing_years), ~ (.x / CENSO2022) * 10000, .names = "inc_{col}")) %>%
+ # Select relevant columns for the final table
+ select(NMMUNICIPIO, NMREG, AFETADO, starts_with("inc_"))
+
+# Rename columns for readability
+incidence_table <- incidence_table %>%
+ rename_with(~ sub("inc_", "", .), starts_with("inc_"))
+
+# View the resulting table and scale
+matrix = incidence_table[,4:ncol(incidence_table)]
+matrix_scaled_by_row <- t(scale(t(matrix))) # Transpose, scale rows, then transpose back
+
+incidence_table = cbind(incidence_table[,1:3], matrix_scaled_by_row)
+
+# Assuming incidence_table is your data frame from previous steps
+
+# Reshape the data from wide to long format for plotting
+incidence_long <- incidence_table %>%
+ pivot_longer(
+ cols = all_of(existing_years), # Use only the columns that exist
+ names_to = "Year", # Name of the new column for years
+ values_to = "Incidence" # Name of the new column for incidence values
+ )
+
+save(incidence_long, incidence_table, final_table, file = paste0(disease_prefix,"_variables.Rdata"))
+
Criar boxplots que mostrem a dispersão do Z-score ao longos dos anos, e separando o períodos em antes e depois do rompimento da barragem.
+library(tidyverse)
+library(ggplot2)
+
+########################################################################################################
+########################################################################################################
+setwd("/data/gabriel/Trabalhos/Vale/DENG/") #############
+metadata = read.table("/data/gabriel/Trabalhos/Vale/metadata.txt", header = T, sep = "\t") #############
+city_codes <- metadata$CODMUN6 #############
+disease_prefix = "DENGBR" #############
+########################################################################################################
+########################################################################################################
+
+# Load previously processed data
+load(paste0(disease_prefix,"_variables.Rdata"))
+
+
+# Create boxplot with incidence per year
+pdf(file = paste0(disease_prefix, "_ScaledbyYear.pdf"), width = 10, height = 10)
+
+ggplot(incidence_long, aes(x = Year, y = Incidence, fill = AFETADO)) +
+ geom_boxplot(outlier.shape = NA, width = 0.5) + # Adjust box width
+ geom_point(position = position_jitterdodge(jitter.width = 0.15, dodge.width = 0.5), shape = 21, size = 2, stroke = 0.5, alpha = 0.5) + # Add points with jitter dodge
+ facet_wrap(~ NMREG, ncol = 2) +
+ scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+ labs(
+ title = "Distribuição da Incidência de Doença por Região de Saúde",
+ x = "Ano",
+ y = "Incidência por 10000 Habitates",
+ fill = "Município Afetado"
+ ) +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+dev.off()
+
+
+# Categorize the years as "Before" and "After"
+incidence_long <- incidence_long %>%
+ mutate(
+ Period = case_when(
+ Year %in% 2014:2018 ~ "Antes",
+ Year %in% 2019:2023 ~ "Depois"
+ )
+ ) %>%
+ filter(!is.na(Period)) # Remove rows outside of the specified range
+
+# Create the plot showing distribution per city before and after
+
+pdf(file = paste0(disease_prefix, "_ScaledAntesDepois.pdf"), width = 10, height = 10)
+
+ggplot(incidence_long, aes(x = NMMUNICIPIO, y = Incidence, fill = AFETADO)) +
+ geom_boxplot(aes(group = interaction(NMMUNICIPIO, Period), fill = AFETADO),
+ position = position_dodge(width = 0.8), width = 0.45, outlier.shape = NA) + # Adjust box width and dodge
+ geom_point(aes(shape = Period, fill = AFETADO, group = Period),
+ position = position_dodge(width = 0.8),
+ size = 2, stroke = 0.5, alpha = 0.5) + # Map Period to shape for distinction
+ facet_wrap(~ NMREG, ncol = 2, scales = "free_x") +
+ scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+ scale_shape_manual(values = c("Antes" = 21, "Depois" = 22)) + # Set shapes for periods
+ labs(
+ title = "Comparação da Incidência de Doenças antes e depois do evento",
+ x = "Minucípio",
+ y = "Incidência por 10000 Habitantes",
+ fill = "Município Afetado",
+ shape = "Período"
+ ) +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+dev.off()
+
+
A busca por padrões envolve o cálculo de distâncias para criar agrupamentos. A distância utilizada foi a correlação de Pearson, uma vez que o objetivo é identificar cidades que seguem o mesmo padrão da incidência ao longo dos anos.
+O índice de Calinski-Harabasz foi utilizado para definir o número ótimo de cluster. Quando o número era maior que 10 (indicando falta de padrão), optamos para escolher visualmente. |
+Neste caso, o número de agrupamentos foi 5.
library(tidyverse)
+library(ggplot2)
+library(pheatmap)
+library(vegan)
+library(ComplexHeatmap)
+library(ggrepel)
+
+########################################################################################################
+########################################################################################################
+setwd("/data/gabriel/Trabalhos/Vale/DENG/") #############
+metadata = read.table("/data/gabriel/Trabalhos/Vale/metadata.txt", header = T, sep = "\t") #############
+city_codes <- metadata$CODMUN6 #############
+disease_prefix = "DENGBR" #############
+########################################################################################################
+########################################################################################################
+
+# Load previously processed data
+load(paste0(disease_prefix,"_variables.Rdata"))
+
+matrix = data.frame(incidence_table[,4:ncol(incidence_table)], row.names = incidence_table$NMMUNICIPIO)
+row_anot = data.frame(incidence_table[,2:3], row.names = incidence_table$NMMUNICIPIO)
+
+# Calculate the distance matrix
+data.dist <- as.dist(1 - cor(t(matrix)))
+
+# Perform hierarchical clustering
+hc <- hclust(data.dist, method = "complete")
+
+# Calculate the CH Index for different numbers of clusters
+ch_index <- numeric(10)
+for (k in 2:10) {
+ clustering <- cutree(hc, k = k)
+ ch_index[k] <- cluster.stats(d = as.dist(data.dist), clustering)$ch
+}
+
+# Find the optimal number of clusters
+optimal_clusters <- which.max(ch_index)
+optimal_clusters <- 5
+
+# Plot the CH Index
+plot(2:10, ch_index[2:10], type = "b", xlab = "Number of Clusters", ylab = "CH Index",
+ main = "Optimal Number of Clusters using CH Index")
+abline(v = optimal_clusters, col = "red", lty = 2)
+
+annotation_colors <- list(
+ AFETADO = c("SIM" = "#1b9e77", "NAO" = "#d95f02"), # Colors for AFETADO
+ NMREG = c("BETIM" = "#66c2a5", # Greenish
+ "CURVELO" = "#fc8d62", # Orange
+ "PARA DE MINAS/NOVA SERRANA" = "#8da0cb", # Blue
+ "PATOS DE MINAS" = "#e78ac3", # Pink
+ "SETE LAGOAS" = "#a6d854"), # Light green
+ Cluster = c("1" = "#FFDB6D", "2" = "#C4961A", "3" = "#D16103", "4" = "#52854C", "5" = "#293352")
+)
+
+row_anot = data.frame(row_anot, Cluster = as.factor(cutree(hc, k = optimal_clusters)))
+
+
A partir dos padrões, geramos 2 heatmaps:
+# Plot the heatmap
+pdf(file = paste0(disease_prefix, "_ScaledHeatmap.pdf"), width = 12, height = 10)
+pheatmap(
+ mat = as.matrix(matrix[rowSums(matrix != 0) > 0, ]),
+ scale = "row",
+ annotation_row = row_anot[rowSums(matrix != 0) > 0, ],
+ annotation_colors = annotation_colors,
+ clustering_distance_rows = "correlation",
+ cluster_cols = F,
+ cutree_rows = optimal_clusters,
+ clustering_method = "complete",
+ show_rownames = TRUE,
+ show_colnames = TRUE,
+ main = "Heatmap of Disease Incidence"
+)
+dev.off()
+
+# Create a color mapping for annotations
+annotation_colors <- list(
+ AFETADO = c("SIM" = "#1b9e77", "NAO" = "#d95f02"),
+ NMREG = c("BETIM" = "#66c2a5",
+ "CURVELO" = "#fc8d62",
+ "PARA DE MINAS/NOVA SERRANA" = "#8da0cb",
+ "PATOS DE MINAS" = "#e78ac3",
+ "SETE LAGOAS" = "#a6d854")
+)
+
+# Define row annotations
+row_annotation <- rowAnnotation(
+ AFETADO = row_anot$AFETADO,
+ NMREG = row_anot$NMREG,
+ col = annotation_colors
+)
+
+# Create the heatmap object with row annotations
+ht <- Heatmap(
+ as.matrix(data.dist),
+ name = "Distance",
+ row_title = "Cities",
+ column_title = "Cities",
+ row_split = 5,
+ column_split = 5,
+ right_annotation = row_annotation
+)
+
+# Draw the heatmap with legends at the bottom
+pdf(file = paste0(disease_prefix, "_ScaledDistance.pdf"), width = 13, height = 14)
+draw(ht, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
+dev.off()
+
+
Um teste exato de Fisher foi usado para avaliar se havia maior prevalência de municípios afetados, ou de uma determinada região em um cluster. Isso poderia explicar se os municípios se agrupam mais por suas propriedades regionais, ou pelo fato de terem sido afetados.
+# Initialize results for affected prevalence
+affected_results <- list()
+
+for (cluster_id in unique(row_anot$Cluster)) {
+ # Cities inside the cluster
+ inside_cluster <- row_anot[row_anot$Cluster == cluster_id, ]
+ # Cities outside the cluster
+ outside_cluster <- row_anot[row_anot$Cluster != cluster_id, ]
+
+ # Contingency table for AFETADO
+ affected_table <- matrix(
+ c(
+ sum(inside_cluster$AFETADO == "SIM"),
+ sum(inside_cluster$AFETADO == "NAO"),
+ sum(outside_cluster$AFETADO == "SIM"),
+ sum(outside_cluster$AFETADO == "NAO")
+ ),
+ nrow = 2,
+ dimnames = list(
+ c("Inside Cluster", "Outside Cluster"),
+ c("Affected", "Not Affected")
+ )
+ )
+
+ # Run Fisher's test
+ fisher_affected <- fisher.test(affected_table)
+
+ # Save results
+ affected_results[[paste0("Cluster_", cluster_id)]] <- list(
+ pval = fisher_affected$p.value,
+ affected_ratio_inside = affected_table[1, 1] / sum(affected_table[1, ]),
+ affected_ratio_outside = affected_table[2, 1] / sum(affected_table[2, ])
+ )
+}
+
+# Initialize results for regional prevalence
+region_results <- list()
+
+for (cluster_id in unique(row_anot$Cluster)) {
+ for (region in unique(row_anot$NMREG)) {
+ # Cities inside the cluster
+ inside_cluster <- row_anot[row_anot$Cluster == cluster_id, ]
+ # Cities outside the cluster
+ outside_cluster <- row_anot[row_anot$Cluster != cluster_id, ]
+
+ # Contingency table for NMREG
+ region_table <- matrix(
+ c(
+ sum(inside_cluster$NMREG == region),
+ nrow(inside_cluster) - sum(inside_cluster$NMREG == region),
+ sum(outside_cluster$NMREG == region),
+ nrow(outside_cluster) - sum(outside_cluster$NMREG == region)
+ ),
+ nrow = 2,
+ dimnames = list(
+ c("Inside Cluster", "Outside Cluster"),
+ c("In Region", "Not In Region")
+ )
+ )
+
+ # Run Fisher's test
+ fisher_region <- fisher.test(region_table)
+
+ # Save results
+ region_results[[paste0("Cluster_", cluster_id, "_Region_", region)]] <- list(
+ pval = fisher_region$p.value,
+ region_ratio_inside = region_table[1, 1] / sum(region_table[1, ]),
+ region_ratio_outside = region_table[2, 1] / sum(region_table[2, ])
+ )
+ }
+}
+
+
+#Compile
+affected_results_df <- do.call(rbind, lapply(names(affected_results), function(name) {
+ data.frame(
+ Cluster = name,
+ pval = affected_results[[name]]$pval,
+ ratio_inside = affected_results[[name]]$affected_ratio_inside,
+ ratio_outside = affected_results[[name]]$affected_ratio_outside
+ )
+}))
+region_results_df <- do.call(rbind, lapply(names(region_results), function(name) {
+ parts <- strsplit(name, "_")[[1]] # Split name into cluster and region
+ data.frame(
+ Cluster = parts[2],
+ Region = parts[4],
+ pval = region_results[[name]]$pval,
+ ratio_inside = region_results[[name]]$region_ratio_inside,
+ ratio_outside = region_results[[name]]$region_ratio_outside
+ )
+}))
+
+kable(subset(affected_results_df, subset = pval < 0.05), format = "markdown")
+kable(subset(region_results_df, subset = pval < 0.05), format = "markdown")
+
Os clusters 1 e 2 estão superrepresentados por municípios das regiões de Betim e Patos de Minas, respectivamente.
++ | Cluster | +Region | +pval | +ratio_inside | +ratio_outside | +
---|---|---|---|---|---|
1 | +1 | +BETIM | +0.0316180 | +0.5384615 | +0.2037037 | +
9 | +2 | +PATOS DE MINAS | +0.0206585 | +0.3636364 | +0.0714286 | +
A importância de cada variável em explicar a variância entre os municípios foi medida através de um teste de PERMANOVA
e foi visualizado em uma análise de Escalonamento Multidimensional não métrico (NMDS).
### PERMANOVA
+
+# Ensure the annotation variables are factors
+row_anot$AFETADO <- factor(row_anot$AFETADO)
+row_anot$NMREG <- factor(row_anot$NMREG)
+
+# Perform the PERMANOVA test
+permanova_result <- adonis2(
+ as.dist(data.dist) ~ AFETADO + NMREG + AFETADO:NMREG,
+ data = row_anot,
+ permutations = 199,
+ method = "euclidean", by = "terms"
+)
+
+# Display the results
+kable(permanova_result)
+
+# Extract PERMANOVA results using rownames
+permanova_text <- paste0(
+ "PERMANOVA Results:\n",
+ "AFETADO: R² = ", round(permanova_result["AFETADO", "R2"] * 100, 2), "%, p = ", signif(permanova_result["AFETADO", "Pr(>F)"], 3), "\n",
+ "NMREG: R² = ", round(permanova_result["NMREG", "R2"] * 100, 2), "%, p = ", signif(permanova_result["NMREG", "Pr(>F)"], 3), "\n",
+ "Interaction (AFETADO:NMREG): R² = ", round(permanova_result["AFETADO:NMREG", "R2"] * 100, 2), "%, p = ", signif(permanova_result["AFETADO:NMREG", "Pr(>F)"], 3)
+)
+
+# Perform NMDS
+nmds_result <- metaMDS(as.dist(data.dist), k = 2, trymax = 100)
+# Calculate variance explained by each NMDS axis
+distance_matrix <- as.matrix(data.dist)
+eigenvalues <- cmdscale(distance_matrix, k = 2, eig = TRUE)$eig
+variance_explained_axis1 <- round(100 * eigenvalues[1] / sum(abs(eigenvalues)), 2)
+variance_explained_axis2 <- round(100 * eigenvalues[2] / sum(abs(eigenvalues)), 2)
+# Extract coordinates and add annotations
+nmds_data <- data.frame(
+ NMDS1 = nmds_result$points[, 1],
+ NMDS2 = nmds_result$points[, 2],
+ NMREG = row_anot$NMREG,
+ AFETADO = row_anot$AFETADO
+)
+# Define region colors
+region_colors <- c(
+ "BETIM" = "#66c2a5",
+ "CURVELO" = "#fc8d62",
+ "PARA DE MINAS/NOVA SERRANA" = "#8da0cb",
+ "PATOS DE MINAS" = "#e78ac3",
+ "SETE LAGOAS" = "#a6d854"
+)
+
+
+# Plot with ggplot2
+pdf(file = paste0(disease_prefix, "_ScaledNMDS.pdf"), width = 12, height = 10)
+
+ggplot(nmds_data, aes(x = NMDS1, y = NMDS2, fill = AFETADO)) +
+ geom_point(size = 4, alpha = 0.5, shape = 21, stroke = 0.5, color = "black") +
+ stat_ellipse(aes(group = NMREG, color = NMREG), size = 1, alpha = 0.8) + # Explicit group aesthetic for ellipses
+ scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+ scale_color_manual(values = region_colors) +
+ labs(
+ title = "NMDS of Disease Incidence",
+ x = paste0("NMDS 1 (", variance_explained_axis1, "% explained)"),
+ y = paste0("NMDS 2 (", variance_explained_axis2, "% explained)"),
+ fill = "Affected (AFETADO)",
+ color = "Region (NMREG)"
+ ) +
+ annotate(
+ "text", x = Inf, y = Inf, label = permanova_text, hjust = 1.1, vjust = 1.2,
+ size = 4, color = "black", fontface = "italic"
+ ) +
+ theme_minimal() +
+ theme(
+ legend.position = "right",
+ legend.box = "vertical"
+ )
+
+dev.off()
+
O PERMANOVA aponta que a variável AFETADO
explica -1,32% da variabilidade observada entre as amostras. Esse valor negativo significa que a variável representa mais um ruído do que uma solução para os padrões. Por outro lado, a variável NMREG
, que representa a região, responde por 32,92% da variabilidade (coluna R2
), com um valor de p de 0,005 (coluna Pr(>F)
)
+ | Df | +SumOfSqs | +R2 | +F | +Pr(>F) | +
---|---|---|---|---|---|
AFETADO | +1 | +-0.1221172 | +-0.0132911 | +-1.265351 | +1.000 | +
NMREG | +4 | +3.0249668 | +0.3292334 | +7.836010 | +0.005 | +
AFETADO:NMREG | +4 | +0.7840742 | +0.0853376 | +2.031101 | +0.045 | +
Residual | +57 | +5.5009858 | +0.5987201 | +NA | +NA | +
Total | +66 | +9.1879097 | +1.0000000 | +NA | +NA | +
A análise de componentes principais avalia qual o ano tem mais influência sobre o a distribuição dos municípios ao longo dos eixos X e Y.
+# Perform PCA
+pca_result_row_scaled <- prcomp(matrix, scale. = F)
+
+# Extract PCA scores for cities
+pca_scores_row_scaled <- data.frame(
+ pca_result_row_scaled$x,
+ AFETADO = row_anot$AFETADO,
+ NMREG = row_anot$NMREG,
+ CityName = rownames(matrix) # Add city names for labeling
+)
+
+# Extract PCA loadings for years
+pca_loadings_row_scaled <- data.frame(
+ pca_result_row_scaled$rotation,
+ Variable = colnames(matrix)
+)
+
+# PCA Biplot with ellipses for NMREG
+
+pdf(file = paste0(disease_prefix, "_Scaledbiplot.pdf"), width = 12, height = 10)
+
+ggplot() +
+ # Plot cities as points, styled by AFETADO
+ geom_point(data = pca_scores_row_scaled, aes(x = PC1, y = PC2, fill = AFETADO),
+ size = 4, alpha = 0.5, shape = 21, stroke = 0.5, color = "black") +
+ # Add city names with ggrepel
+ geom_text_repel(data = pca_scores_row_scaled, aes(x = PC1, y = PC2, label = CityName),
+ size = 3, box.padding = 0.3, point.padding = 0.2, segment.color = "grey50") +
+ # Add ellipses for NMREG
+ stat_ellipse(data = pca_scores_row_scaled, aes(x = PC1, y = PC2, group = NMREG, color = NMREG),
+ size = 1, alpha = 0.8) +
+ # Add arrows for years
+ geom_segment(data = pca_loadings_row_scaled, aes(x = 0, y = 0, xend = PC1 * 5, yend = PC2 * 5),
+ arrow = arrow(length = unit(0.2, "cm")), color = "blue", size = 0.5) +
+ # Label arrows with year names
+ geom_text(data = pca_loadings_row_scaled, aes(x = PC1 * 5, y = PC2 * 5, label = Variable),
+ color = "blue", size = 3, vjust = -0.5) +
+ # Define color scales
+ scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+ scale_color_manual(values = region_colors) +
+ # Add labels and themes
+ labs(
+ title = "PCA Biplot of Disease Incidence (Row-Scaled, 2014-2023)",
+ x = paste0("PC1 (", round(summary(pca_result_row_scaled)$importance[2, 1] * 100, 1), "%)"),
+ y = paste0("PC2 (", round(summary(pca_result_row_scaled)$importance[2, 2] * 100, 1), "%)"),
+ fill = "Affected (AFETADO)",
+ color = "Region (NMREG)"
+ ) +
+ theme_minimal() +
+ theme(
+ legend.position = "right",
+ legend.box = "vertical"
+ )
+
+dev.off()
+
+A proposta foi de construir um modelo que explique a dinâmica da incidência 5 antes do rompimento da barragem (2014-2018) e comparar com um modelo que explique a dinâmica após o rompimento (2019-2023).
+Avaliamos então a importância de cada variável ou combinação de variáveis com um teste anova.
library(tidyverse)
+library(ggplot2)
+library(knitr)
+library(car)
+
+########################################################################################################
+########################################################################################################
+setwd("/data/gabriel/Trabalhos/Vale/DENG/") #############
+metadata = read.table("/data/gabriel/Trabalhos/Vale/metadata.txt", header = T, sep = "\t") #############
+city_codes <- metadata$CODMUN6 #############
+disease_prefix = "DENGBR" #############
+########################################################################################################
+########################################################################################################
+
+# Load previously processed data
+load(paste0(disease_prefix,"_variables.Rdata"))
+
+########################################################################################################
+############################## Compare slopes ##########################################################
+########################################################################################################
+# Create a period variable
+incidence_long$Period <- ifelse(incidence_long$Year < 2019, "Before", "After")
+incidence_long$Period <- as.factor(incidence_long$Period)
+
+
+#################### #################### #################### #################### ####################
+#################### Regional Model. #################### #################### ####################
+#################### #################### #################### #################### ####################
+model_before_regional <- glm(
+ Incidence ~ Year * AFETADO * NMREG,
+ data = subset(incidence_long, Period == "Before"),
+ family = gaussian
+)
+
+# Summary and ANOVA for "Before"
+kable(summary(model_before_regional)$coefficients)
+kable(subset(summary(model_before_regional)$coefficients, subset = summary(model_before_regional)$coefficients[,4]<0.05))
+anova_before_regional <- Anova(model_before_regional, type = "III")
+kable(anova_before_regional)
+write.table(anova_before_regional, file = paste0(disease_prefix, "before_regional_anova.txt"), sep = "\t", quote = F)
+
+
+model_after_regional <- glm(
+ Incidence ~ Year * AFETADO * NMREG,
+ data = subset(incidence_long, Period == "After"),
+ family = gaussian
+)
+
+# Summary and ANOVA for "After"
+kable(summary(model_after_regional)$coefficients)
+kable(subset(summary(model_after_regional)$coefficients, subset = summary(model_after_regional)$coefficients[,4]<0.05))
+anova_after_regional <- Anova(model_after_regional, type = "III")
+kable(anova_after_regional)
+write.table(anova_after_regional, file = paste0(disease_prefix, "after_regional_anova.txt"), sep = "\t", quote = F)
+
+# Predictions for "Before"
+before_predictions_regional <- expand.grid(
+ Year = as.factor(seq(min(subset(incidence_long, Period == "Before")$Year), 2018, by = 1)),
+ AFETADO = c("SIM", "NAO"),
+ NMREG = unique(incidence_long$NMREG)
+)
+before_predictions_regional$Predicted_Incidence <- predict(model_before_regional, newdata = before_predictions_regional)
+before_predictions_regional$Period <- "Before"
+
+# Predictions for "After"
+after_predictions_regional <- expand.grid(
+ Year = as.factor(seq(2019, max(subset(incidence_long, Period == "After")$Year), by = 1)),
+ AFETADO = c("SIM", "NAO"),
+ NMREG = unique(incidence_long$NMREG)
+)
+after_predictions_regional$Predicted_Incidence <- predict(model_after_regional, newdata = after_predictions_regional)
+after_predictions_regional$Period <- "After"
+
+# Combine predictions
+combined_predictions_regional <- rbind(before_predictions_regional, after_predictions_regional)
+
+pdf(file = paste0(disease_prefix, "_ScaledBeforeVsAfter_regionaltrend.pdf"), width = 10, height = 8)
+
+ggplot(combined_predictions_regional, aes(x = Year, y = Predicted_Incidence, fill = AFETADO, color = AFETADO, linetype = Period, group = interaction(AFETADO, Period))) +
+ geom_line(size = 1.2) +
+ geom_point(size = 2, shape = 21, stroke = 0.5, color = "black") +
+ scale_color_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+ scale_fill_manual(values = c("SIM" = "#1b9e77", "NAO" = "#d95f02")) +
+ scale_linetype_manual(values = c("Before" = "dotted", "After" = "solid")) +
+ facet_wrap(~ NMREG, scales = "free_y") + # Separate plots by region
+ labs(
+ title = "Regional Trends in Disease Incidence Before and After 2019",
+ x = "Year",
+ y = "Predicted Incidence",
+ color = "Affected (AFETADO)",
+ linetype = "Period"
+ ) +
+ theme_minimal() +
+ theme(
+ axis.text.x = element_text(angle = 45, hjust = 1),
+ legend.position = "bottom"
+ )
+
+dev.off()
+
Para o modelo antes do rompimento, temos uma grande importância da variável Ano, e de sua combinação do a região.
++ | LR Chisq | +Df | +Pr(>Chisq) | +
---|---|---|---|
Year | +39.3028378 | +4 | +0.0000001 | +
AFETADO | +0.0454078 | +1 | +0.8312560 | +
NMREG | +3.3630664 | +4 | +0.4990023 | +
Year:AFETADO | +1.2662192 | +4 | +0.8670803 | +
Year:NMREG | +79.9783356 | +16 | +0.0000000 | +
AFETADO:NMREG | +7.7124321 | +4 | +0.1026986 | +
Year:AFETADO:NMREG | +28.3729996 | +16 | +0.0285234 | +
Detalhadamente, vemos que alguns anos tiveram grandes impactos em regiões específicas.
++ | Estimate | +Std. Error | +t value | +Pr(>|t|) | +
---|---|---|---|---|
(Intercept) | +-0.6542616 | +0.2640693 | +-2.477613 | +0.0138061 | +
Year2015 | +1.0646953 | +0.3734504 | +2.850969 | +0.0046771 | +
Year2016 | +1.8937904 | +0.3734504 | +5.071063 | +0.0000007 | +
Year2015:NMREGCURVELO | +-0.9817491 | +0.4573814 | +-2.146456 | +0.0326807 | +
Year2015:NMREGPARA DE MINAS/NOVA SERRANA | +-1.2677773 | +0.4821223 | +-2.629576 | +0.0090134 | +
Year2015:NMREGPATOS DE MINAS | +-1.1185041 | +0.4418724 | +-2.531283 | +0.0119025 | +
Year2016:NMREGPATOS DE MINAS | +-1.7306026 | +0.4418724 | +-3.916521 | +0.0001125 | +
AFETADOSIM:NMREGCURVELO | +1.0818140 | +0.4780878 | +2.262794 | +0.0244010 | +
Year2015:AFETADOSIM:NMREGCURVELO | +-1.5687132 | +0.6761182 | +-2.320176 | +0.0210376 | +
Year2016:AFETADOSIM:NMREGCURVELO | +-1.9849759 | +0.6761182 | +-2.935841 | +0.0035975 | +
Year2016:AFETADOSIM:NMREGPATOS DE MINAS | +2.2446226 | +0.9028269 | +2.486216 | +0.0134839 | +
No modelo após o rompimento, observamos importância das variáveis tempo, região, e suas interações com os municípios afetados.
++ | LR Chisq | +Df | +Pr(>Chisq) | +
---|---|---|---|
Year | +28.735676 | +4 | +0.0000088 | +
AFETADO | +6.257194 | +1 | +0.0123690 | +
NMREG | +13.650989 | +4 | +0.0084964 | +
Year:AFETADO | +9.956422 | +4 | +0.0411682 | +
Year:NMREG | +24.066263 | +16 | +0.0880672 | +
AFETADO:NMREG | +8.785039 | +4 | +0.0667029 | +
Year:AFETADO:NMREG | +38.761046 | +16 | +0.0011774 | +
Detalhadamente temos diversas combinações com potencial preditivo.
++ | Estimate | +Std. Error | +t value | +Pr(>|t|) | +
---|---|---|---|---|
(Intercept) | +1.0136220 | +0.3048146 | +3.325372 | +0.0009987 | +
Year2020 | +-1.4624108 | +0.4310729 | +-3.392491 | +0.0007907 | +
Year2021 | +-1.5921867 | +0.4310729 | +-3.693544 | +0.0002650 | +
Year2022 | +-1.6942963 | +0.4310729 | +-3.930417 | +0.0001065 | +
AFETADOSIM | +0.9163808 | +0.3663415 | +2.501438 | +0.0129300 | +
Year2020:AFETADOSIM | +-1.0211617 | +0.5180852 | +-1.971031 | +0.0496878 | +
Year2021:AFETADOSIM | +-1.0328913 | +0.5180852 | +-1.993671 | +0.0471409 | +
Year2023:AFETADOSIM | +-1.6030728 | +0.5180852 | +-3.094226 | +0.0021693 | +
AFETADOSIM:NMREGSETE LAGOAS | +-1.1047846 | +0.4433741 | +-2.491767 | +0.0132795 | +
Year2020:AFETADOSIM:NMREGPARA DE MINAS/NOVA SERRANA | +2.8043566 | +0.8740381 | +3.208506 | +0.0014865 | +
Year2022:AFETADOSIM:NMREGSETE LAGOAS | +1.2625989 | +0.6270256 | +2.013632 | +0.0449879 | +
Year2023:AFETADOSIM:NMREGSETE LAGOAS | +2.3463774 | +0.6270256 | +3.742076 | +0.0002207 | +
Visualmente observamos que o padrão está mais associado à característica regional e o momento em que foi atingido pela epidemia de dengue: anos 2016, 2019 ou 2023, o que corrobora o PCA mostrado.
+