Merge pull request #43 from nmfs-fish-tools/update-text-mining-analysis
Update text mining analysis
chantelwetzel-noaa authored Jul 26, 2023
2 parents 364af6a + 2360d47 commit bf220bc
Showing 40 changed files with 1,284 additions and 284 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -9,7 +9,7 @@ License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
license
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
RoxygenNote: 7.2.3
Imports:
mvbutils,
tools
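The RoxygenNote field records the roxygen2 version last used to build the package documentation, so the bump from 7.2.1 to 7.2.3 indicates the docs were regenerated under a newer roxygen2. A minimal sketch of that regeneration step, assuming the usual devtools workflow (not recorded in this commit):

devtools::document()  # rewrites man/ and updates the RoxygenNote field in DESCRIPTION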
115 changes: 63 additions & 52 deletions TextAnalysis/top10_analysis.R
@@ -3,20 +3,20 @@ library(here)
library(pdftools)
library(tm)
library(stringr)
library(xlsx)
library(ggplot2)
library(patchwork) #devtools::install_github("thomasp85/patchwork")
library(wordcloud)
library(RColorBrewer)
library(ggplot2)
library(scales)
library(viridis)

# Define keyword ---------------------------------------------------------------

z <- data.frame(
keyword = c(
"instantaneous mortality rate|z"
),
group = "instantaneous mortality rate"
)
# z <- data.frame(
# keyword = c(
# "instantaneous total mortality rate|z"
# ),
# group = "instantaneous mortality rate"
# )

spawning_biomass <- data.frame(
keyword = c(
@@ -33,27 +33,25 @@ cpue <- data.frame(
"catch per unit effort|cpue",
"catch rate",
"index of abundance",
"standardized fishery catch time series",
"catch per effort",
"fishing success",
"availability"
"fishing success"
),
group = "catch per unit effort"
)

catch <- data.frame(
keyword = c(
"catch",
"total mortality"
"total mortality",
"harvest"
),
group = "catch"
)

landings <- data.frame(
keyword = c(
"landings",
"retained catch",
"wanted catch"
"retained catch"
),
group = "landings"
)
@@ -75,13 +73,13 @@ sex <- data.frame(
group = "sex"
)

spr <- data.frame(
keyword = c(
"spawner per recruit|spr",
"30%spr|spr30%|30spr|spr30"
),
group = "spawner per recruit"
)
# spr <- data.frame(
# keyword = c(
# "spawner per recruit|spr",
# "spawning potential ratio"
# ),
# group = "spawner per recruit"
# )

weight <- data.frame(
keyword = c(
@@ -102,8 +100,8 @@ lnr0 <- data.frame(
group = "natural log of unfished recruitment"
)

keyword_map <- rbind(z, spawning_biomass, cpue, catch, landings,
projection, sex, spr, weight, lnr0)
keyword_map <- rbind(catch, cpue, landings,
lnr0, projection, sex, spawning_biomass, weight)
keyword_map$keyword_id <- as.factor(1:nrow(keyword_map))
keyword <- keyword_map$keyword
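The loop that applies these keyword patterns to each assessment PDF is collapsed in the view below. A minimal sketch of how such counts can be gathered with the packages loaded above (pdftools and stringr); the file name and object names are assumptions for illustration, not the repository's actual implementation:

pdf_file <- "example_assessment.pdf"  # hypothetical document; the real script walks the source subfolders
pages <- pdftools::pdf_text(pdf_file)               # one character string per page
full_text <- tolower(paste(pages, collapse = " "))
counts <- vapply(keyword, function(k) sum(stringr::str_count(full_text, k)), numeric(1))
# each keyword is a regex alternation (e.g. "catch per unit effort|cpue"),
# so str_count() tallies every variant of a term in a single pass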

@@ -114,7 +112,7 @@ subfolder_path <- list.dirs(path = working_path, full.names = TRUE, recursive =
subfolder_name <- list.dirs(path = working_path, full.names = FALSE, recursive = FALSE)

# Create keyword database
col_name <- c("ID", "Science_Center", "File_Path", keyword)
col_name <- c("ID", "Source", "File_Path", keyword)
frequency_database <- presence_database <- proportion_database <-
data.frame(matrix(NA, ncol = length(col_name)))
colnames(frequency_database) <- colnames(presence_database) <- colnames(proportion_database) <-
@@ -216,47 +214,60 @@ proportion_database <- rbind(
)
proportion_database[nrow(proportion_database), "ID"] <- "Sum"

xlsx_path <- here::here("TextAnalysis", "top10_analysis.xlsx")
xlsx::write.xlsx(frequency_database, file=xlsx_path, sheetName="frequency", row.names=FALSE)
xlsx::write.xlsx(presence_database, file=xlsx_path, sheetName="presence", append=TRUE, row.names=FALSE)
xlsx::write.xlsx(proportion_database, file=xlsx_path, sheetName="proportion", append=TRUE, row.names=FALSE)
write.csv(frequency_database, file=here::here("TextAnalysis", "top10_frequency.csv"), row.names=FALSE)
write.csv(presence_database, file=here::here("TextAnalysis", "top10_presence.csv"), row.names=FALSE)
write.csv(proportion_database, file=here::here("TextAnalysis", "top10_proportion.csv"), row.names=FALSE)

# Upload xlsx to Google Drive
# Google Drive folder id
id_googledrive <- "1BUsYYd11lE2TECqHru5tX6LwkMniVnKv"
googledrive::drive_upload(media = xlsx_path, path = as_id(id_googledrive), overwrite = TRUE, type="spreadsheet")
authorize_GoogleDrive <- FALSE
if (authorize_GoogleDrive) {
# Google Drive folder id
id_googledrive <- "1BUsYYd11lE2TECqHru5tX6LwkMniVnKv"
googledrive::drive_upload(media = xlsx_path, path = as_id(id_googledrive), overwrite = TRUE, type="spreadsheet")
}
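Enabling authorize_GoogleDrive assumes an authenticated googledrive session and that xlsx_path points at an existing file. A minimal pre-step, hedged because the account and scopes depend on the user:

googledrive::drive_auth()  # run once interactively; opens a browser prompt to grant Drive access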

# Plot results ------------------------------------------------------------
xlsx_path <- here::here("TextAnalysis", "top10_analysis.xlsx")
frequency_database <- xlsx::read.xlsx(file=xlsx_path, sheetName="frequency")
presence_database<-xlsx::read.xlsx(file=xlsx_path, sheetName="presence")
proportion_database<-xlsx::read.xlsx(file=xlsx_path, sheetName="proportion")
frequency_database <- read.csv(file=here::here("TextAnalysis", "top10_frequency.csv"))
presence_database<-read.csv(file=here::here("TextAnalysis", "top10_presence.csv"))
proportion_database<-read.csv(file=here::here("TextAnalysis", "top10_proportion.csv"))

colnames(presence_database) <- col_name
data_reshape <- reshape2::melt(
presence_database[, c(2, 4:ncol(presence_database))],
id = c("Science_Center")
id = c("Source")
)
colnames(data_reshape) <- c("Science_Center", "keyword", "value")
colnames(data_reshape) <- c("Source", "keyword", "value")

data_merge <- merge(data_reshape, keyword_map, by= "keyword")

sum_by_group <- aggregate(value ~ keyword+keyword_id+group+Science_Center, data = data_merge, sum)
sum_by_group <- aggregate(value ~ keyword+keyword_id+group+Source, data = data_merge, sum)
sum_by_group <- sum_by_group[order(sum_by_group$keyword_id),]
group <- unique(sum_by_group$group)
figure <- vector(mode="list", length=length(group))

for (i in 1:length(group)){
figure[[i]] <- ggplot(data = sum_by_group[sum_by_group$group==group[i], ], aes(x=keyword_id, y=value, fill=Science_Center)) +
geom_bar(position='dodge', stat='identity') +
labs(
x = group[i],
y = "Frequency"
)+
theme_bw()
}
jpeg(filename = here::here("TextAnalysis", "top10_barplot_sum.jpg"), width=200, height=150, units="mm", res=1200)
ggplot(sum_by_group, aes(fill=Source, y=value, x=keyword_id)) +
geom_bar(position="dodge", stat="identity") +
facet_wrap(~group, scales = "free_x") +
labs(
x = "Term ID",
y = "Frequency"
) +
scale_fill_viridis(discrete = TRUE) +
theme_bw()
dev.off()

mean_by_group <- aggregate(value ~ keyword+keyword_id+group+Source, data = data_merge, mean)
mean_by_group <- mean_by_group[order(mean_by_group$keyword_id),]
group <- unique(mean_by_group$group)

jpeg(filename = here::here("TextAnalysis", "top10_barplot.jpg"), width=300, height=270, units="mm", res=1200)
(figure[[1]] + figure[[2]] + figure[[3]] + figure[[4]] + figure[[5]])/
(figure[[6]] + figure[[7]] + figure[[8]] + figure[[9]] + figure[[10]])
jpeg(filename = here::here("TextAnalysis", "top10_barplot_mean.jpg"), width=200, height=150, units="mm", res=1200)
ggplot(mean_by_group, aes(fill=Source, y=value*100, x=keyword_id)) +
geom_bar(position="dodge", stat="identity") +
facet_wrap(~group, scales = "free_x") +
labs(
x = "Term ID",
y = "Presence (%)"
) +
scale_fill_viridis(discrete = TRUE) +
theme_bw()
dev.off()
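Assuming the presence table stores 0/1 flags per document (consistent with the "Presence (%)" axis label), the mean aggregated by Source is the share of that source's assessments mentioning a term, which the second plot rescales with value*100 to a percentage. A toy check of that arithmetic (illustrative numbers only):

presence <- c(1, 0, 1, 1)  # hypothetical: term found in 3 of 4 documents from one source
mean(presence) * 100       # 75, the value plotted as "Presence (%)"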
Binary file removed TextAnalysis/top10_analysis.xlsx
Binary file removed TextAnalysis/top10_barplot.jpg
Binary file added TextAnalysis/top10_barplot_mean.jpg
Binary file added TextAnalysis/top10_barplot_sum.jpg
