Merge pull request #43 from nmfs-fish-tools/update-text-mining-analysis
Update text mining analysis
chantelwetzel-noaa authored Jul 26, 2023
2 parents 364af6a + 2360d47 commit bf220bc
Showing 40 changed files with 1,284 additions and 284 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -9,7 +9,7 @@ License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
license
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
RoxygenNote: 7.2.3
Imports:
mvbutils,
tools
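The RoxygenNote field records the roxygen2 version last used to build the package documentation, so the bump from 7.2.1 to 7.2.3 indicates the docs were regenerated under a newer roxygen2. A minimal sketch of that regeneration step, assuming the usual devtools workflow (not recorded in this commit):

devtools::document()  # rewrites man/ and updates the RoxygenNote field in DESCRIPTION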
115 changes: 63 additions & 52 deletions TextAnalysis/top10_analysis.R
@@ -3,20 +3,20 @@ library(here)
library(pdftools)
library(tm)
library(stringr)
library(xlsx)
library(ggplot2)
library(patchwork) #devtools::install_github("thomasp85/patchwork")
library(wordcloud)
library(RColorBrewer)
library(ggplot2)
library(scales)
library(viridis)

# Define keyword ---------------------------------------------------------------

z <- data.frame(
keyword = c(
"instantaneous mortality rate|z"
),
group = "instantaneous mortality rate"
)
# z <- data.frame(
# keyword = c(
# "instantaneous total mortality rate|z"
# ),
# group = "instantaneous mortality rate"
# )

spawning_biomass <- data.frame(
keyword = c(
@@ -33,27 +33,25 @@ cpue <- data.frame(
"catch per unit effort|cpue",
"catch rate",
"index of abundance",
"standardized fishery catch time series",
"catch per effort",
"fishing success",
"availability"
"fishing success"
),
group = "catch per unit effort"
)

catch <- data.frame(
keyword = c(
"catch",
"total mortality"
"total mortality",
"harvest"
),
group = "catch"
)

landings <- data.frame(
keyword = c(
"landings",
"retained catch",
"wanted catch"
"retained catch"
),
group = "landings"
)
@@ -75,13 +73,13 @@ sex <- data.frame(
group = "sex"
)

spr <- data.frame(
keyword = c(
"spawner per recruit|spr",
"30%spr|spr30%|30spr|spr30"
),
group = "spawner per recruit"
)
# spr <- data.frame(
# keyword = c(
# "spawner per recruit|spr",
# "spawning potential ratio"
# ),
# group = "spawner per recruit"
# )

weight <- data.frame(
keyword = c(
@@ -102,8 +100,8 @@ lnr0 <- data.frame(
group = "natural log of unfished recruitment"
)

keyword_map <- rbind(z, spawning_biomass, cpue, catch, landings,
projection, sex, spr, weight, lnr0)
keyword_map <- rbind(catch, cpue, landings,
lnr0, projection, sex, spawning_biomass, weight)
keyword_map$keyword_id <- as.factor(1:nrow(keyword_map))
keyword <- keyword_map$keyword
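The loop that applies these keyword patterns to each assessment PDF is collapsed in the view below. A minimal sketch of how such counts can be gathered with the packages loaded above (pdftools and stringr); the file name and object names are assumptions for illustration, not the repository's actual implementation:

pdf_file <- "example_assessment.pdf"  # hypothetical document; the real script walks the source subfolders
pages <- pdftools::pdf_text(pdf_file)               # one character string per page
full_text <- tolower(paste(pages, collapse = " "))
counts <- vapply(keyword, function(k) sum(stringr::str_count(full_text, k)), numeric(1))
# each keyword is a regex alternation (e.g. "catch per unit effort|cpue"),
# so str_count() tallies every variant of a term in a single pass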

@@ -114,7 +112,7 @@ subfolder_path <- list.dirs(path = working_path, full.names = TRUE, recursive =
subfolder_name <- list.dirs(path = working_path, full.names = FALSE, recursive = FALSE)

# Create keyword database
col_name <- c("ID", "Science_Center", "File_Path", keyword)
col_name <- c("ID", "Source", "File_Path", keyword)
frequency_database <- presence_database <- proportion_database <-
data.frame(matrix(NA, ncol = length(col_name)))
colnames(frequency_database) <- colnames(presence_database) <- colnames(proportion_database) <-
@@ -216,47 +214,60 @@ proportion_database <- rbind(
)
proportion_database[nrow(proportion_database), "ID"] <- "Sum"

xlsx_path <- here::here("TextAnalysis", "top10_analysis.xlsx")
xlsx::write.xlsx(frequency_database, file=xlsx_path, sheetName="frequency", row.names=FALSE)
xlsx::write.xlsx(presence_database, file=xlsx_path, sheetName="presence", append=TRUE, row.names=FALSE)
xlsx::write.xlsx(proportion_database, file=xlsx_path, sheetName="proportion", append=TRUE, row.names=FALSE)
write.csv(frequency_database, file=here::here("TextAnalysis", "top10_frequency.csv"), row.names=FALSE)
write.csv(presence_database, file=here::here("TextAnalysis", "top10_presence.csv"), row.names=FALSE)
write.csv(proportion_database, file=here::here("TextAnalysis", "top10_proportion.csv"), row.names=FALSE)

# Upload xlsx to Google Drive
# Google Drive folder id
id_googledrive <- "1BUsYYd11lE2TECqHru5tX6LwkMniVnKv"
googledrive::drive_upload(media = xlsx_path, path = as_id(id_googledrive), overwrite = TRUE, type="spreadsheet")
authorize_GoogleDrive <- FALSE
if (authorize_GoogleDrive) {
# Google Drive folder id
id_googledrive <- "1BUsYYd11lE2TECqHru5tX6LwkMniVnKv"
googledrive::drive_upload(media = xlsx_path, path = as_id(id_googledrive), overwrite = TRUE, type="spreadsheet")
}
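Enabling authorize_GoogleDrive assumes an authenticated googledrive session and that xlsx_path points at an existing file. A minimal pre-step, hedged because the account and scopes depend on the user:

googledrive::drive_auth()  # run once interactively; opens a browser prompt to grant Drive access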

# Plot results ------------------------------------------------------------
xlsx_path <- here::here("TextAnalysis", "top10_analysis.xlsx")
frequency_database <- xlsx::read.xlsx(file=xlsx_path, sheetName="frequency")
presence_database<-xlsx::read.xlsx(file=xlsx_path, sheetName="presence")
proportion_database<-xlsx::read.xlsx(file=xlsx_path, sheetName="proportion")
frequency_database <- read.csv(file=here::here("TextAnalysis", "top10_frequency.csv"))
presence_database<-read.csv(file=here::here("TextAnalysis", "top10_presence.csv"))
proportion_database<-read.csv(file=here::here("TextAnalysis", "top10_proportion.csv"))

colnames(presence_database) <- col_name
data_reshape <- reshape2::melt(
presence_database[, c(2, 4:ncol(presence_database))],
id = c("Science_Center")
id = c("Source")
)
colnames(data_reshape) <- c("Science_Center", "keyword", "value")
colnames(data_reshape) <- c("Source", "keyword", "value")

data_merge <- merge(data_reshape, keyword_map, by= "keyword")

sum_by_group <- aggregate(value ~ keyword+keyword_id+group+Science_Center, data = data_merge, sum)
sum_by_group <- aggregate(value ~ keyword+keyword_id+group+Source, data = data_merge, sum)
sum_by_group <- sum_by_group[order(sum_by_group$keyword_id),]
group <- unique(sum_by_group$group)
figure <- vector(mode="list", length=length(group))

for (i in 1:length(group)){
figure[[i]] <- ggplot(data = sum_by_group[sum_by_group$group==group[i], ], aes(x=keyword_id, y=value, fill=Science_Center)) +
geom_bar(position='dodge', stat='identity') +
labs(
x = group[i],
y = "Frequency"
)+
theme_bw()
}
jpeg(filename = here::here("TextAnalysis", "top10_barplot_sum.jpg"), width=200, height=150, units="mm", res=1200)
ggplot(sum_by_group, aes(fill=Source, y=value, x=keyword_id)) +
geom_bar(position="dodge", stat="identity") +
facet_wrap(~group, scales = "free_x") +
labs(
x = "Term ID",
y = "Frequency"
) +
scale_fill_viridis(discrete = TRUE) +
theme_bw()
dev.off()

mean_by_group <- aggregate(value ~ keyword+keyword_id+group+Source, data = data_merge, mean)
mean_by_group <- mean_by_group[order(mean_by_group$keyword_id),]
group <- unique(mean_by_group$group)

jpeg(filename = here::here("TextAnalysis", "top10_barplot.jpg"), width=300, height=270, units="mm", res=1200)
(figure[[1]] + figure[[2]] + figure[[3]] + figure[[4]] + figure[[5]])/
(figure[[6]] + figure[[7]] + figure[[8]] + figure[[9]] + figure[[10]])
jpeg(filename = here::here("TextAnalysis", "top10_barplot_mean.jpg"), width=200, height=150, units="mm", res=1200)
ggplot(mean_by_group, aes(fill=Source, y=value*100, x=keyword_id)) +
geom_bar(position="dodge", stat="identity") +
facet_wrap(~group, scales = "free_x") +
labs(
x = "Term ID",
y = "Presence (%)"
) +
scale_fill_viridis(discrete = TRUE) +
theme_bw()
dev.off()
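Assuming the presence table stores 0/1 flags per document (consistent with the "Presence (%)" axis label), the mean aggregated by Source is the share of that source's assessments mentioning a term, which the second plot rescales with value*100 to a percentage. A toy check of that arithmetic (illustrative numbers only):

presence <- c(1, 0, 1, 1)  # hypothetical: term found in 3 of 4 documents from one source
mean(presence) * 100       # 75, the value plotted as "Presence (%)"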
Binary file removed TextAnalysis/top10_analysis.xlsx
Binary file removed TextAnalysis/top10_barplot.jpg
Binary file added TextAnalysis/top10_barplot_mean.jpg
Binary file added TextAnalysis/top10_barplot_sum.jpg
