From cfc14dc317ab4e6e1fb33d3a0f5cd075745628e1 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 26 Jan 2024 12:25:18 -0500 Subject: [PATCH 1/6] refactor: untrack report html files --- .gitignore | 1 + docs/2024/report_2024-01-17.html | 5539 ------------------------------ docs/report.html | 5539 ------------------------------ 3 files changed, 1 insertion(+), 11078 deletions(-) delete mode 100644 docs/2024/report_2024-01-17.html delete mode 100644 docs/report.html diff --git a/.gitignore b/.gitignore index 06f31c6..d80a5b7 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ # data /data/ +/datashare/ # R .Rproj.user diff --git a/docs/2024/report_2024-01-17.html b/docs/2024/report_2024-01-17.html deleted file mode 100644 index ebeb739..0000000 --- a/docs/2024/report_2024-01-17.html +++ /dev/null @@ -1,5539 +0,0 @@ - - - - - - - - - - - - - - - -spacesavers2 🚀 report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - -
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
-

View this report on the web: https://ccbr.github.io/spacesavers2/2024/report_2024-01-17.html

-
library(bslib)
-library(dplyr)
-library(DT)
-library(fontawesome)
-library(ggplot2)
-library(glue)
-library(here)
-library(htmltools)
-library(knitr)
-library(lubridate)
-library(plotly)
-library(purrr)
-library(readr)
-library(rlang)
-library(scales)
-library(shiny)
-library(stringr)
-library(tidyr)
-theme_set(theme_bw())
-
-#' Convert a size expressed in a binary unit to bytes.
-#'
-#' @param x Numeric vector of sizes expressed in `from_unit`.
-#' @param from_unit A single unit name: "B", "KiB", "MiB", "GiB", "TiB" or
-#'   "PiB".
-#' @return `x` converted to bytes; all `NA` when `from_unit` is `NA`.
-to_bytes <- function(x, from_unit) {
-  # power of 1024 that each supported unit stands for
-  unit_exponents <- c(B = 0, KiB = 1, MiB = 2, GiB = 3, TiB = 4, PiB = 5)
-  if (length(from_unit) != 1) {
-    stop("`from_unit` must be a single unit name", call. = FALSE)
-  }
-  if (is.na(from_unit)) {
-    # a unit that could not be parsed cannot be converted; propagate NA
-    return(x * NA_real_)
-  }
-  if (!from_unit %in% names(unit_exponents)) {
-    stop(
-      "Unsupported unit '", from_unit, "'; expected one of: ",
-      paste(names(unit_exponents), collapse = ", "),
-      call. = FALSE
-    )
-  }
-  x * 1024^unit_exponents[[from_unit]]
-}
-#' Convert a size in bytes to a binary unit.
-#'
-#' @param x Numeric vector of sizes in bytes.
-#' @param to_unit A single unit name understood by `to_bytes()`.
-#' @return `x` expressed in `to_unit`.
-from_bytes <- function(x, to_unit) {
-  # Divide by the number of bytes in one `to_unit`. Computing
-  # x * x / to_bytes(x, to_unit) instead yields NaN for x == 0 (0 / 0).
-  x / to_bytes(1, to_unit)
-}
-
-from_bytes_v <- Vectorize(from_bytes) # from_bytes() applied element-wise over its arguments
-to_bytes_v <- Vectorize(to_bytes) # to_bytes() applied element-wise, so each value can carry its own unit
-
-#' Drop rows that belong to service accounts or purely numeric usernames.
-#'
-#' @param dat A data frame containing a username column.
-#' @param usercol Unquoted name of the username column.
-#' @return `dat` without the rows of non-person accounts.
-filter_users <- function(dat, usercol = username) {
-  non_people <- c("allusers", "rpcuser", "slurm")
-  dat %>%
-    filter(
-      !({{ usercol }} %in% non_people), # not actual people
-      # anchored so only names made up solely of digits are removed; an
-      # unanchored "[0-9]" also removes names that merely contain a digit
-      !str_detect({{ usercol }}, "^[0-9]+$") # entirely numeric usernames
-    )
-}
-
-# Report whether the spread of `x` (largest minus smallest value) reaches
-# 10^n_orders_magnitude.
-is_large_range <- function(x, n_orders_magnitude = 5) {
-  threshold <- 10^n_orders_magnitude
-  spread <- diff(range(x))
-  spread >= threshold
-}
-
-#' Bar chart of one metric per user.
-#'
-#' @param dat Data frame with a `username` column and the column named by
-#'   `x_metric`.
-#' @param x_metric Name (string) of the metric column to plot.
-#' @return A ggplot object whose `text` aesthetic holds a per-bar label.
-plot_user_metric <- function(dat, x_metric) {
-  dat %>%
-    ggplot(aes(
-      # `.data[[name]]` refers to a column by the string stored in `name`
-      x = .data[[x_metric]],
-      y = username,
-      fill = .data[[x_metric]],
-      text = glue("{username}\n{.data[[x_metric]]} {x_metric}")
-    )) +
-    geom_col() +
-    # TODO: ggplotly doesn't know what to do with scale::label_log
-    # {if (is_large_range(dat %>% pull(x_metric))) scale_x_log10(labels = label_log(digits = 2)) } +
-    labs(x = x_metric, y = "") +
-    theme(legend.position = "none")
-}
-
-#' Line-and-point chart of one metric over time, one colour per user.
-#'
-#' @param dat Data frame with `date`, `username` and the column named by
-#'   `y_metric`.
-#' @param y_metric Name (string) of the metric column to plot.
-#' @return A ggplot object whose points carry a `text` label.
-plot_metric_time <- function(dat, y_metric) {
-  dat %>%
-    ggplot(aes(
-      x = date,
-      # `.data[[name]]` refers to a column by the string stored in `name`
-      y = .data[[y_metric]],
-      color = username
-    )) +
-    geom_line(alpha = 0.7) +
-    geom_point(aes(text = glue("{username}\n{.data[[y_metric]]} {y_metric}"))) +
-    labs(y = y_metric)
-}
-
-#' Build one navigation panel of per-metric plots for a folder.
-#'
-#' @param dat Summary data with `FolderPath`, `username` and the metric
-#'   columns.
-#' @param folder_path Folder whose rows are plotted.
-#' @param plot_fcn Plotting function called as `plot_fcn(data, metric_name)`.
-#' @param metrics Names of the metric columns; defaults to the global
-#'   `summary_metrics`.
-#' @param n_top Number of top rows kept per metric when choosing users;
-#'   defaults to the global `n_top_users`.
-#' @return A `nav_panel()` titled with the folder, holding one pill per metric.
-panel_summary <- function(dat,
-                          folder_path = "/data/CCBR",
-                          plot_fcn = plot_metric_time,
-                          metrics = summary_metrics,
-                          n_top = n_top_users) {
-  summary_dat_folder <- dat %>%
-    filter(FolderPath == folder_path)
-  # The long table is needed both to choose users and to order them, so it is
-  # built once. OverallScore is negated so that the lowest scores rank first.
-  metrics_long <- summary_dat_folder %>%
-    pivot_longer(all_of(metrics), names_to = "metric") %>%
-    mutate(value_adj = case_when(
-      metric == "OverallScore" ~ -value,
-      TRUE ~ value
-    ))
-  top_users <- metrics_long %>%
-    group_by(metric) %>%
-    slice_max(order_by = value_adj, n = n_top) %>%
-    ungroup() %>%
-    pull(username) %>%
-    unique()
-  plots <- lapply(metrics, function(y_metric) {
-    user_order <- metrics_long %>%
-      filter(username %in% top_users, metric == y_metric) %>%
-      arrange(value_adj) %>%
-      pull(username) %>%
-      unique()
-    plot_dat <- summary_dat_folder
-    # `%in%` yields the single TRUE/FALSE that `if` expects
-    if (y_metric %in% c("TotalBytes", "DuplicateBytes")) {
-      to_unit <- "TiB" # TODO: dynamically set based on range of metric
-      new_metric_name <- glue("{y_metric}_{to_unit}")
-      plot_dat <- plot_dat %>%
-        mutate("{new_metric_name}" := from_bytes(.data[[y_metric]], to_unit))
-      y_metric <- new_metric_name
-    } else if (y_metric %in% c("TotalMeanAge", "DuplicateMeanAge")) {
-      new_metric_name <- glue("{y_metric}_Days")
-      # all_of() marks y_metric as a string that holds a column name
-      plot_dat <- plot_dat %>%
-        rename("{new_metric_name}" := all_of(y_metric))
-      y_metric <- new_metric_name
-    } else if (y_metric %in% c("TotalFiles", "DuplicateFiles")) {
-      new_metric_name <- glue("{y_metric}_Millions")
-      plot_dat <- plot_dat %>%
-        mutate("{new_metric_name}" := .data[[y_metric]] / 10^6)
-      y_metric <- new_metric_name
-    }
-    p <- plot_dat %>%
-      filter(username %in% user_order) %>%
-      mutate(username = factor(username, levels = user_order)) %>%
-      # explicit function rather than passing `digits` through across()'s dots
-      mutate(across(where(is.numeric), function(v) round(v, digits = 2))) %>%
-      plot_fcn(y_metric)
-    nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
-  })
-  nav_panel(
-    title = markdown(glue("`{folder_path}`")),
-    navset_pill_list(!!!plots)
-  )
-}
-
n_top_users <- params$n_top_users # number of top rows per metric that panel_summary() keeps when choosing users
-input_dir <- params$input_dir # here("data")
-aggregated_filetypes <- c("blamematrix", "catalog", "mimeo") # filename patterns excluded from user_dat below
-# TODO: only load last N weeks of data to keep RAM usage reasonably low
-all_files <- tibble(filename = list.dirs(input_dir) %>% # full paths of the files in every directory under input_dir
-  Filter(function(x) {
-    x != input_dir # skip the top-level directory itself
-  }, .) %>%
-  lapply(function(x) {
-    list.files(x, full.names = TRUE)
-  }) %>%
-  unlist())
-user_dat <- all_files %>% # one row per non-aggregated file, with the filename split into pieces
-  filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext"), # dot-separated pieces of the full path, in order
-    too_few = "debug" # add tidyr's debugging columns instead of erroring on names with fewer pieces
-  ) %>%
-  mutate(date = as_date(basename(date))) # the first piece still holds the directory; keep its last component as a date
-
-dates <- user_dat %>%
-  filter(!is.na(date)) %>%
-  pull(date) %>%
-  unique()
-most_recent_date <- dates %>% max()
-
-total_usage_tb <- user_dat %>% # TotalBytes of /data/CCBR from the latest allusers summary, converted to TiB
-  filter(
-    username == "allusers",
-    date == most_recent_date,
-    file == "summary",
-    path == "_data_CCBR"
-  ) %>%
-  pull(filename) %>%
-  read_tsv() %>%
-  filter(FolderPath == "/data/CCBR") %>%
-  mutate(disk_usage_tb = from_bytes(TotalBytes, "TiB")) %>%
-  pull(disk_usage_tb)
-# TODO disk_usage_tb doesn't agree with output from `df`
-
-grubbers_allusers_err <- user_dat %>% # lines of the latest allusers grubbers .err file
-  filter(
-    username == "allusers",
-    date == most_recent_date,
-    file == "grubbers",
-    ext == "err",
-    path == "_data_CCBR"
-  ) %>%
-  pull(filename) %>%
-  read_lines()
-grubbers_message <- grubbers_allusers_err[2] %>% # second line of that file ...
-  str_split(":") %>%
-  unlist() %>%
-  .[3] # ... third ":"-separated field
-
-user_dat <- user_dat %>% filter_users() # from here on user_dat excludes the accounts removed by filter_users()
-usernames <- user_dat %>%
-  pull(username) %>%
-  unique()
-
-summary_dat_recent <- user_dat %>% # summary tables of the most recent date, row-bound, with filename pieces as columns
-  filter(
-    date == most_recent_date, file == "summary"
-  ) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x) %>% mutate(filename = x) # remember which file each row came from
-  }) %>%
-  list_rbind() %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("basepath", "path", "username", "file", "ext")
-  )
-summary_metrics <- summary_dat_recent %>% # names of the numeric columns of summary_dat_recent
-  pivot_longer(where(is.numeric), names_to = "metric") %>%
-  pull(metric) %>%
-  unique()
-
-

Total disk usage

-
disk_usage <- read_tsv(here("results", "disk_usage.txt")) # table whose date, Used, Size and Use% columns are read below
-df_date <- disk_usage %>% # date recorded in the disk usage file
-  pull("date") %>%
-  as_date()
-
-layout_column_wrap( # two value boxes side by side
-  width = 1 / 2,
-  value_box(
-    title = p(fa("hard-drive"), "  Disk space in /data/CCBR"),
-    value = markdown(disk_usage %>% # small table: "Used / Size" and the percentage used
-      mutate(Usage = glue("{Used} / {Size}")) %>%
-      select(Usage, `Use%`) %>%
-      kable()),
-    theme = "warning"
-  ),
-  value_box(
-    title = p(fa("users", prefer_type = "regular"), "  Users"),
-    value = p(glue("{length(usernames)} users as of {format(df_date, '%b %d, %Y')}")), # count comes from usernames, date from the disk usage file
-    theme = "primary"
-  )
-)
-
-
-
-
-
-

- - Disk space in /data/CCBR -

- - - - - - - - - - - - - -
UsageUse%
197T / 200T99%
- -
-
- -
-
-
-
-
-
-

- - Users -

-

36 users as of Oct 17, 2023

-
-
- -
-
-
-
-
-

Summary over time

-

Usage by top users for each spacesavers metric.

-
summary_dat_all <- user_dat %>% # summary tables from every date, row-bound for the time-series panels
-  filter(
-    file == "summary"
-  ) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x) %>% mutate(filename = x) # remember which file each row came from
-  }) %>%
-  list_rbind() %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("basepath", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = str_replace(basepath, ".*/", "") %>% as_date()) # drop everything up to the last "/" and parse the rest as a date
-
-navset_tab( # one tab per folder, each plotting the metrics over time
-  summary_dat_all %>% panel_summary("/data/CCBR", plot_metric_time),
-  summary_dat_all %>% panel_summary("/data/CCBR/rawdata", plot_metric_time),
-  summary_dat_all %>% panel_summary("/data/CCBR/projects", plot_metric_time),
-)
-
- -
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
-
-
-

Most recent summary (2024-01-15)

-

Usage by top users for each spacesavers metric.

-
navset_tab( # one tab per folder, each plotting the most recent metrics per user
-  summary_dat_recent %>% panel_summary("/data/CCBR", plot_user_metric),
-  summary_dat_recent %>% panel_summary("/data/CCBR/rawdata", plot_user_metric),
-  summary_dat_recent %>% panel_summary("/data/CCBR/projects", plot_user_metric),
-)
-
- -
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
-
-
-

Summary table

-
allusers_summary <- all_files %>% # latest allusers summary table, byte columns shown in GiB
-  # fixed() matches the name literally; as a regex its dots would match any
-  # character
-  filter(str_detect(filename, fixed("_data_CCBR.allusers.summary.txt"))) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  # keep only the file(s) of the most recent date
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(read_tsv) %>%
-  list_rbind() %>%
-  mutate(
-    TotalBytes_GiB = round(from_bytes_v(TotalBytes, "GiB"), 2),
-    DuplicateBytes_GiB = round(from_bytes_v(DuplicateBytes, "GiB"), 2),
-    .before = "DuplicateBytes"
-  ) %>%
-  # the raw byte counts are replaced by the rounded GiB columns
-  select(-c(TotalBytes, DuplicateBytes))
-
-card(
-  card_header("Summary across all users"),
-  datatable(allusers_summary, fillContainer = TRUE)
-)
-
-
Summary across all users
-
-
- -
- -
-
-
-

Blame matrix

-
blame_matrix <- all_files %>% # contents of the most recent /data/CCBR blamematrix .tsv file
-  filter(str_detect(filename, "blamematrix")) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "file", "ext") # these filenames are split into four pieces; there is no username piece
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  filter(!is.na(date), file == "blamematrix", ext == "tsv", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>% # keep only the most recent date
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x)
-  }) %>%
-  list_rbind()
-
-card(
-  card_header("Disk usage by user in subdirectories"),
-  datatable(blame_matrix, fillContainer = TRUE)
-)
-
-
Disk usage by user in subdirectories
-
-
- -
- -
-
-
-

Duplicate files

-

Deleting top grubbers will save 6.61 TiB!

-
-

Potential savings per user

-
grub_err <- user_dat %>% # "Deleting ..." messages parsed from the latest per-user grubbers .err files
-  filter_users() %>%
-  filter(!is.na(date), file == "grubbers", ext == "err", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>% # keep only files of the most recent date
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x, col_names = FALSE) %>% # no header row, so columns are named X1, X2, ...
-      mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  filter(str_detect(X1, "Deleting")) %>% # keep only the lines containing "Deleting"
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(
-    date = as_date(basename(date)),
-    grub_msg = str_replace_all(X1, regex("^.*:"), ""), # text after the last ":" on the line
-    savings_value = as.numeric( # the number that follows "save"
-      str_replace_all(
-        grub_msg,
-        regex(".*save ([\\d\\.]*) [\\w!]+"),
-        "\\1"
-      )
-    ),
-    savings_unit = str_replace_all( # the word that follows the number, without the trailing "!"
-      grub_msg,
-      regex(".*save [\\d\\.]* ([\\w]+)!"),
-      "\\1"
-    ),
-    savings_bytes = to_bytes_v(savings_value, savings_unit) # common scale used for sorting below
-  )
-
-user_grub_table <- grub_err %>% # largest potential savings first
-  arrange(desc(savings_bytes)) %>%
-  select(username, savings_value, savings_unit)
-
-card(
-  card_header("Savings per user"),
-  datatable(user_grub_table, fillContainer = TRUE)
-)
-
-
Savings per user
-
-
- -
- -
-
-
-

All high-value duplicates

-
grub_dat <- user_dat %>% # contents of the latest per-user grubbers .tsv files
-  filter_users() %>%
-  filter(!is.na(date), file == "grubbers", ext == "tsv", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>% # keep only files of the most recent date
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x, col_names = FALSE) %>% # no header row, so columns are named X1 ... X5
-      mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  rename(
-    file_hash = X1,
-    file_count = X2,
-    total_disk_usage = X3,
-    single_disk_usage = X4,
-    filepaths = X5
-  ) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  filter_users() %>%
-  separate_wider_delim(total_disk_usage, # split "<value> <unit>" into two columns
-    delim = " ",
-    names = c("total_disk_usage_value", "total_disk_usage_unit"),
-    cols_remove = FALSE
-  ) %>%
-  separate_wider_delim(single_disk_usage, # split "<value> <unit>" into two columns
-    delim = " ",
-    names = c("single_disk_usage_value", "single_disk_usage_unit"),
-    cols_remove = FALSE
-  ) %>%
-  mutate(across(all_of(c("total_disk_usage_value", "single_disk_usage_value")), as.numeric))
-
-top_files <- grub_dat %>%
-  arrange(order_by = desc(total_disk_usage_value)) %>% # NOTE(review): sorts on the number only and ignores total_disk_usage_unit; confirm all rows share one unit
-  select(total_disk_usage_value, username, filepaths) %>%
-  rename(disk_usage_gb = total_disk_usage_value) # NOTE(review): the "gb" name presumes every value is in GiB; confirm
-
-card(card_header("Top files"), datatable(top_files, fillContainer = TRUE))
-
-
Top files
-
-
- -
- -
-

For instructions on how to replace duplicates with hard links, see -the usurp -command in the spacesavers docs.

-
-
- - - - -
- - - - - - - - - - - - - - - - diff --git a/docs/report.html b/docs/report.html deleted file mode 100644 index ebeb739..0000000 --- a/docs/report.html +++ /dev/null @@ -1,5539 +0,0 @@ - - - - - - - - - - - - - - - -spacesavers2 🚀 report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - -
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
-

View this report on the web: https://ccbr.github.io/spacesavers2/2024/report_2024-01-17.html

-
library(bslib)
-library(dplyr)
-library(DT)
-library(fontawesome)
-library(ggplot2)
-library(glue)
-library(here)
-library(htmltools)
-library(knitr)
-library(lubridate)
-library(plotly)
-library(purrr)
-library(readr)
-library(rlang)
-library(scales)
-library(shiny)
-library(stringr)
-library(tidyr)
-theme_set(theme_bw())
-
-#' Convert a size expressed in a binary unit to bytes.
-#'
-#' @param x Numeric vector of sizes expressed in `from_unit`.
-#' @param from_unit A single unit name: "B", "KiB", "MiB", "GiB", "TiB" or
-#'   "PiB".
-#' @return `x` converted to bytes; all `NA` when `from_unit` is `NA`.
-to_bytes <- function(x, from_unit) {
-  # power of 1024 that each supported unit stands for
-  unit_exponents <- c(B = 0, KiB = 1, MiB = 2, GiB = 3, TiB = 4, PiB = 5)
-  if (length(from_unit) != 1) {
-    stop("`from_unit` must be a single unit name", call. = FALSE)
-  }
-  if (is.na(from_unit)) {
-    # a unit that could not be parsed cannot be converted; propagate NA
-    return(x * NA_real_)
-  }
-  if (!from_unit %in% names(unit_exponents)) {
-    stop(
-      "Unsupported unit '", from_unit, "'; expected one of: ",
-      paste(names(unit_exponents), collapse = ", "),
-      call. = FALSE
-    )
-  }
-  x * 1024^unit_exponents[[from_unit]]
-}
-#' Convert a size in bytes to a binary unit.
-#'
-#' @param x Numeric vector of sizes in bytes.
-#' @param to_unit A single unit name understood by `to_bytes()`.
-#' @return `x` expressed in `to_unit`.
-from_bytes <- function(x, to_unit) {
-  # Divide by the number of bytes in one `to_unit`. Computing
-  # x * x / to_bytes(x, to_unit) instead yields NaN for x == 0 (0 / 0).
-  x / to_bytes(1, to_unit)
-}
-
-from_bytes_v <- Vectorize(from_bytes) # from_bytes() applied element-wise over its arguments
-to_bytes_v <- Vectorize(to_bytes) # to_bytes() applied element-wise, so each value can carry its own unit
-
-#' Drop rows that belong to service accounts or purely numeric usernames.
-#'
-#' @param dat A data frame containing a username column.
-#' @param usercol Unquoted name of the username column.
-#' @return `dat` without the rows of non-person accounts.
-filter_users <- function(dat, usercol = username) {
-  non_people <- c("allusers", "rpcuser", "slurm")
-  dat %>%
-    filter(
-      !({{ usercol }} %in% non_people), # not actual people
-      # anchored so only names made up solely of digits are removed; an
-      # unanchored "[0-9]" also removes names that merely contain a digit
-      !str_detect({{ usercol }}, "^[0-9]+$") # entirely numeric usernames
-    )
-}
-
-# Report whether the spread of `x` (largest minus smallest value) reaches
-# 10^n_orders_magnitude.
-is_large_range <- function(x, n_orders_magnitude = 5) {
-  threshold <- 10^n_orders_magnitude
-  spread <- diff(range(x))
-  spread >= threshold
-}
-
-#' Bar chart of one metric per user.
-#'
-#' @param dat Data frame with a `username` column and the column named by
-#'   `x_metric`.
-#' @param x_metric Name (string) of the metric column to plot.
-#' @return A ggplot object whose `text` aesthetic holds a per-bar label.
-plot_user_metric <- function(dat, x_metric) {
-  dat %>%
-    ggplot(aes(
-      # `.data[[name]]` refers to a column by the string stored in `name`
-      x = .data[[x_metric]],
-      y = username,
-      fill = .data[[x_metric]],
-      text = glue("{username}\n{.data[[x_metric]]} {x_metric}")
-    )) +
-    geom_col() +
-    # TODO: ggplotly doesn't know what to do with scale::label_log
-    # {if (is_large_range(dat %>% pull(x_metric))) scale_x_log10(labels = label_log(digits = 2)) } +
-    labs(x = x_metric, y = "") +
-    theme(legend.position = "none")
-}
-
-#' Line-and-point chart of one metric over time, one colour per user.
-#'
-#' @param dat Data frame with `date`, `username` and the column named by
-#'   `y_metric`.
-#' @param y_metric Name (string) of the metric column to plot.
-#' @return A ggplot object whose points carry a `text` label.
-plot_metric_time <- function(dat, y_metric) {
-  dat %>%
-    ggplot(aes(
-      x = date,
-      # `.data[[name]]` refers to a column by the string stored in `name`
-      y = .data[[y_metric]],
-      color = username
-    )) +
-    geom_line(alpha = 0.7) +
-    geom_point(aes(text = glue("{username}\n{.data[[y_metric]]} {y_metric}"))) +
-    labs(y = y_metric)
-}
-
-#' Build one navigation panel of per-metric plots for a folder.
-#'
-#' @param dat Summary data with `FolderPath`, `username` and the metric
-#'   columns.
-#' @param folder_path Folder whose rows are plotted.
-#' @param plot_fcn Plotting function called as `plot_fcn(data, metric_name)`.
-#' @param metrics Names of the metric columns; defaults to the global
-#'   `summary_metrics`.
-#' @param n_top Number of top rows kept per metric when choosing users;
-#'   defaults to the global `n_top_users`.
-#' @return A `nav_panel()` titled with the folder, holding one pill per metric.
-panel_summary <- function(dat,
-                          folder_path = "/data/CCBR",
-                          plot_fcn = plot_metric_time,
-                          metrics = summary_metrics,
-                          n_top = n_top_users) {
-  summary_dat_folder <- dat %>%
-    filter(FolderPath == folder_path)
-  # The long table is needed both to choose users and to order them, so it is
-  # built once. OverallScore is negated so that the lowest scores rank first.
-  metrics_long <- summary_dat_folder %>%
-    pivot_longer(all_of(metrics), names_to = "metric") %>%
-    mutate(value_adj = case_when(
-      metric == "OverallScore" ~ -value,
-      TRUE ~ value
-    ))
-  top_users <- metrics_long %>%
-    group_by(metric) %>%
-    slice_max(order_by = value_adj, n = n_top) %>%
-    ungroup() %>%
-    pull(username) %>%
-    unique()
-  plots <- lapply(metrics, function(y_metric) {
-    user_order <- metrics_long %>%
-      filter(username %in% top_users, metric == y_metric) %>%
-      arrange(value_adj) %>%
-      pull(username) %>%
-      unique()
-    plot_dat <- summary_dat_folder
-    # `%in%` yields the single TRUE/FALSE that `if` expects
-    if (y_metric %in% c("TotalBytes", "DuplicateBytes")) {
-      to_unit <- "TiB" # TODO: dynamically set based on range of metric
-      new_metric_name <- glue("{y_metric}_{to_unit}")
-      plot_dat <- plot_dat %>%
-        mutate("{new_metric_name}" := from_bytes(.data[[y_metric]], to_unit))
-      y_metric <- new_metric_name
-    } else if (y_metric %in% c("TotalMeanAge", "DuplicateMeanAge")) {
-      new_metric_name <- glue("{y_metric}_Days")
-      # all_of() marks y_metric as a string that holds a column name
-      plot_dat <- plot_dat %>%
-        rename("{new_metric_name}" := all_of(y_metric))
-      y_metric <- new_metric_name
-    } else if (y_metric %in% c("TotalFiles", "DuplicateFiles")) {
-      new_metric_name <- glue("{y_metric}_Millions")
-      plot_dat <- plot_dat %>%
-        mutate("{new_metric_name}" := .data[[y_metric]] / 10^6)
-      y_metric <- new_metric_name
-    }
-    p <- plot_dat %>%
-      filter(username %in% user_order) %>%
-      mutate(username = factor(username, levels = user_order)) %>%
-      # explicit function rather than passing `digits` through across()'s dots
-      mutate(across(where(is.numeric), function(v) round(v, digits = 2))) %>%
-      plot_fcn(y_metric)
-    nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
-  })
-  nav_panel(
-    title = markdown(glue("`{folder_path}`")),
-    navset_pill_list(!!!plots)
-  )
-}
-
n_top_users <- params$n_top_users # number of top rows per metric that panel_summary() keeps when choosing users
-input_dir <- params$input_dir # here("data")
-aggregated_filetypes <- c("blamematrix", "catalog", "mimeo") # filename patterns excluded from user_dat below
-# TODO: only load last N weeks of data to keep RAM usage reasonably low
-all_files <- tibble(filename = list.dirs(input_dir) %>% # full paths of the files in every directory under input_dir
-  Filter(function(x) {
-    x != input_dir # skip the top-level directory itself
-  }, .) %>%
-  lapply(function(x) {
-    list.files(x, full.names = TRUE)
-  }) %>%
-  unlist())
-user_dat <- all_files %>% # one row per non-aggregated file, with the filename split into pieces
-  filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext"), # dot-separated pieces of the full path, in order
-    too_few = "debug" # add tidyr's debugging columns instead of erroring on names with fewer pieces
-  ) %>%
-  mutate(date = as_date(basename(date))) # the first piece still holds the directory; keep its last component as a date
-
-dates <- user_dat %>%
-  filter(!is.na(date)) %>%
-  pull(date) %>%
-  unique()
-most_recent_date <- dates %>% max()
-
-total_usage_tb <- user_dat %>% # TotalBytes of /data/CCBR from the latest allusers summary, converted to TiB
-  filter(
-    username == "allusers",
-    date == most_recent_date,
-    file == "summary",
-    path == "_data_CCBR"
-  ) %>%
-  pull(filename) %>%
-  read_tsv() %>%
-  filter(FolderPath == "/data/CCBR") %>%
-  mutate(disk_usage_tb = from_bytes(TotalBytes, "TiB")) %>%
-  pull(disk_usage_tb)
-# TODO disk_usage_tb doesn't agree with output from `df`
-
-grubbers_allusers_err <- user_dat %>% # lines of the latest allusers grubbers .err file
-  filter(
-    username == "allusers",
-    date == most_recent_date,
-    file == "grubbers",
-    ext == "err",
-    path == "_data_CCBR"
-  ) %>%
-  pull(filename) %>%
-  read_lines()
-grubbers_message <- grubbers_allusers_err[2] %>% # second line of that file ...
-  str_split(":") %>%
-  unlist() %>%
-  .[3] # ... third ":"-separated field
-
-user_dat <- user_dat %>% filter_users() # from here on user_dat excludes the accounts removed by filter_users()
-usernames <- user_dat %>%
-  pull(username) %>%
-  unique()
-
-summary_dat_recent <- user_dat %>% # summary tables of the most recent date, row-bound, with filename pieces as columns
-  filter(
-    date == most_recent_date, file == "summary"
-  ) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x) %>% mutate(filename = x) # remember which file each row came from
-  }) %>%
-  list_rbind() %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("basepath", "path", "username", "file", "ext")
-  )
-summary_metrics <- summary_dat_recent %>% # names of the numeric columns of summary_dat_recent
-  pivot_longer(where(is.numeric), names_to = "metric") %>%
-  pull(metric) %>%
-  unique()
-
-

Total disk usage

-
disk_usage <- read_tsv(here("results", "disk_usage.txt")) # table whose date, Used, Size and Use% columns are read below
-df_date <- disk_usage %>% # date recorded in the disk usage file
-  pull("date") %>%
-  as_date()
-
-layout_column_wrap( # two value boxes side by side
-  width = 1 / 2,
-  value_box(
-    title = p(fa("hard-drive"), "  Disk space in /data/CCBR"),
-    value = markdown(disk_usage %>% # small table: "Used / Size" and the percentage used
-      mutate(Usage = glue("{Used} / {Size}")) %>%
-      select(Usage, `Use%`) %>%
-      kable()),
-    theme = "warning"
-  ),
-  value_box(
-    title = p(fa("users", prefer_type = "regular"), "  Users"),
-    value = p(glue("{length(usernames)} users as of {format(df_date, '%b %d, %Y')}")), # count comes from usernames, date from the disk usage file
-    theme = "primary"
-  )
-)
-
-
-
-
-
-

- - Disk space in /data/CCBR -

- - - - - - - - - - - - - -
UsageUse%
197T / 200T99%
- -
-
- -
-
-
-
-
-
-

- - Users -

-

36 users as of Oct 17, 2023

-
-
- -
-
-
-
-
-

Summary over time

-

Usage by top users for each spacesavers metric.

-
summary_dat_all <- user_dat %>% # summary tables from every date, row-bound for the time-series panels
-  filter(
-    file == "summary"
-  ) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x) %>% mutate(filename = x) # remember which file each row came from
-  }) %>%
-  list_rbind() %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("basepath", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = str_replace(basepath, ".*/", "") %>% as_date()) # drop everything up to the last "/" and parse the rest as a date
-
-navset_tab( # one tab per folder, each plotting the metrics over time
-  summary_dat_all %>% panel_summary("/data/CCBR", plot_metric_time),
-  summary_dat_all %>% panel_summary("/data/CCBR/rawdata", plot_metric_time),
-  summary_dat_all %>% panel_summary("/data/CCBR/projects", plot_metric_time),
-)
-
- -
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
-
-
-

Most recent summary (2024-01-15)

-

Usage by top users for each spacesavers metric.

-
navset_tab( # one tab per folder, each plotting the most recent metrics per user
-  summary_dat_recent %>% panel_summary("/data/CCBR", plot_user_metric),
-  summary_dat_recent %>% panel_summary("/data/CCBR/rawdata", plot_user_metric),
-  summary_dat_recent %>% panel_summary("/data/CCBR/projects", plot_user_metric),
-)
-
- -
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
- -
-
-
-
TotalBytes_TiB
-
- -
-
-
DuplicateBytes_TiB
-
- -
-
-
PercentDuplicateBytes
-
- -
-
-
TotalFiles_Millions
-
- -
-
-
DuplicateFiles_Millions
-
- -
-
-
PercentDuplicateFiles
-
- -
-
-
TotalMeanAge_Days
-
- -
-
-
DuplicateMeanAge_Days
-
- -
-
-
AgeScore
-
- -
-
-
DupScore
-
- -
-
-
OccScore
-
- -
-
-
OverallScore
-
- -
-
-
-
-
-
-
-
-
-

Summary table

-
allusers_summary <- all_files %>% # latest allusers summary table, byte columns shown in GiB
-  # fixed() matches the name literally; as a regex its dots would match any
-  # character
-  filter(str_detect(filename, fixed("_data_CCBR.allusers.summary.txt"))) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  # keep only the file(s) of the most recent date
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(read_tsv) %>%
-  list_rbind() %>%
-  mutate(
-    TotalBytes_GiB = round(from_bytes_v(TotalBytes, "GiB"), 2),
-    DuplicateBytes_GiB = round(from_bytes_v(DuplicateBytes, "GiB"), 2),
-    .before = "DuplicateBytes"
-  ) %>%
-  # the raw byte counts are replaced by the rounded GiB columns
-  select(-c(TotalBytes, DuplicateBytes))
-
-card(
-  card_header("Summary across all users"),
-  datatable(allusers_summary, fillContainer = TRUE)
-)
-
-
Summary across all users
-
-
- -
- -
-
-
-

Blame matrix

-
blame_matrix <- all_files %>% # contents of the most recent /data/CCBR blamematrix .tsv file
-  filter(str_detect(filename, "blamematrix")) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "file", "ext") # these filenames are split into four pieces; there is no username piece
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  filter(!is.na(date), file == "blamematrix", ext == "tsv", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>% # keep only the most recent date
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x)
-  }) %>%
-  list_rbind()
-
-card(
-  card_header("Disk usage by user in subdirectories"),
-  datatable(blame_matrix, fillContainer = TRUE)
-)
-
-
Disk usage by user in subdirectories
-
-
- -
- -
-
-
-

Duplicate files

-

Deleting top grubbers will save 6.61 TiB!

-
-

Potential savings per user

-
grub_err <- user_dat %>% # "Deleting ..." messages parsed from the latest per-user grubbers .err files
-  filter_users() %>%
-  filter(!is.na(date), file == "grubbers", ext == "err", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>% # keep only files of the most recent date
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x, col_names = FALSE) %>% # no header row, so columns are named X1, X2, ...
-      mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  filter(str_detect(X1, "Deleting")) %>% # keep only the lines containing "Deleting"
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(
-    date = as_date(basename(date)),
-    grub_msg = str_replace_all(X1, regex("^.*:"), ""), # text after the last ":" on the line
-    savings_value = as.numeric( # the number that follows "save"
-      str_replace_all(
-        grub_msg,
-        regex(".*save ([\\d\\.]*) [\\w!]+"),
-        "\\1"
-      )
-    ),
-    savings_unit = str_replace_all( # the word that follows the number, without the trailing "!"
-      grub_msg,
-      regex(".*save [\\d\\.]* ([\\w]+)!"),
-      "\\1"
-    ),
-    savings_bytes = to_bytes_v(savings_value, savings_unit) # common scale used for sorting below
-  )
-
-user_grub_table <- grub_err %>% # largest potential savings first
-  arrange(desc(savings_bytes)) %>%
-  select(username, savings_value, savings_unit)
-
-card(
-  card_header("Savings per user"),
-  datatable(user_grub_table, fillContainer = TRUE)
-)
-
-
Savings per user
-
-
- -
- -
-
-
-

All high-value duplicates

-
grub_dat <- user_dat %>% # contents of the latest per-user grubbers .tsv files
-  filter_users() %>%
-  filter(!is.na(date), file == "grubbers", ext == "tsv", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>% # keep only files of the most recent date
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x, col_names = FALSE) %>% # no header row, so columns are named X1 ... X5
-      mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  rename(
-    file_hash = X1,
-    file_count = X2,
-    total_disk_usage = X3,
-    single_disk_usage = X4,
-    filepaths = X5
-  ) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  filter_users() %>%
-  separate_wider_delim(total_disk_usage, # split "<value> <unit>" into two columns
-    delim = " ",
-    names = c("total_disk_usage_value", "total_disk_usage_unit"),
-    cols_remove = FALSE
-  ) %>%
-  separate_wider_delim(single_disk_usage, # split "<value> <unit>" into two columns
-    delim = " ",
-    names = c("single_disk_usage_value", "single_disk_usage_unit"),
-    cols_remove = FALSE
-  ) %>%
-  mutate(across(all_of(c("total_disk_usage_value", "single_disk_usage_value")), as.numeric))
-
-top_files <- grub_dat %>%
-  arrange(order_by = desc(total_disk_usage_value)) %>% # NOTE(review): sorts on the number only and ignores total_disk_usage_unit; confirm all rows share one unit
-  select(total_disk_usage_value, username, filepaths) %>%
-  rename(disk_usage_gb = total_disk_usage_value) # NOTE(review): the "gb" name presumes every value is in GiB; confirm
-
-card(card_header("Top files"), datatable(top_files, fillContainer = TRUE))
-
-
Top files
-
-
- -
- -
-

For instructions on how to replace duplicates with hard links, see -the usurp -command in the spacesavers docs.

-
-
- - - - -
- - - - - - - - - - - - - - - - From d29b00fb884ef3d89d2dea7104039ee0d54892d0 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 26 Jan 2024 13:11:43 -0500 Subject: [PATCH 2/6] refactor: save report to datashare --- bin/render.R | 2 +- bin/render_report_biowulf.sh | 12 ++++++------ report.Rmd | 10 +++++----- src/send_email.py | 14 +++++++++----- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/bin/render.R b/bin/render.R index c0ece17..ebb8ae1 100644 --- a/bin/render.R +++ b/bin/render.R @@ -1,5 +1,5 @@ #!/usr/bin/env Rscript rmarkdown::render("report.Rmd", - output_file = "docs/report.html", + output_file = "datashare/report.html", params = list(input_dir = "data") ) diff --git a/bin/render_report_biowulf.sh b/bin/render_report_biowulf.sh index 5539026..c93a2df 100644 --- a/bin/render_report_biowulf.sh +++ b/bin/render_report_biowulf.sh @@ -6,19 +6,19 @@ SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS today=$(date +'%Y-%m-%d') year=$(date +'%Y') -mkdir -p docs/$year -html_filename="docs/${year}/spacesavers2-report_${today}.html" +mkdir -p datashare/$year +html_filename="datashare/${year}/spacesavers2-report_${today}.html" recipient_email="kelly.sovacool@nih.gov,vishal.koparde@nih.gov" +url=https://hpc.nih.gov/~sovacoolkl/spacesavers2/${year}/spacesavers2-report_${today}.html + echo "cd /mnt && \ Rscript bin/render.R && \ - cp docs/report.html $html_filename && \ + cp datashare/report.html $html_filename && \ python src/send_email.py \ $html_filename \ $recipient_email \ " |\ singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash -git add docs -git commit -m 'chore: render report 🤖' -git push +cp -r datashare/* /data/sovacoolkl/datashare/spacesavers2/ \ No newline at end of file diff --git a/report.Rmd b/report.Rmd index 8f256c6..94a5c2c 100644 --- a/report.Rmd +++ b/report.Rmd @@ -13,13 +13,13 @@ params: input_dir: '/data/CCBR_Pipeliner/userdata/spacesavers2/' n_top_users: 
10 knit: (function(inputFile, encoding) { - rmarkdown::render(inputFile, encoding = encoding, output_dir = "docs/") }) + rmarkdown::render(inputFile, encoding = encoding, output_dir = "datashare/") }) --- ```{r setup} knitr::opts_chunk$set(message = FALSE, warning = FALSE) ``` -View this report on the web: `r glue::glue("")` +Download this report (must be logged into NIH VPN): `r glue::glue("")` Notice a bug or want to make a suggestion for this report? [Open an issue](https://github.com/CCBR/spacesavers2/issues) on GitHub. @@ -101,11 +101,11 @@ plot_metric_time <- function(dat, y_metric) { labs(y = y_metric) } -min_bytes_GiB <- 10 +min_user_bytes_GiB <- 10 panel_summary <- function(dat, folder_path = "/data/CCBR", plot_fcn = plot_metric_time, - min_bytes_GiB = min_bytes_GiB) { + min_bytes_GiB = min_user_bytes_GiB) { summary_dat_folder <- dat %>% filter(FolderPath == folder_path) %>% mutate(TotalBytes_GiB = from_bytes(TotalBytes, 'GiB')) %>% @@ -322,7 +322,7 @@ card(ggplotly(p, tooltip = "text")) ## Summary over time Usage by top users for each spacesavers metric. -Only users with at least `r min_bytes_GiB` GiB of total disk usage are shown. +Only users with at least `r min_user_bytes_GiB` GiB of total disk usage are shown. 
```{r summary_over_time} summary_dat_all <- user_dat %>% diff --git a/src/send_email.py b/src/send_email.py index 06512fc..aeb1c59 100644 --- a/src/send_email.py +++ b/src/send_email.py @@ -3,11 +3,11 @@ """ Email the html report Usage: - python src/send_email.py + python src/send_email.py Example: - python src/send_email.py docs/report.html kelly.sovacool@nih.gov - python src/send_email.py docs/2024/report_2024-01-17.html kelly.sovacool@nih.gov,vishal.koparde@nih.gov + python src/send_email.py docs/report.html https://hpc.nih.gov/~sovacoolkl/spacesavers2/report.html kelly.sovacool@nih.gov + python src/send_email.py docs/2024/spacesavers2-report_2024-01-17.html https://hpc.nih.gov/~sovacoolkl/spacesavers2/2024/spacesavers2-report_2024-01-17.html kelly.sovacool@nih.gov,vishal.koparde@nih.gov """ @@ -44,11 +44,15 @@ def send_email( if __name__ == "__main__": + # TODO switch to click for argument parsing if this starts to get any more complicated html_filename = sys.argv[1] if len(sys.argv) > 1 else '' - recipient_addr = sys.argv[2] if len(sys.argv) > 2 else 'kelly.sovacool@nih.gov' + url = sys.argv[2] if len(sys.argv) > 2 else '' + recipient_addr = sys.argv[3] if len(sys.argv) > 3 else 'kelly.sovacool@nih.gov' + + download_text = f"Download the attached report or from {url}\n" if url else '' send_email( subject=f"🚀 spacesavers2 report", recipient=recipient_addr, - plain_text = f"Download the attached report or view it at https://ccbr.github.io/spacesavers2/{html_filename.strip('docs/')}\n\nThis is an automated email.", + plain_text = f"{download_text}\n\nThis is an automated email.", html_attach = html_filename ) From 19179d1904920c0984be8343bc21e353181d3404 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 26 Jan 2024 13:17:11 -0500 Subject: [PATCH 3/6] fix: send url in email --- bin/render_report_biowulf.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/render_report_biowulf.sh b/bin/render_report_biowulf.sh index c93a2df..c4626a5 100644 --- 
a/bin/render_report_biowulf.sh +++ b/bin/render_report_biowulf.sh @@ -10,6 +10,7 @@ mkdir -p datashare/$year html_filename="datashare/${year}/spacesavers2-report_${today}.html" recipient_email="kelly.sovacool@nih.gov,vishal.koparde@nih.gov" +# TODO switch this to ~CCBR_Pipeliner after we create the datashare there url=https://hpc.nih.gov/~sovacoolkl/spacesavers2/${year}/spacesavers2-report_${today}.html echo "cd /mnt && \ @@ -17,6 +18,7 @@ echo "cd /mnt && \ cp datashare/report.html $html_filename && \ python src/send_email.py \ $html_filename \ + $url \ $recipient_email \ " |\ singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash From 7ff4dce36545387fef376ae0a77c23582e3863f1 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 29 Jan 2024 15:23:32 -0500 Subject: [PATCH 4/6] fix: move df command to separate bash script code chunk in Rmd wont run from singularity since it wont have access to /data/ccbr --- bin/disk_usage.sh | 9 +++++++++ bin/render_report_biowulf.sh | 3 +++ report.Rmd | 11 ----------- 3 files changed, 12 insertions(+), 11 deletions(-) create mode 100755 bin/disk_usage.sh mode change 100644 => 100755 bin/render_report_biowulf.sh diff --git a/bin/disk_usage.sh b/bin/disk_usage.sh new file mode 100755 index 0000000..d1cc0df --- /dev/null +++ b/bin/disk_usage.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +today=`date +"%Y-%m-%d %H:%M:%S"` + +df /data/CCBR |\ + awk -v today="$today" 'NR==1{$(NF+1)="datetime"} NR>1{$(NF+1)=today}1' |\ + sed -E 's/Mounted on/Mounted_on/' |\ + sed -E 's/ +/\t/g' |\ + tail -n 1 \ + >> results/disk_usage.tsv \ No newline at end of file diff --git a/bin/render_report_biowulf.sh b/bin/render_report_biowulf.sh old mode 100644 new mode 100755 index 5539026..87428e1 --- a/bin/render_report_biowulf.sh +++ b/bin/render_report_biowulf.sh @@ -10,6 +10,9 @@ mkdir -p docs/$year html_filename="docs/${year}/spacesavers2-report_${today}.html" 
recipient_email="kelly.sovacool@nih.gov,vishal.koparde@nih.gov" +# update disk usage +bash bin/disk_usage.sh +# render report and send via email echo "cd /mnt && \ Rscript bin/render.R && \ cp docs/report.html $html_filename && \ diff --git a/report.Rmd b/report.Rmd index c6216c9..32c5600 100644 --- a/report.Rmd +++ b/report.Rmd @@ -252,17 +252,6 @@ summary_metrics <- summary_dat_recent %>% ## Total disk usage -```{sh df, eval=dir.exists('/data/CCBR'), echo = FALSE} -today=`date +"%Y-%m-%d %H:%M:%S"` -df /data/CCBR |\ - awk -v today="$today" 'NR==1{$(NF+1)="datetime"} NR>1{$(NF+1)=today}1' |\ - sed -E 's/Mounted on/Mounted_on/' |\ - sed -E 's/ +/\t/g' |\ - # remove header and append to keep track over time - tail -n 1 |\ - >> results/disk_usage.tsv -``` - ```{r disk_usage_latest} disk_usage <- read_tsv(here("results", "disk_usage.tsv")) %>% mutate(used_tib = from_bytes(to_bytes(Used, "KiB"), "TiB"), From c5eed3981049f8c1480971140ec661d4d9a8b8ab Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 29 Jan 2024 15:23:49 -0500 Subject: [PATCH 5/6] chore: make script executable --- bin/render.R | 0 bin/render.sh | 1 + 2 files changed, 1 insertion(+) mode change 100644 => 100755 bin/render.R mode change 100644 => 100755 bin/render.sh diff --git a/bin/render.R b/bin/render.R old mode 100644 new mode 100755 diff --git a/bin/render.sh b/bin/render.sh old mode 100644 new mode 100755 index 33a5cf9..6c3242a --- a/bin/render.sh +++ b/bin/render.sh @@ -4,6 +4,7 @@ module load singularity SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS +# render report echo "cd /mnt && \ Rscript bin/render.R \ " |\ From 4982175a000420cd74ecfe8b4d68216aef5f0986 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 30 Jan 2024 14:32:49 -0500 Subject: [PATCH 6/6] fix: update datashare location to CCBR_Pipeliner --- bin/render_report_biowulf.sh | 6 ++---- src/send_email.py | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/bin/render_report_biowulf.sh 
b/bin/render_report_biowulf.sh index 0918664..7490f3b 100755 --- a/bin/render_report_biowulf.sh +++ b/bin/render_report_biowulf.sh @@ -10,8 +10,7 @@ mkdir -p datashare/$year html_filename="datashare/${year}/spacesavers2-report_${today}.html" recipient_email="kelly.sovacool@nih.gov,vishal.koparde@nih.gov" -# TODO switch this to ~CCBR_Pipeliner after we create the datashare there -url=https://hpc.nih.gov/~sovacoolkl/spacesavers2/${year}/spacesavers2-report_${today}.html +url=https://hpc.nih.gov/~CCBR_Pipeliner/spacesavers2/${year}/spacesavers2-report_${today}.html # update disk usage bash bin/disk_usage.sh @@ -26,5 +25,4 @@ echo "cd /mnt && \ " |\ singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash -# TODO switch this to CCBR_Pipeliner after we create the datashare there -cp -r datashare/* /data/sovacoolkl/datashare/spacesavers2/ \ No newline at end of file +cp -r datashare/* /data/CCBR_Pipeliner/datashare/spacesavers2/ \ No newline at end of file diff --git a/src/send_email.py b/src/send_email.py index aeb1c59..9d8fd3c 100644 --- a/src/send_email.py +++ b/src/send_email.py @@ -6,8 +6,8 @@ python src/send_email.py Example: - python src/send_email.py docs/report.html https://hpc.nih.gov/~sovacoolkl/spacesavers2/report.html kelly.sovacool@nih.gov - python src/send_email.py docs/2024/spacesavers2-report_2024-01-17.html https://hpc.nih.gov/~sovacoolkl/spacesavers2/2024/spacesavers2-report_2024-01-17.html kelly.sovacool@nih.gov,vishal.koparde@nih.gov + python src/send_email.py docs/report.html https://hpc.nih.gov/~CCBR_Pipeliner/spacesavers2/report.html kelly.sovacool@nih.gov + python src/send_email.py docs/2024/spacesavers2-report_2024-01-17.html https://hpc.nih.gov/~CCBR_Pipeliner/spacesavers2/2024/spacesavers2-report_2024-01-17.html kelly.sovacool@nih.gov,vishal.koparde@nih.gov """