Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Beispiel Code Optimierung calculate_percentile #3

Open
StefZimm opened this issue Jun 15, 2023 · 0 comments
Open

Beispiel Code Optimierung calculate_percentile #3

StefZimm opened this issue Jun 15, 2023 · 0 comments

Comments

@StefZimm
Copy link
Collaborator

StefZimm commented Jun 15, 2023

Urzustand

  • Keine percentile Funktion
  • percentiles nicht als Vektor übergebbar nicht anpassbar
  • Zusammenhängend mit pipes
  percentile.values <- dataset[complete.cases(dataset), ] %>%
    dplyr::group_by_at(dplyr::vars(one_of(columns))) %>%
    dplyr::summarise(
      percentile_10 = round(
        Hmisc::wtd.quantile(
          usedvariable,
          weights = weight,
          probs = .1,
          na.rm = TRUE
        ),
        2
      ),
      percentile_25 = round(
        Hmisc::wtd.quantile(
          usedvariable,
          weights = weight,
          probs = .25,
          na.rm = TRUE
        ),
        2
      ),
      percentile_75 = round(
        Hmisc::wtd.quantile(
          usedvariable,
          weights = weight,
          probs = .75,
          na.rm = TRUE
        ),
        2
      ),
      percentile_90 = round(
        Hmisc::wtd.quantile(
          usedvariable,
          weights = weight,
          probs = .90,
          na.rm = TRUE
        ),
        2
      ),
      percentile_99 = round(
        Hmisc::wtd.quantile(
          usedvariable,
          weights = weight,
          probs = .99,
          na.rm = TRUE
        ),
        2
      ),
      .groups = "drop"
    )

Neuer Zustand:

  • percentile Funktion verwendbar
  • percentiles als Argument in Vectorform übergebbar
  • Verwendbar in Main Funktion
  • Länger aber weniger verschachtelt mit vielen Klammern und pipes
#' @title calculate_percentile
#'
#' @description calculate_percentile calculates percentiles by groups
#'
#' @param dataset data.frame from subset_data function
#' @param grouping_variables Vector with dimension or grouping variables
#' (e.g. c("age_gr", "sex", "education level")) (maximum 3 variables)
#' ("" possible)
#' @param percentile numeric what percetile is calculated (10,25,75,90,99)
#'
#' @return dataset_percentile_values = dataset with percentiles by group
#'
#' @author Stefan Zimmermann, \email{[email protected]}
#'
#'

calculate_percentile <- function(dataset, 
                                 grouping_variables,
                                 percentile) {
  
  dataset <- dataset[complete.cases(dataset), ]
  dataset_grouped <- dplyr::group_by_at(dataset, 
                                        dplyr::vars(one_of(grouping_variables)))
  
  calculate_single_percentile = function(percentile_number) {
    dataset_with_percentiles <- dplyr::summarize(dataset_grouped, 
                                                 value = Hmisc::wtd.quantile(
                                                 usedvariable, 
                                                 weights = weight, 
                                                 probs = percentile_number, 
                                                 na.rm = TRUE)) 
    
    dataset_with_percentiles <-dplyr::mutate(dataset_with_percentiles, 
                                             percentile = percentile_number)
    
    return(dataset_with_percentiles)
  }
  
  percentile_decimal = percentile/100
  
  dataset_percentile_values <- 
    purrr::map_df(percentile_decimal, calculate_single_percentile) 
  
  dataset_percentile_values <- 
    dplyr::mutate(dataset_percentile_values, percentile = percentile*100)
  
  dataset_percentile_values <- 
    tidyr::spread(dataset_percentile_values, percentile, value, sep = "_")
  
  return(dataset_percentile_values)
}

Percentile Funktion nun Teil von großer main Funktion calculate_numeric_statistics

#' @title calculate_numeric_statistics
#'
#' @description Main funtction calculate_numeric_statistics creates aggregated 
#' tables for numeric variables with weighted median, weighted mean, n, 
#' minimum, maximum, percentiles, confidence intervals by groups
#'
#' @param dataset data.frame from subset_data function
#' @param grouping_variables Vector with dimension or grouping variables
#' (e.g. c("age_gr", "sex", "education level")) (maximum 3 variables)
#' ("" possible)
#'
#' @return datatable_numeric = dataset with mean, median, n, percentiles, 
#' confidence interval
#'
#' @author Stefan Zimmermann, \email{[email protected]}
#'
calculate_numeric_statistics <- function(dataset,
                                         grouping_variables) {
  
  columns <- c("year", grouping_variables)
  columns <- columns[columns != ""]
  
  # Calculate weighted mean
  dataset_mean <- calculate_weighted_mean(dataset = dataset,
                                          grouping_variables = columns)
  
  # Calculate number of observations n
  dataset_n <- calculate_n(dataset = dataset, 
                           grouping_variables = columns)
  
  # Calculate sd of mean
  dataset_sd <- calculate_sd(dataset = dataset_n, 
                             grouping_variables = columns)
  
  # Calculate minimum and maximum
  dataset_min_max <- calculate_min_max(dataset = dataset, 
                                       grouping_variables = columns)
  
  # Calculate confideence interval mean with weighted mean n and sd
  dataset_confidence_interval_mean <- calculate_confidence_interval_mean(
    dataset_n = dataset_n, dataset_sd = dataset_sd, dataset_mean = dataset_mean)
  
  # Calculate percentiles 
  percentile <- c(10,25,75,90,99)
  
  dataset_percentile_values <- calculate_percentile(
    dataset = dataset,
    grouping_variables = columns,
    percentile = percentile)
  
  # Calculate confidence interval median
  dataset_confidence_interval_median <- calculate_confidence_interval_median(
    dataset = dataset, 
    grouping_variables = columns)

  datatable_numeric <-
    combine_numeric_statistics(
      grouping_variables = columns,
      dataset_confidence_interval_mean = dataset_confidence_interval_mean,
      dataset_min_max = dataset_min_max,
      dataset_percentile_values = dataset_percentile_values,
      dataset_confidence_interval_median = dataset_confidence_interval_median)
  
  return(datatable_numeric)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant