This repository has been archived by the owner on Apr 16, 2023. It is now read-only.
R image processing #107
Open
mfidino wants to merge 11 commits into master from R_image_processing
Changes from all commits (11 commits)
Commits (all by mfidino):

- 06ba7f6 Getting some R stuff on .gitignore
- 26fd36c Create process__predict_example.R
- 3128dd8 Some more assumptions added to the top
- 41dccce made extensions a constant in `find_image_files`
- b3bdbc3 dropped `= NULL` from `proces_images` argument
- 2fefc07 Hugged if statements
- ab19940 Calling functions via their libraries
- 05b93f7 corrupt image fix, referencing issues
- 8491404 tmp_name to tmp_names
- 551b73c two lines between funcitions
- e8dd854 Slayed `NULL` defaults
.gitignore:
@@ -93,3 +93,7 @@ ENV/

# Image files
**/*.JPG

# Some R stuff
.Rproj.user
*.Rproj
process__predict_example.R (new file):
@@ -0,0 +1,230 @@
# Examples of how to make requests against the image classification endpoints
# Note:
#   1. This assumes that the image_classifier_api is running
#      (i.e., using docker run -p 8000:8000 gaganden/autofocus_serve)
#   2. It also assumes that the api address is at 127.0.0.1
#      (which should be the case)
#   3. Assumes that your current working directory is
#      './GitHub/autofocus/autofocus/predict'
#   4. Assumes that the images you are going to send to autofocus have
#      not been preprocessed at all.

# Library requirements:
#   RCurl, jsonlite, dplyr, magick, zip, progress, stringr

library(RCurl)
library(jsonlite)
library(magick)
library(zip)
library(progress)
library(dplyr)

find_image_files <- function(search_dir){
  # Utility function to recursively find all image files
  # starting from a directory

  # Args:
  #   search_dir(character): the starting directory path from which to search

  # Returns:
  #   image_files(list): list containing the paths of all image files found.
  #   Each element in this list is a vector of at most 10 images. This split
  #   is done so that the images can be zipped and sent to autofocus.

  valid_extensions <- c("jpeg", "jpg", "bmp", "png")
  valid_extensions <- c(valid_extensions, toupper(valid_extensions))

  file_list <- list.files(search_dir, recursive = TRUE, full.names = TRUE)
  # anchor the pattern so that only the file extension is matched
  image_files <- file_list[grep(paste0("\\.(",
                                       paste(valid_extensions, collapse = "|"),
                                       ")$"), file_list)]
  image_files <- normalizePath(image_files, winslash = "/")
  # normalize the path, then split into groups of at most 10 images
  n_groups <- ceiling(length(image_files) / 10)
  image_files <- split(image_files,
                       sort(rep_len(1:n_groups, length(image_files))))

  return(image_files)
}

process_images <- function(image_files){
  # Utility function to preprocess images to be sent to autofocus

  # Args:
  #   image_files(list): the output object from find_image_files()

  # Returns:
  #   a list: This list has two elements:
  #   1. zip(character): A vector of the temporary zip files to be sent to
  #      autofocus.
  #   2. dict(named character): a key-value pair that links the temporary
  #      image file to the actual file. The elements in this vector are
  #      the names of the temporary files while the names are the full
  #      paths to the file names.

  if(!is(image_files, 'list')){
    stop('image_files must be a list.')
  }

  if(any(sapply(image_files, length) > 10)){
    stop('One of the elements in image_files has > 10 images.')
  }
  dict_list <- vector('list', length = length(image_files))
  zip_vector <- rep(NA, length(image_files))

  cat(paste('Processing', length(unlist(image_files)), 'images...\n'))

  pb <- progress::progress_bar$new(
    format = "Images processed [:bar] :elapsed | eta: :eta",
    total = length(unlist(image_files)),
    width = 60
  )

  for(photo_group in seq.int(length(image_files))){

    # get paths to files
    image_file_names <- image_files[[photo_group]]

    # files with 0 kb are corrupt, remove them
    file_sizes <- file.size(image_file_names)
    if(any(file_sizes == 0)){
      image_file_names <- image_file_names[-which(file_sizes == 0)]
    }

    # count number of files
    num_files <- length(image_file_names)

    file_pattern <- paste0("file_",
                           stringr::str_pad(1:num_files, width = 2, pad = "0"),
                           "_")
    # make some temporary file names
    tmp_names <- tempfile(pattern = file_pattern,
                          fileext = rep('.jpg', num_files))

    # sort them
    tmp_names <- sort(tmp_names)

    # line up temps to actual photo names
    dict <- sapply(strsplit(tmp_names, "\\\\|/"), function(x) x[length(x)])
    names(dict) <- image_file_names

    # Read in image, crop 198 pixels from the bottom, resize to 512 pixels
    # tall, then save as a temporary image.
    for(image in seq.int(num_files)){
      pb$tick()
      magick::image_read(image_file_names[image]) %>%
        magick::image_crop(., paste0(magick::image_info(.)$width,
                                     "x",
                                     magick::image_info(.)$height - 198)) %>%
        magick::image_resize(., '760x512!') %>%
        magick::image_write(., tmp_names[image])
    }

    # zip the temporary files together
    tmp_zip <- tempfile(fileext = ".zip")
    zip::zipr(tmp_zip, tmp_names)
    dict_list[[photo_group]] <- dict
    zip_vector[photo_group] <- tmp_zip
    if(file.exists(tmp_zip)){
      unlink(tmp_names)
    }
  }

  # return the dictionary and the name of the zipped file.
  return(list(zip = zip_vector, dict = dict_list))
}

post_zips <- function(processed_images,
                      uri = "http://localhost:8000/predict_zip"){
  # send the zip files to autofocus

  # Args:
  #   processed_images(list): the output from process_images()
  #   uri(character): the location where autofocus is running

  # Returns:
  #   response(tibble): A tibble of guesses for each image supplied to
  #   autofocus. The columns, save for the last one, have species names
  #   and represent the likelihood that this species is in the image.
  #   The last column is the file name of the image.
  cat(paste('Posting', length(processed_images$zip),
            'zip file(s) to autofocus...\n'))

  pb <- progress::progress_bar$new(
    format = "Files processed [:bar] :elapsed | eta: :eta",
    total = length(unlist(processed_images$zip)),
    width = 60
  )
  # the object that initially contains the autofocus json
  response <- vector('list', length(processed_images$zip))
  for(zippy in seq.int(length(processed_images$zip))){
    pb$tick()
    # post to autofocus
    response[[zippy]] <- jsonlite::fromJSON(RCurl::postForm(uri,
      file = RCurl::fileUpload(processed_images$zip[zippy]),
      .checkParams = FALSE))

    # get the file names from autofocus
    file_names <- strsplit(names(response[[zippy]]), "/")
    file_names <- sapply(file_names, function(x) x[length(x)])
    file_names <- strsplit(file_names, "_")
    file_names <- as.numeric(sapply(file_names, '[[', 2))
    # and line it up with what we did during image processing
    OG_file_names <- names(processed_images$dict[[zippy]])[file_names]

    # provide a warning just in case autofocus did not ID a specific image
    if(!length(OG_file_names) == length(processed_images$dict[[zippy]])){
      warning(paste('Autofocus did not ID all images in zip file number',
                    zippy))
    }
    # put the file name into each nested list object
    for(image in seq.int(length(response[[zippy]]))){
      response[[zippy]][[image]]$file <- OG_file_names[image]
    }
  }
  # bind the list of lists, then bind the list of tibbles
  response <- lapply(response, dplyr::bind_rows) %>% dplyr::bind_rows()
  return(response)
}

most_likely <- function(response_frame){
  # Utility function that provides the best guess from each classification

  # Args:
  #   response_frame(tibble): the output from post_zips()

  # Returns:
  #   A tibble that has three columns:
  #   1) file: the file name
  #   2) species: the species most likely to be in the image
  #   3) probability: autofocus's confidence in this classification

  # Find which column has the highest likelihood
  best_guess <- apply(response_frame[, -grep('file', colnames(response_frame))],
                      1, which.max)
  # Grab the highest likelihood
  best_prob <- apply(response_frame[, -grep('file', colnames(response_frame))],
                     1, max)
  # Correspond the highest likelihood to a species name
  species_name <- colnames(response_frame)[best_guess]

  # the object to return
  to_return <- dplyr::tibble(file = response_frame$file,
                             species = species_name,
                             probability = best_prob)
  return(to_return)
}

# where the photos are located
search_dir <- "./images/"

all_images <- find_image_files(search_dir)

processed_images <- process_images(all_images)

my_ids <- post_zips(processed_images)

best_ids <- most_likely(my_ids)
Review comments

Comment: Is the double underscore in the filename intentional?

Reply: Nope, that is definitely a typo.
Comment: Regarding the model probabilities: that is something that I was unaware of until you brought it up today (that they were independent). I'm thinking that if we are trying to select the 'best' one, it may make the most sense to divide each probability by the sum of all the probabilities. For example, if we had the probabilities 0.80, 0.20, and 0.75, then we would divide each of those elements by 0.80 + 0.20 + 0.75. That would at least ensure that the relative probabilities sum to 1. On top of this, an image with multiple 'high' probabilities for different classifications (as in the above example) would get down-weighted a bit while those with a single 'high' probability would be penalized less. Finally, I do agree that this could be made MUCH more modular and a library would be a great way to do that.
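The normalization being proposed can be sketched in a few lines of base R. The three probabilities are taken from the example above; the species labels attached to them are made up purely for illustration:

```r
# Independent per-class probabilities from the example above
# (the species names here are hypothetical labels, not model output)
probs <- c(raccoon = 0.80, opossum = 0.20, coyote = 0.75)

# Divide each element by the sum so the relative probabilities sum to 1
normalized <- probs / sum(probs)

round(normalized, 3)
# raccoon: 0.457, opossum: 0.114, coyote: 0.429
```

Note how the two 'high' probabilities (raccoon and coyote) are both pulled well below their original values, which is the down-weighting described above.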
Reply: Why would we want that? The idea behind letting them be independent is that the categories are not actually mutually exclusive, so we should not force them to sum to 1. E.g. {'human': 1, 'dog': 1, ...} is just the right result for an image that contains both humans and dogs.

Reply: While that is true in this specific case, what do you do with an image that is {'raccoon': 0.99, 'coyote': 0.99}? In our decade of camera trapping we've only gotten one image of a coyote and a raccoon, so assuming that both are in the image is a little suspect. Aside from human and dog, the likelihood of getting two unique species is quite low. However, if we allow the probabilities to sum to one, you could just add together the human and dog probabilities (making a human AND dog classifier). At the end of the day, though, these types of summaries can be done after the fact (post autofocus) and we can then make some comparisons about which way performs better.
Reply: If the model gives that result and the image doesn't contain both a raccoon and a coyote, then the model got it wrong. Cases where it is that badly wrong should be extremely rare. At this point, if I saw {'raccoon': 0.99, 'coyote': 0.99} from the model, I would be inclined to believe that the image does contain both a raccoon and a coyote, although my prior probability for that scenario is low.

If the app returns {"human": 1, "dog": 1} (with zeros for other categories), that means that the model is confident that the image contains both a human and a dog. If it returns {"human": .5, "dog": .5} (with zeros for other categories), that means that the model is maximally uncertain about whether the image contains a human and about whether it contains a dog. This is a very important distinction, and we would lose it if we made the numbers sum to one after the fact.

If we trained a multiclass rather than multilabel model so that {"human": 1, "dog": 1} was impossible, then it is hard to say what the model would do on an image that contains both humans and dogs. For instance, in that case {"human": .5, "dog": .5} could mean that the image contains both humans and dogs but the model doesn't have the means to represent that fact, or it could mean that the model is confident that the image contains something but not whether that thing is a human or a dog.

The categories are not mutually exclusive, so treating them as separate labels is the right approach in principle. We can revisit the approach if we find that it doesn't work well in practice, but so far I don't see any reason to think that it wouldn't.
Reply: Let's meet up or jump on a call if what I'm saying isn't clear, or if it is clear but you disagree.