-
Notifications
You must be signed in to change notification settings - Fork 0
/
research-software-alliance.R
61 lines (48 loc) · 1.73 KB
/
research-software-alliance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# text cleaning script using textclean library
# see https://github.com/trinker/textclean#functions for instructions
# Tina Keil, [email protected], February 2022
# Session-wide options, set in a single call:
#  * java.parameters: blog data can be big, so the Java heap limit is raised.
#    Adjust the -Xmx value to the RAM available on your machine!
#  * scipen: a large penalty so numbers are not shown in scientific notation.
options(
  java.parameters = "-Xmx8000m", # 8GB ram
  scipen = 999                   # turn off scientific notation
)
#load required libraries
library(dplyr)
library(filesstrings)
library(lubridate) #for converting dates
library(data.table) #fread is much faster for reading csv
library(readr)
library(textclean)
library(beepr)
# Set the working directory to the directory of this script, so that the
# relative paths used below ("functions.R", "originals/", "cleaned/") resolve.
#  * Inside RStudio: ask the editor for the path of the open source file.
#  * Otherwise (e.g. `Rscript research-software-alliance.R`): take the script
#    location from the `--file=` command-line argument.
#  * If neither is available, keep the current working directory.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  path <- dirname(rstudioapi::getSourceEditorContext()$path)
} else {
  file_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
  if (length(file_arg) > 0) {
    path <- dirname(sub("^--file=", "", file_arg[1]))
  } else {
    path <- getwd()
  }
}
setwd(path)
# Load helper functions from functions.R in the script directory.
source("functions.R")
############ settings ##############
# Input/output locations. All paths are relative and are resolved against the
# current working directory.
in_file <- "research-software-alliance.csv" # name of file to import
out_name <- tools::file_path_sans_ext(in_file) # input name without extension
# name of file after cleaning
out_file <- file.path("cleaned", paste0("clean_", out_name, ".csv"))
# out_file already carries the ".csv" extension, so only append it when it is
# missing; appending unconditionally produced a name ending in ".csv.csv".
out_csv <- if (endsWith(out_file, ".csv")) out_file else paste0(out_file, ".csv")
infilepath <- file.path("originals", in_file) # location of the raw input file
############## process ##############
# Pipeline: read the raw CSV, normalise the publication date and author name,
# clean title and body text, then pass the assembled data frame to save2file().
# start_time(), cleantext(), save2file() and show_alltime() are not defined in
# this script -- presumably they come from functions.R; verify there.
now <- start_time()

# get data from csv file; abort early if it is missing
if (!file.exists(infilepath)) {
  stop("Can't find input file. Please check.")
}
# NOTE(review): the column name `article.href` may depend on read.csv()'s
# automatic name fixing (e.g. from a header such as `article-href`) -- confirm
# before replacing read.csv() with another reader such as fread().
raw_data <- read.csv(infilepath, sep = ",")

url <- raw_data$article.href

# from July 29, 2021 -> 2021-07-29
cat("* Converting date\n")
pubdate <- raw_data$published
if (length(pubdate) > 0) {
  # Parse month-day-year text (also accepting dates already in year-month-day
  # order) and rewrite as ISO yyyy-mm-dd. Values that cannot be parsed are
  # kept exactly as they were read, so no information is lost.
  # NOTE(review): month-name parsing is presumably locale dependent -- confirm
  # on machines with a non-English LC_TIME.
  published_chr <- as.character(pubdate)
  parsed <- as.Date(
    lubridate::parse_date_time(published_chr, orders = c("mdy", "ymd"), quiet = TRUE)
  )
  pubdate <- ifelse(is.na(parsed), published_chr, format(parsed, "%Y-%m-%d"))
}

# reformat author: put a space after every dot, then convert to title case.
# The stringr calls are namespace-qualified because no library() call in this
# script attaches stringr directly.
author <- stringr::str_replace_all(raw_data$author, "\\.", ". ")
author <- stringr::str_to_title(author)

# clean text
cat("* Starting to process title and body\n")
title <- cleantext(raw_data$title)
content <- cleantext(raw_data$content)

# add to new data frame (column names are taken from the variable names)
clean_data <- data.frame(url, title, content, author, pubdate)

save2file(clean_data, out_name, out_file)
show_alltime(now, out_name)