-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_parse.R
67 lines (61 loc) · 1.9 KB
/
1_parse.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# 20/06/2015
# Dom Bennett
# Parsing abstracts (or titles, depending on `slot`) into '1_parsed/'
# DIRS
# Raw inputs are read from data.dir; parsed text files are written to out.dir.
data.dir <- '0_data'
out.dir <- '1_parsed'
# Create the output folder only when nothing exists at that path yet.
if (!file.exists(out.dir)) dir.create(out.dir)
# PARAMETERS
# Which text field to export: 'title' for titles, 'abstracts' for abstracts.
slot <- 'title'
# EXTRACT PREDICTS ABSTRACTS
# Load the PREDICTS bibliography, pick the title column when slot is 'title'
# (the abstract column for any other value) and write it out as bare text.
predicts.pubs <- readRDS(
  file.path(data.dir, "predicts", 'predicts_bib-2015-06-12-02-45-49.rds')
)
output <- predicts.pubs[, if (slot == 'title') 'Source_title' else 'Abstract']
# No quoting and no row/column names in the output file.
write.table(
  output,
  file = file.path(out.dir, 'predicts.txt'),
  quote = FALSE, row.names = FALSE, col.names = FALSE
)
# The full bibliography is not needed past this point.
rm(predicts.pubs)
# EXTRACT NON-PREDICTS ABSTRACTS
# read in as lines -- if 'incomplete final line error' add return at bottom of file
# if (slot == 'abstracts') {
# raw.data <- read.delim (file.path (data.dir, 'non_predicts_abstracts.txt'))[,1]
# } else {
# raw.data <- read.delim (file.path (data.dir, 'non_predicts_title.txt'))[,1]
# }
# lines <- ''
# add <- FALSE
# for (line in raw.data) {
# # go through lines, keeping only the abstract text (from the AB tag up to the TC tag)
# if (grepl ('^AB', line)) {
# add <- TRUE
# line <- sub ('AB', '', line)
# }
# if (grepl ('^TC', line)) {
# add <- FALSE
# }
# if (add) {
# lines <- paste (lines, line)
# }
# }
# write.table (lines, file=file.path (out.dir, 'non_predicts.txt'))
# rm (raw.data)
# EXTRACT LPI ABSTRACTS
# only needs moving
# The input file is chosen with the same rule as the predicts section
# (slot == 'title' -> titles, any other value -> abstracts), so both outputs
# always hold the same kind of text. Previously this section tested
# slot == 'abstracts', which meant a value such as 'abstract' produced
# predicts abstracts alongside LPI titles.
# NOTE(review): read.delim() treats the first line of the file as a header by
# default, and col.names=FALSE below means that line is not written back out --
# confirm the LPI files really start with a header row.
raw.data <- read.delim(
  file.path(data.dir,
            if (slot == 'title') 'lpi_titles.txt' else 'lpi_abstracts.txt')
)[, 1]
# Only the first column is kept; written without quotes or row/column names.
write.table(
  raw.data,
  file = file.path(out.dir, 'lpi.txt'),
  quote = FALSE, row.names = FALSE, col.names = FALSE
)
# EXTRACT PUBMED ABSTRACTS
# USE COPY AND PASTE
# raw.data <- read.delim (file.path (data.dir, 'pubmed_abstracts.txt'))[,1]
# write.table (raw.data, file.path (out.dir, 'pubmed_abstracts.txt'),
# quote=FALSE, row.names=FALSE, col.names=FALSE)