-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathupdate.sh
executable file
·146 lines (129 loc) · 3.61 KB
/
update.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/bin/bash
# Required environment variables:
# PATENT_CSVFILE: location of CSV file for maintaining state
# PATENT_DATADIR: destination of downloaded data
# PATENT_LOGFILE: destination of log file
# PATENT_YEAR: year of the patents found in PATENT_DATADIR
# check that all requisite variables are set
# The ":?" expansion aborts the script with the given message when the
# variable is unset or empty, so everything below can rely on these values.
csvfile=${PATENT_CSVFILE:?"Need to set PATENT_CSVFILE to destination of state file."}
datadir=${PATENT_DATADIR:?"Need to set PATENT_DATADIR to desired destination of downloaded files"}
logfile=${PATENT_LOGFILE:?"Need to set PATENT_LOGFILE to location of log file"}
patyear=${PATENT_YEAR:?"Need to set PATENT_YEAR to year of patents found in PATENT_DATADIR"}
# Today's date as YYMMDD; used as the upper bound when looking for new files.
# A single `date` call is used so the year, month and day can never come from
# different days when the script happens to run across midnight.
most_current=$(date +"%y%m%d")
##################################################
#
# Function Definitions
#
##################################################
#######################################
# Check whether a file name is still missing from the CSV state file.
# NOTE: despite the name, the exit status is inverted -- success means the
# name is NOT recorded yet. Callers rely on this to decide whether to append.
# Globals:
#   csvfile (read) - path of the CSV state file; first column is the name
# Arguments:
#   $1 - file name to look for in the first CSV column
# Returns:
#   0 if $1 is not listed in the CSV, 1 if it is
#######################################
function have_file() {
  local name rest
  # `|| [[ -n "$name" ]]` also processes a final line lacking a newline.
  while IFS=, read -r name rest || [[ -n "$name" ]]; do
    if [[ "$name" == "$1" ]]; then
      return 1
    fi
  done < "$csvfile"
  return 0
}
#######################################
# Compute the id that follows a YYMMDD week id.
# Days are enumerated 01..31 for every month (ids for days that do not exist
# are still produced); after day 31 the month advances, and after month 12
# the year advances, so successive ids always increase.
# Globals:
#   week_id (written) - receives the next id as a zero-padded YYMMDD string
# Arguments:
#   $1 - current id: two-digit year, two-digit month, two-digit day
# Returns:
#   0 always
#######################################
function next_filename() {
  local id=$1
  local year month day
  # The 10# prefix forces base 10 so padded fields like "08" are not octal.
  year=$((10#${id:0:2}))
  month=$((10#${id:2:2}))
  day=$((10#${id: -2}))
  if ((day >= 31)); then
    day=1
    if ((month >= 12)); then
      month=1
      year=$(((year + 1) % 100))
    else
      month=$((month + 1))
    fi
  else
    day=$((day + 1))
  fi
  printf -v week_id '%02d%02d%02d' "$year" "$month" "$day"
  return 0
}
#######################################
# Try to download every archive dated after the given CSV entry, up to and
# including today's id.
# Globals:
#   most_current (read)  - upper bound, compared as a string against the id
#   datadir (read)       - directory wget saves into
#   patyear (read)       - year component of the download URL
#   csvfile (appended)   - one "name,timestamp" line per successful download
#   logfile (appended)   - one line per attempted download
#   week_id (written)    - id currently being processed (set via next_filename)
# Arguments:
#   $1 - a CSV line or file name of the form "ipgYYMMDD.zip[,...]"
# Returns:
#   1 if no six-digit id can be derived from $1, otherwise the status of the
#   last command executed
#######################################
function download_next() {
  local status previous
  printf "\e[0m"
  # "ipgYYMMDD.zip,..." -> "YYMMDD": keep the text before the first dot,
  # then drop the three-character prefix.
  week_id=${1%%.*}
  week_id=${week_id:3}
  if [[ ! "$week_id" =~ ^[0-9]{6}$ ]]; then
    echo "download_next: cannot derive a YYMMDD id from '$1'" >&2
    return 1
  fi
  # Start with the id right after the last recorded archive.
  next_filename "$week_id"
  until [[ "$week_id" > "$most_current" ]]; do
    wget -q -P "$datadir" "http://commondatastorage.googleapis.com/patents/grant_full_text/${patyear}/ipg${week_id}.zip"
    status=$?
    if [[ "0" -eq $status ]]; then
      printf "\e[32m"
      echo "=> Downloaded ipg${week_id}.zip"
      printf "\e[0m"
      echo "ipg${week_id}.zip,$(date +"%T@%m-%d-%Y")" >> "$csvfile"
    fi
    echo "Attempted download of ipg${week_id}.zip, max of ${most_current}" >> "$logfile"
    # Advance only after the attempt so that no id is skipped.
    previous=$week_id
    next_filename "$week_id"
    # An id that fails to move forward would make this loop spin forever.
    if [[ "$week_id" < "$previous" || "$week_id" == "$previous" ]]; then
      break
    fi
  done
}
##################################################
#
# Create state files if necessary
#
##################################################
#######################################
# Announce a state path and create it when it does not exist yet.
# Arguments:
#   $1 - label printed in front of the path
#   $2 - path to check/create (quoted throughout, so spaces are safe)
#   $3 - "dir" to create a directory with mkdir -p; anything else creates an
#        empty regular file with touch
# Outputs:
#   the label line, plus a "=> Creating ..." line when something is created
# Returns:
#   0 if the path already exists, otherwise the status of mkdir/touch
#######################################
function ensure_state_path() {
  local label=$1 path=$2 kind=$3
  local missing=0
  printf "\e[34m"
  echo "${label}: ${path}"
  case "$kind" in
    dir) [[ -d "$path" ]] || missing=1 ;;
    *) [[ -f "$path" ]] || missing=1 ;;
  esac
  ((missing)) || return 0
  printf "\e[32m"
  echo "=> Creating ${path}"
  printf "\e[0m"
  if [[ "$kind" == "dir" ]]; then
    mkdir -p -- "$path"
  else
    touch -- "$path"
  fi
}
ensure_state_path "Data directory location" "$datadir" dir
ensure_state_path "CSV file location" "$csvfile" file
ensure_state_path "Logfile location" "$logfile" file
##################################################
#
# Populate CSV file
#
##################################################
#######################################
# Add every .zip archive already present in the data directory to the CSV
# state file, unless it is listed there already.
# Globals:
#   datadir (read)     - directory scanned for *.zip
#   csvfile (appended) - receives "name,timestamp" for each new archive
# Outputs:
#   "=> Found <name>" for each archive that gets recorded
#######################################
function record_existing_archives() {
  local archive found
  # Glob directly instead of parsing `ls`, so odd file names stay intact.
  for archive in "$datadir"/*.zip; do
    # With no match the pattern stays literal; skip it.
    [[ -e "$archive" ]] || continue
    found=${archive##*/}
    # have_file succeeds when the name is NOT yet present in the CSV.
    if have_file "$found"; then
      echo "=> Found $found"
      echo "$found,$(date +"%T@%m-%d-%Y")" >> "$csvfile"
    fi
  done
}
printf "\e[32m"
record_existing_archives
################################################
#
# Download latest files and unzip them all
#
###############################################
#######################################
# Unzip every archive in the data directory whose XML is not there yet.
# An archive "<name>.zip" counts as extracted when "<name>.xml" exists in
# the same directory.
# Globals:
#   datadir (read) - directory holding the archives and receiving the output
#######################################
function extract_archives() {
  local archive found xml
  for archive in "$datadir"/*.zip; do
    # With no match the pattern stays literal; skip it.
    [[ -e "$archive" ]] || continue
    # The glob already yields the full path; only the base name is needed.
    found=${archive##*/}
    xml=${found%.zip}
    if [[ ! -f "$datadir/$xml.xml" ]]; then
      unzip "$datadir/$found" -d "$datadir"
    fi
  done
}
# Download first so that archives fetched in this run are extracted as well.
# The last line of the sorted CSV is the starting point for the download.
latest_entry=$(sort "$csvfile" | tail -n1)
if [[ -n "$latest_entry" ]]; then
  download_next "$latest_entry"
else
  echo "No archive recorded in ${csvfile}; skipping download" >&2
fi
extract_archives
#################################################
#
# Run preprocessor
#
#################################################
# Run the three Python stages in order. The scripts are referenced by
# relative path, so they are resolved against the current working directory.
echo "Parsing..."
# Quoted so a data directory containing spaces reaches parse.py as one argument.
python parse.py -p. -d "$datadir"
echo "Cleaning..."
python clean.py
echo "Consolidating..."
python consolidate.py