-
Notifications
You must be signed in to change notification settings - Fork 0
/
hathifiles2FE.py
115 lines (93 loc) · 3.66 KB
/
hathifiles2FE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# HTRC-FE-metadata-formatter
# ==========================
#
# Reads a hathifiles text file one line at a time and creates HTRC Feature Extraction metadata json files.
#
# hathifiles2FE.py
import sys
import json
import os
import time
import datetime
from collections import OrderedDict
# the hathifile name is the only mandatory argument; without it, show the
# usage line and stop with a non-zero exit status
if len(sys.argv) <= 1:
    print('missing name of hathifile -- Usage: python3 ' + sys.argv[0] + ' hathifile [outDirectory] [startLine] [endLine]')
    sys.exit(1)
hathifile = sys.argv[1]
# default values for the optional arguments: write into the current working
# directory and start from the beginning of the hathifile
outDir = os.getcwd()
startLine = 0
def file_line_count(filename):
    """Return the number of lines in the text file *filename*.

    An empty file yields 0.  (The previous implementation returned ``i+1``
    from an ``enumerate`` loop, which raised ``UnboundLocalError`` when the
    loop body never ran.)

    The file is decoded as UTF-8 with undecodable bytes replaced, because
    only the line boundaries matter here and counting should not fail on a
    stray byte or depend on the platform's default encoding.
    """
    with open(filename, encoding="utf-8", errors="replace") as f:
        return sum(1 for _ in f)
# unless told otherwise, process through the last line of the hathifile
endLine = file_line_count(hathifile)
# an explicitly supplied output directory replaces the current-directory default
if len(sys.argv) >= 3:
    outDir = sys.argv[2]
def is_int(string):
    """Return True if ``int(string)`` succeeds, False otherwise.

    Values that ``int()`` rejects with ``TypeError`` (for example ``None``)
    are reported as not-an-integer instead of propagating the exception, so
    the function can be used as a plain predicate.
    """
    try:
        int(string)
    except (TypeError, ValueError):
        return False
    return True
# use provided start line, if valid; an invalid value is ignored and the
# default is kept.  Line numbers are 1-based and endLine still holds the
# file's line count here, so the last line itself is a legal starting point
# (hence <=, not <).
if len(sys.argv) > 3 and is_int(sys.argv[3]) and int(sys.argv[3]) <= endLine:
    startLine = int(sys.argv[3])
# use provided end line, if valid: it must not precede the start line and
# must be smaller than the line count (the line count is already the default)
if len(sys.argv) > 4 and is_int(sys.argv[4]) and startLine <= int(sys.argv[4]) < endLine:
    endLine = int(sys.argv[4])
def _build_record(row, dateCreated):
    """Return the metadata record written for one hathifile row.

    row is the list of tab-separated fields of one hathifile line; columns
    0 (volume id), 7, 11, 12, 16 and 18 are read.  dateCreated is the batch
    timestamp string stored in the record.
    """
    meta = OrderedDict()
    meta["schemaVersion"] = "1.2"
    meta["dateCreated"] = dateCreated
    meta["title"] = row[11]
    meta["pubDate"] = row[16]
    meta["language"] = row[18]
    meta["htBibUrl"] = "http://catalog.hathitrust.org/api/volumes/full/htid/" + row[0] + ".json"
    meta["handleUrl"] = "http://hdl.handle.net/2027/" + row[0]
    meta["oclc"] = row[7]
    meta["imprint"] = row[12]
    record = OrderedDict()
    record["metadata"] = meta
    return record


# One timestamp identifies the whole batch: it names the log file and is
# stored in every record as "dateCreated".
# NOTE(review): the ':' in the timestamp ends up in the log file name, which
# is not a legal file name character on Windows -- confirm target platforms.
ts = time.time()
batchtime = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H:%M')
# the highest column index read from a row is 18, so a usable row has at
# least 19 tab-separated fields
minColumns = 19
# log progress
logName = os.path.join(outDir, "hathifiles2FE_log_" + batchtime + ".txt")
with open(logName, 'w+') as log:
    print("startLine: " + str(startLine))
    print("endLine: " + str(endLine))
    log.write("startLine: " + str(startLine) + "\n")
    log.write("endLine: " + str(endLine) + "\n")
    # read in one line at a time, write out one file at a time, logging progress
    with open(hathifile, encoding='utf-8') as f:
        for linenum, line in enumerate(f, start=1):
            if linenum >= startLine:
                print("reading line number " + str(linenum) + " ...")
                log.write("reading line number " + str(linenum) + "\n")
                row = line.rstrip('\n').split('\t')
                if len(row) < minColumns:
                    # a blank or truncated line cannot supply the fields
                    # needed below; record it and move on instead of
                    # crashing with an IndexError half-way through a file
                    message = ("skipping line number " + str(linenum) + ": found "
                               + str(len(row)) + " columns, need at least " + str(minColumns))
                    print(message)
                    log.write(message + "\n")
                else:
                    institutionId = row[0].split('.')[0]
                    cleanVolumeId = row[0].replace(':', "+").replace('/', "=")
                    # remove dollar sign if it is present to be compatible with
                    # text zip file names; only the file name is touched so the
                    # path always stays inside the directory created below
                    cleanVolumeId = cleanVolumeId.replace('$', '')
                    institutionDir = os.path.join(outDir, institutionId)
                    os.makedirs(institutionDir, exist_ok=True)
                    outfileName = os.path.join(institutionDir, cleanVolumeId + '.json')
                    print("writing " + outfileName + " ...")
                    log.write("writing " + outfileName + "\n")
                    with open(outfileName, 'w') as outfile:
                        json.dump(_build_record(row, batchtime), outfile)
            # stop once the line numbered endLine has been handled; the
            # with-blocks close both files on the way out
            if linenum >= endLine:
                break
print("done")