-
Notifications
You must be signed in to change notification settings - Fork 0
/
converter.py
125 lines (95 loc) · 4.35 KB
/
converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import csv
import sys
import ast
from .utilities import Utilities
class Converter(object):
    """Convert an ordered DIP index file into a self-configuring tagged DIP file."""

    def __init__(self):
        # Utilities provides get_file_type_num(), used for the >>FileTypeNum line.
        self.u = Utilities()

    @staticmethod
    def _open_for_csv(path, mode):
        """Open path ('r' or 'w') the way the csv module expects on this Python.

        Python 2's csv module needs binary files; Python 3's needs text files
        opened with newline=''. Neither form translates line endings.
        """
        if sys.version_info[0] >= 3:
            return open(path, mode, newline='')
        return open(path, mode + 'b')

    @staticmethod
    def _maximize_csv_field_limit():
        """Raise the csv field size limit as high as the platform accepts."""
        limit = sys.maxsize
        while True:
            try:
                csv.field_size_limit(limit)
                return
            except OverflowError:
                # sys.maxsize does not fit in a C long on some platforms
                limit //= 10

    def convert_file(self, input_file_path, output_file_path, filename, delimiter, quotechar, doc_type,
                     file_path_field, unique_id, omit, header_mapping, return_file=True):
        """Convert ordered dip file to self-configured

        Consecutive rows that share the same unique id are written as one
        document with several >>FullPath lines.

        Keyword arguments:
        input_file_path -- The path to the input file (excluding filename)
        output_file_path -- The path to the output file (excluding filename)
        filename -- Input and output filename
        delimiter -- Delimiter in the input file
        quotechar -- Quote Character used in the input file (empty or None for the csv default)
        doc_type -- Doc Type used for all documents (empty to leave it out)
        file_path_field -- The header name in the input file that contains the full path to the file
        unique_id -- The header that contains the unique identifier for a document
        omit -- Comma separated list of headers to omit from the input file
        header_mapping -- Body of a dict literal (e.g. "'Date': '>>DocDate'") mapping header name
                          to the name to write out in the output file
        return_file -- When true, return the content of the finished output file

        Prints a message and exits via sys.exit() when either file cannot be opened.
        """
        # set the csv field size limit to max to handle large fields
        self._maximize_csv_field_limit()

        # Headers to leave out of the generic "key: value" lines; blanks are ignored
        omit_set = set(name.strip() for name in (omit or '').split(','))
        omit_set.discard('')

        # Turn the header mapping into a dictionary (literal_eval only accepts literals)
        header_mapping_dict = ast.literal_eval('{' + (header_mapping or '') + '}')

        # Only pass quotechar when one was given so the csv default applies otherwise
        reader_options = {'delimiter': delimiter}
        if quotechar:
            reader_options['quotechar'] = quotechar

        # NOTE: input and output share the same filename, so the two paths must
        # differ or opening the output would truncate the input.
        try:
            in_file = self._open_for_csv(input_file_path + filename, 'r')
        except IOError:
            print("Couldn't open source file")
            sys.exit()

        with in_file:
            try:
                out_file = self._open_for_csv(output_file_path + filename, 'w')
            except IOError:
                print("Couldn't open out file")
                sys.exit()
            with out_file:
                rows = csv.DictReader(in_file, **reader_options)
                self._write_documents(rows, out_file, doc_type, file_path_field, unique_id,
                                      omit_set, header_mapping_dict)

        if return_file:
            # Read in the output file and return it
            with self._open_for_csv(output_file_path + filename, 'r') as finished:
                return finished.read()

    def _write_documents(self, rows, out_file, doc_type, file_path_field, unique_id,
                         omit_set, header_mapping_dict):
        """Write the tagged DIP header and one BEGIN:/END: section per document."""
        out_file.write('>>>>Self Configuring Tagged DIP<<<<' + '\n')

        document_open = False
        last_unique_id = None
        for row in rows:
            # Read these up front so omitting either header cannot break the lookups
            current_unique_id = row[unique_id]
            file_path = row[file_path_field]

            # Same id as the previous row: one more file for the open document
            if document_open and current_unique_id == last_unique_id:
                if file_path:
                    out_file.write('>>FullPath: ' + file_path + '\n')
                continue

            # A new id closes the previous document before the next one begins
            if document_open:
                out_file.write('END:' + '\n')
            out_file.write('BEGIN:' + '\n')
            document_open = True
            last_unique_id = current_unique_id

            # write out mapped values
            # NOTE(review): mapped headers are also written under their original
            # name below unless they are listed in omit -- confirm this is intended.
            for source_header, target_name in header_mapping_dict.items():
                mapped_value = row[source_header]
                if mapped_value is None:  # short row: the column is missing
                    mapped_value = ''
                out_file.write(target_name + ': ' + mapped_value + '\n')

            # write out the file type num (get_file_type_num presumably returns a string)
            out_file.write('>>FileTypeNum: ' + self.u.get_file_type_num(file_path) + '\n')

            # write out doc type name, if needed
            if doc_type:
                out_file.write('>>DocType: ' + doc_type + '\n')

            # write out the rest of the non-empty values
            for key, value in row.items():
                if not value or key == file_path_field or key in omit_set:
                    continue
                out_file.write(str(key) + ': ' + str(value) + '\n')

            # write out the full path
            if file_path:
                out_file.write('>>FullPath: ' + file_path + '\n')

        # Close the last document, if any rows were read
        if document_open:
            out_file.write('END:' + '\n')