-
Notifications
You must be signed in to change notification settings - Fork 2
/
xml2xls.py
174 lines (146 loc) · 5.76 KB
/
xml2xls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import utils.xml as xml
import utils.pandas as pd_util
import argparse
import textwrap
from os.path import splitext
try:
from lxml import etree as et
except ImportError:
try:
import xml.etree.ElementTree as et
except ImportError:
print(" Failed to import ElementTree from any known place")
def convert_and_save(
input_file_name: str,
columns,
output_file_name: str='',
namespace: str='',
root_element_name: str='',
file_format: str='',
preview: bool=False
):
print(' Loading the XML...');
# Load the XML file and parse it
xtree = et.parse(input_file_name)
xroot = xtree.getroot()
print(' ...done.');
if root_element_name is None or root_element_name == '':
data_root = xroot
else:
root_query = root_element_name
data_root = xtree.find(root_query)
if data_root is None:
# TODO: Log
print(' ERROR: Root not found. Query used: {}'.format(root_query))
raise ValueError('Root not found.')
'''
print('Debug #1:')
xml.parse_element(data_root)
print('Debug #2:')
xml.parse_element(data_root[0])
print('Debug #3:')
print(columns)
'''
print(' ');
if columns is None or len(columns) == 0 or (isinstance(columns, list) and columns[0] == '[all]'):
columns = [x.tag for x in data_root[1]]
# TODO: include also attributes
# Convert the file to Pandas dataFrame
print(' Converting to dataframe...')
df = xml.xml_element_to_df(data_root, columns, namespace)
print(' ...done')
if preview:
print(' Output preview:')
print(df.head(10))
print(' Total rows: {}.'.format(len(df)))
# Determine file format and file name
if output_file_name is None or output_file_name == '':
output_file_name, _ = splitext(input_file_name)
if file_format is None or file_format == '':
file_format = 'xlsx'
output_file_name += '.' + file_format
if file_format is None or file_format == '':
_, extension = splitext(output_file_name)
if extension != '' and (file_format is None or file_format == ''):
file_format = extension[1:]
print(' Writing output data to: "{}" ({} format).'.format(
output_file_name, file_format
))
# Save the file
pd_util.save_df_to_file(df, output_file_name)
#df.to_excel(output_file_name)
print(' ...done')
def parse_args():
parser = argparse.ArgumentParser(
description=textwrap.dedent('''\
Extract specified elements from the XML file and save them as xlsx file (or alternative formats -
xls, csv, json). Command loops through all children elements of a given XML element (default: root) and
extracts specified fields from each of them, putting each set as a new row to the output file.
Output file format is determined by extension of filename provided with -o parameter (default: xlsx)
Example usage:
xml2xls input.xml column_1 column_2
xml2xls input.xml -p "parent_element" -o output.csv "./@attribute_01|.//@attribute_02" ".//column_1" -l "Attributes" "Column values" -v
'''),
formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument('input_file', type=str)
parser.add_argument('-o', '--output_file', type=str,
help='Optional name of the output file. Specified path must exist; file will be created. If not provided, file name is determined from name of the input file.'
)
parser.add_argument('-n', '--namespace', type=str, help='XML namespace to be added to all columns.', default='')
parser.add_argument('-p', '--parent_element', type=str,
help='XML element whose children will be considered as rows in output. If not provided, XML root is used.',
default='')
'''
# TODO: Implement.
# Requires adding parameter below and modifications in main, xml_element_to_df and convert_and_save.
parser.add_argument('-r', '--row_element', type=str,
help='Name or XPath of a child element to be used as a row. \
IE all parent\'s children with satisfying this selection will be transformed to rows in resulting table.',
default='')
'''
parser.add_argument(
'column',
nargs='+',
action='append',
help='Repeatable. Name or XPath of a child element to be used as an column. Special value "[all]" indicates that all the child element should be extracted.'
)
parser.add_argument(
'-l', '--labels',
nargs='+',
action='append',
help='Optional list of column labels. \
If not given, their XML names (or XPaths) given in "column" argument are used.'
)
parser.add_argument('-v', '--preview', action='store_true',
help='Display the preview of the output file in the console.')
# TODO: null value, explicit file_format
# TODO: head(n) support
return parser.parse_args()
def main(args):
# Prepare the parameters
columns = args.column[0]
labels = args.labels
if labels != None and len(labels) > 0:
columns = dict(zip(columns, labels[0]))
# Do the action
convert_and_save(
input_file_name=args.input_file,
columns=columns,
output_file_name=args.output_file,
namespace=args.namespace,
root_element_name=args.parent_element,
# file_format = file_format,
preview=args.preview
)
if __name__ == "__main__":
print("Initializing...")
args = parse_args()
try:
main(args)
except:
print('ERROR: Failed to convert the data. Arguments:')
print(' {}'.format(args))
print('Stack trace:')
raise
print("...OK, conversion is done.")