-
Notifications
You must be signed in to change notification settings - Fork 3
/
Report.py
279 lines (224 loc) · 13.5 KB
/
Report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
"""
Report generator
This script reads the template from 'template.txt' file in the 'template' folder and generates the report for all the files present in 'excel folder
according to the template provided by the user. The report is saved in a file 'report.md' in markdown folder. The report
is generated in Markdown format.
This script works only with '.xlsx' files.
For this script to work, 'pandas' and 'glob' must be installed.
This script contains the following functions:
* column_location - returns a list conatining indices of columns in correct sequence according to 'template.txt' file.
* date_sort_and_format - returns list of dataframes of all sheets of an excel file after sorting the data in
descending order according to date column (if any) in a sheet. It also converts the date
format to ddmmyyyy with a separator of yser's choice.
* gen_list_sheets - returns a lost of dataframes after combining relevant sheets of the excel file using the '_uuid'
column which contains the unique id for each submission.
* write_table - writes a dataframe in tabular format in report.md.
* write_list - writes a dataframe in list format in report.md.
* xls2md - this function determines whether the data for a particular excel file is to be writeen in tabular or list
format and invokes wrute_table or write_list functions accordingly.
* rename_submission__uuid - this function renames '_submission__uuid' column to '_uuid' and returns the list of
dataframes with renamed column.
"""
import pandas as pd
import os
import glob
import datetime
def column_location(spreadsheet, column_names):
"""
This function finds the indices of columns in the order specified in template file.
Parameters:
spreadsheet (DataFrame): A pandas DataFrame.
column_names (List of srtings): list of strings that specifies the name of column headers as in template file.
Returns:
column_location (list of integers): list of integers specifying the column locations in the DataFrame according tp the template specified.
"""
column_location = []
for i in range(len(column_names)):
for j in range(spreadsheet.shape[1]):
if column_names[i] in spreadsheet.columns[j]:
column_location.append(j)
return column_location
def date_sort_and_format(list_sheets, separator):
"""
Returns the DataFrame after sorting the data in descending order of date or year column if there is any column that
contains date or simply year in the DataFrame.
Parameters:
list_sheets (list of dataframes): a list of Pandas DataFrames.
separator (character): a special character to separate the day month and year
Returns:
all_sheets (list of DataFrames): a list of DataFrames with all data sorted in descending order of date column(if any) and format of date converted to ddmmyyyy
"""
all_sheets = []
for i in range(len(list_sheets)):
all_sheets.append(list_sheets[i])
for j in range(len(all_sheets)):
for i in range(all_sheets[j].shape[1]):
if all_sheets[j].columns[i][0] != '_' and 'Date' in all_sheets[j].columns[i]: # finds the column conataining date
all_sheets[j][all_sheets[j].columns[i]] = pd.to_datetime(all_sheets[j][all_sheets[j].columns[i]]) # converts all the date to 'datetime' format
all_sheets[j] = all_sheets[j].sort_values(by=[all_sheets[j].columns[i]], ascending=False) # sorts in descending order accrding to first encountered date column.
break
elif all_sheets[j].columns[i][0] != '_' and 'Year' in all_sheets[j].columns[i]: # finds the column containing year
all_sheets[j] = all_sheets[j].sort_values(by=[all_sheets[j].columns[i]], ascending=False) # sorts data in descending order according to year if date not present.
break
for j in range(len(all_sheets)): # converts date format to 'ddmmyyyy'
for i in range(all_sheets[j].shape[1]):
if all_sheets[j].columns[i][0] != '_' and 'Date' in all_sheets[j].columns[i]:
all_sheets[j][all_sheets[j].columns[i]] = pd.to_datetime(all_sheets[j][all_sheets[j].columns[i]])
all_sheets[j][all_sheets[j].columns[i]] = all_sheets[j][all_sheets[j].columns[i]].dt.strftime(
"%d" + separator + "%m" + separator + "%Y")
return all_sheets
def gen_list_sheets(sheets):
"""
Returns a list of DataFrames which can be worked upon to generate reports.
This function takes a list of dataframes which are all the sheets in an excel file and merges the relevant sheets
according to the unique id column '_uuid'
Parameters:
sheets (list of DataFrames): A list of pandas DataFrames. All the sheets of the excel file opened as DataFrames and stored in a list
Returns:
all_sheets (List of DataFrames):A list of DataFrames after performing the merging operation.
"""
all_sheets = []
if len(sheets) == 1: # if there is only one sheet in the file then it returns the sheet itself
all_sheets.append(sheets[0])
return all_sheets # returns the first sheet if there is only one
all_sheets.append(pd.merge(sheets[0], sheets[1], on='_uuid'))
if (len(sheets)) == 2: # if the form has 2 sheets then both are merged according to '_uuid' column and #
return all_sheets # returned as a dataframe
for j in range(len(sheets) - 2):
all_sheets.append(pd.merge(all_sheets[0], sheets[j + 2], on='_uuid'))
# if more than 2 sheets i.e. multi-category form then combines 3rd sheet onwards
return all_sheets[1:] # to 1st and 2nd sheet and returns a list of combined sheets
def write_table(combined_sheet, column_names, f):
"""
Writes the data of DataFrame in tabular format in Markdown.
Parameters:
combined_sheet (DataFrame): A Pandas DataFrame whose data is to be written in Tabular Form in Markdown.
column_names (List of Strings): A list of strings containing all the relevant column names of the DataFrame.
f (file object): file object to write the data in 'report.md'
"""
for i in range(len(column_names)):
f.write('| ' + column_names[i])
f.write(' |\n')
for i in range(len(column_names)):
f.write('|-----')
f.write(' |\n')
col_loc = column_location(combined_sheet, column_names)
for i in range(combined_sheet.shape[0]):
for j in range(len(col_loc)):
for k in range(len(combined_sheet.iloc[i, col_loc[j]])):
if combined_sheet.iloc[i, col_loc[j]][k] == '\n':
combined_sheet.iloc[i, col_loc[j]] = combined_sheet.iloc[i, col_loc[j]][:k - 1] + ' <br> ' + combined_sheet.iloc[i, col_loc[j]][k + 1:] # for lists in data
if combined_sheet.iloc[i, col_loc[j]].lower() == 'nan': # replaces 'nan' with '-' in tables.
f.write('| ')
else:
f.write('| ' + combined_sheet.iloc[i, col_loc[j]])
f.write(' |\n')
f.write('<br><br><br>\n')
def write_list(combined_sheet, column_names, f):
"""
Writes the data of DataFrame in list format in Markdown. The function works for basic lists. However, it currently doesn't
work for nested list.
Parameters:
combined_sheet (DataFrame): A Pandas DataFrame whose data is to be written in list Form in Markdown.
column_names (List of strings): A list of strings containing all the relevant column names of the DataFrame.
f (file object): file object to write the data in 'report.md'
"""
col_loc = column_location(combined_sheet, column_names)
for i in range(combined_sheet.shape[0]):
f.write('- ')
for j in range(len(col_loc) - 1):
if combined_sheet.iloc[i, col_loc[j]].lower() == 'nan': # omits 'nan' in list format
continue
else:
f.write(combined_sheet.iloc[i, col_loc[j]] + ', ')
if not combined_sheet.iloc[i, col_loc[len(col_loc) - 1]] == 'nan':
f.write(combined_sheet.iloc[i, col_loc[len(col_loc) - 1]] + '.')
f.write('\n')
f.write('<br><br><br>\n')
def rename_submission__uuid(sheets):
"""
Renames the '_submission__uuid' column to '_uuid'.
The unique id column name is '_uuid' in the first sheet and '_submission__uuid' in the subsequent sheets.
This function renames '_submission__uuid' to '_uuid' so that the sheets can be merged.
Parameters:
sheets (list of DataFrames): a list of DataFrames containing data of all the sheets in an excel file.
Returns:
sheets (list of DataFrames): list of DataFrames with '_submission__uuid' renamed as '_uuid'
"""
for j in range(len(sheets)):
sheets[j].rename(columns={'_submission__uuid': '_uuid'}, inplace=True)
return sheets
def xls2md(excel_spreadsheet, file_name, template, f, separator):
"""
Writes all data to 'reprt.md' file in markdown format
This function takes in the excel file, its name, its specified template, file object and separator to write in dates
and wrtes that data in tabular or list format as specified in the template.
Parameters:
excel_spreadsheet (DataFrame): DataFrame of all the excel spreadsheeets in a file.
file_name (string): String specifying name of the excel file.
template (list of strings): A list of string specifying the template for the Excel file
f (file object): File object to write data in Markdown format in the file 'report.md'
separator (character): A character for separating the day, month and year for the dates in the report.
"""
sheetnames = excel_spreadsheet.sheet_names
sheets = []
for m in range(len(sheetnames)): # stores all sheets in an excel file in a list 'sheet'
sheets.append(pd.read_excel(excel_spreadsheet, sheetnames[m]))
column_names = template[3:len(template) - 1] # gets a list of names of column headers in a sheet
sheets = rename_submission__uuid(sheets)
list_sheets = gen_list_sheets(sheets)
list_sheets = date_sort_and_format(list_sheets, separator)
f.write('## **')
for i in range(file_name.index('_-_')): # determines the form name from file name and converts it to all caps
if file_name[i] != '_': # and writes it in 'report.md'
f.write(file_name[i].upper())
else:
f.write(' ')
if len(sheets)<=2: # if the form has single set of repeating questions or no repeating questions
f.write('**\n')
for i in range(list_sheets[0].shape[0]):
for j in range(list_sheets[0].shape[1]):
list_sheets[0].iloc[i, j] = str(list_sheets[0].iloc[i, j])
if template[0] == 't':
write_table(list_sheets[0], column_names, f)
elif template[0] == 'l':
write_list(list_sheets[0], column_names, f)
else: # if the form has multiple sets of repeating questions the=at vary according to category selected.
f.write(' - ' + template[2].replace('_', ' ').upper() + '**\n')
for l in range(2, len(sheets)): # checks for the correct sheetname as in the template
if template[2] == sheetnames[l]:
break
for i in range(list_sheets[l-2].shape[0]):
for j in range(list_sheets[l-2].shape[1]):
list_sheets[l-2].iloc[i, j] = str(list_sheets[l-2].iloc[i, j])
if template[0] == 't': # l-2 because the categories of form start from sheet number 3.
write_table(list_sheets[l - 2], column_names, f)
elif template[0] == 'l':
write_list(list_sheets[l - 2], column_names, f)
def main():
cur_path = os.getcwd()
template_path = cur_path + r'\template'
path = cur_path + r'\excel'
template = open(template_path + '\\' + 'template.txt')
template_fields = template.readlines() # gets the template for all the forms from 'template.txt' file line by line
for i in range(len(template_fields)):
template_fields[i] = template_fields[i].split(';')
files = [f for f in glob.glob(path + '**/*.xlsx',recursive=True)] # gets the paths for all the excel files stored in 'excel' folder
filename = []
lenpath = len(path) + 1
dest_path = os.getcwd() + '\\'
dest_path = dest_path + 'markdown\\'
dest_path = dest_path + 'report.md' # destination file for markdown
sep = str(input("which separator do you want for dates? Please Enter\n"))
f = open(dest_path, 'w', encoding="utf-8")
for file in files:
filename.append(file[lenpath:len(file) - 5]) # gets the filenames of all the files in the 'excel' folder
for k in range(len(template_fields)):
for i in range(len(filename)):
if filename[i] in template_fields[k][1]:
break
spreadsheet = pd.ExcelFile(files[i]) # opens all the excel files one by one
xls2md(spreadsheet, filename[i], template_fields[k], f, sep) # the function that converts the excel spreadsheet to md
print("All Files have been converted to Markdown")
if __name__ == '__main__':
main()