-
Notifications
You must be signed in to change notification settings - Fork 0
/
ALIPDF.py
85 lines (77 loc) · 3.02 KB
/
ALIPDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pdfreader
import re
import os
import openpyxl
class Result:
    """Values extracted from one payment PDF.

    The four values are stored exactly as they are passed in; no
    conversion or validation happens here.
    """

    def __init__(self, exBtw, btw, total, date):
        # exBtw, btw and total end up under the "without taxes", "taxes"
        # and "total" spreadsheet columns respectively (see toList).
        self.exBtw = exBtw
        self.btw = btw
        self.total = total
        self.date = date

    def __repr__(self):
        # Readable representation so printed/logged results are debuggable.
        return (
            f"{type(self).__name__}(exBtw={self.exBtw!r}, btw={self.btw!r}, "
            f"total={self.total!r}, date={self.date!r})"
        )

    def toList(self):
        """Return the values as one spreadsheet row.

        The row layout is: date, two empty cells, exBtw, btw, total.
        """
        return [self.date, "", "", self.exBtw, self.btw, self.total]
def analysePDF(pathName):
    """Return a Result object built from the payment PDF at pathName.

    The strings of the PDF are obtained through getPdfText. The amounts
    are picked by their position counted from the END of that list,
    because their position counted from the start of the document is not
    constant. The invoice date line is read from index 7.

    Args:
        pathName: path of the PDF file to analyse.

    Returns:
        A Result holding the ex-BTW amount, the BTW amount, the total and
        the reordered invoice date, all as strings taken from the PDF.

    Raises:
        ValueError: if the PDF yields fewer than 20 strings, if the two
            occurrences of an amount differ, or if the date line does not
            start with "Invoice Date : dddd-dd-dd".
    """
    print(f"processing: {pathName}")
    content = getPdfText(pathName)

    # The furthest element read below is content[-20] (and content[7]);
    # report short documents clearly instead of with a bare IndexError.
    if len(content) < 20:
        raise ValueError(
            f"unexpected pdf layout in {pathName}: "
            f"need at least 20 strings, got {len(content)}"
        )

    totalEur = content[-17]
    exBtw = content[-5]
    btw = content[-4]
    invoiceDate = content[7]

    # Each number appears twice in the document; verify that both
    # occurrences agree as a sanity check on the input layout. Explicit
    # raises are used because assert statements are removed under -O.
    if totalEur != content[-20]:
        raise ValueError(f"totals dont match in {pathName}")
    if exBtw != content[-8]:
        raise ValueError(f"exBtw values dont match in {pathName}")
    if btw != content[-7]:
        raise ValueError(f"btw values dont match in {pathName}")

    # Group 1 is the leading four digits, group 2 the remaining "dd-dd".
    # The pattern is only anchored at the start, so trailing text is allowed.
    dateMatch = re.match(r"Invoice Date : (\d{4})-(\d{2}-\d{2})", invoiceDate)
    if dateMatch is None:
        raise ValueError(f"invoicedate not in right format in {pathName}")

    # Move the four-digit year to the end: "2023-01-15" -> "01-15-2023".
    # NOTE(review): the original comment states AliExpress writes
    # YYYY-DD-MM, which makes this DD-MM-YYYY; if the invoices are really
    # YYYY-MM-DD the output is MM-DD-YYYY - confirm against a real invoice.
    year, rest = dateMatch.groups()
    date = f"{rest}-{year}"

    return Result(exBtw, btw, totalEur, date)
def getPdfText(pathName):
    """Return the strings of every page of a PDF file as one flat list.

    Args:
        pathName: path of the PDF file to read.

    Returns:
        A list containing the canvas strings of the first page, followed
        by those of the second page, and so on.
    """
    res = []
    # The context manager guarantees the file is closed, also when the
    # PDF parser raises; previously the handle was never closed.
    with open(pathName, "rb") as fd:
        doc = pdfreader.PDFDocument(fd)
        viewer = pdfreader.SimplePDFViewer(fd)
        # Count the pages without materialising them in a list.
        page_count = sum(1 for _ in doc.pages())
        # Pages are addressed starting at 1 when navigating the viewer.
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            res.extend(viewer.canvas.strings)
    return res
def getResults(directory):
    """Return a Result for every payment PDF in a directory.

    Every file in the directory must be named "<digits>_payment.pdf";
    each one is passed to analysePDF in the order delivered by
    getFilesInDirectory.

    Args:
        directory: path of the folder containing the payment PDFs.

    Returns:
        A list with one Result per file.

    Raises:
        ValueError: if a file name does not have the expected format.
    """
    bestandjes = getFilesInDirectory(directory)
    # Raw string avoids invalid-escape warnings; the dot is escaped and
    # fullmatch is used so that only exactly "<digits>_payment.pdf" passes.
    validBestandjeRE = re.compile(r"\d+_payment\.pdf")
    results = []
    for bestandje in bestandjes:
        # Explicit raise instead of assert: asserts are removed under -O.
        if not validBestandjeRE.fullmatch(bestandje):
            raise ValueError(
                f"bestand naam niet in correct formaat: {bestandje!r}"
            )
        results.append(analysePDF(os.path.join(directory, bestandje)))
    return results
def getFilesInDirectory(directory):
    """Return the names of all files directly inside a directory.

    Sub-directories are left out. The names are returned sorted, because
    os.listdir delivers entries in arbitrary order and callers should get
    the same order on every run.

    Args:
        directory: path of the folder to list.

    Returns:
        A sorted list of file names (names only, not full paths).
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )
def resultatenToWorkbook(resultaten):
    """Put a list of results into a new openpyxl workbook.

    The active sheet receives a header row first, then one row per
    result, each row being the output of that result's toList().

    Args:
        resultaten: iterable of objects offering a toList() method.

    Returns:
        The newly created workbook (not yet saved).
    """
    header = ["date","","","without taxes","taxes","total"]
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.append(header)
    # Generator keeps the toList()/append interleaving of a plain loop.
    for row in (item.toList() for item in resultaten):
        sheet.append(row)
    return book
def main():
    """Ask for the PDF folder and output file, then write the spreadsheet.

    Prompts for both names on standard input, converts all payment PDFs
    in the folder to results, saves them as a workbook and prints the
    absolute path of the saved file.
    """
    pdfFolder = input("name of folder containing payment pdfs: ")
    outputName = input("name of output excel file (should end with .xlsx): ")
    resultatenToWorkbook(getResults(pdfFolder)).save(outputName)
    savedAt = os.path.abspath(outputName)
    print(f"saved at: {savedAt}")
# Run the interactive entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()