-
Notifications
You must be signed in to change notification settings - Fork 0
/
v3_data_collection.py
59 lines (48 loc) · 2.11 KB
/
v3_data_collection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# This program scrapes e-mail addresses and US/Canadian phone numbers from the text of a given PDF.
#! python3
import re
import os
import pdftotext
# Matches a US/Canadian phone number, with optional area code and extension.
#
# Capturing groups (findall() returns one tuple per match):
#   group 1 - the complete phone number (element 0 of each findall() tuple)
#   group 2 - the area code, e.g. "415" or "(415)"; empty when absent
#   group 3 - the extension digits; empty when absent
#
# The separator that follows the area code belongs to the optional area-code
# part, so a number without an area code is matched without any leading
# space, newline or dash. The digit look-arounds keep the pattern from
# matching a slice of a longer digit run.
regex_phone = re.compile(
    r"""
    (                               # group 1: the whole phone number
        (?<!\d)                     # must not start inside a longer digit run
        (?:
            (\d{3}|\(\d{3}\))       # group 2: area code, optionally in parentheses
            [\s-]                   # separator after the area code
        )?
        \d{3}                       # 3-digit group
        [\s-]                       # separator
        \d{4}                       # 4-digit group
        (?:
            \s*(?:ext\.?|x)\s*      # extension marker: "ext", "ext." or "x"
            (\d{2,5})               # group 3: extension digits
        )?
        (?!\d)                      # must not stop inside a longer digit run
    )
    """,
    re.VERBOSE,
)
# Matches an e-mail address: a local part, "@", then one or more
# dot-separated domain labels. The pattern has no capturing groups, so
# findall() returns plain strings.
#
# Hyphens are accepted on both sides of the "@". Dots in the domain are only
# matched between labels, so a sentence-ending period right after an address
# is not captured.
regex_email = re.compile(
    r"""
    [a-zA-Z0-9_.+-]+                # local part
    @                               # at sign
    [a-zA-Z0-9_+-]+                 # first domain label
    (?:\.[a-zA-Z0-9_+-]+)*          # further labels, each introduced by a dot
    """,
    re.VERBOSE,
)
# Ask the user which PDF to analyse.
print("Insert absolute path without '' or \"\" \n")
# Tolerate surrounding whitespace or quotes (e.g. from a pasted path), since
# the prompt asks for the path without them.
path_to_pdf = input().strip().strip("'\"")

# The file must be opened in binary mode ("rb"); opening it as text fails with
# UnicodeDecodeError on the raw PDF bytes.
with open(path_to_pdf, "rb") as wrk:
    pdf = pdftotext.PDF(wrk)

# Concatenate the text of every page into the single string the regexes scan.
pdf_final = "".join(pdf)

# Search the whole document for phone numbers and e-mail addresses.
mo_phone = regex_phone.findall(pdf_final)
mo_mail = regex_email.findall(pdf_final)

# findall() returns one tuple of groups per phone match; element 0 is the
# outermost group, i.e. the complete phone number.
new_mo_phone = [p_numbers[0] for p_numbers in mo_phone]

# One result per line, phone numbers first. Joining the combined list avoids a
# stray blank line when either list is empty.
end_document = "\n".join(new_mo_phone + mo_mail)
print(end_document)