-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
154 lines (138 loc) · 4.73 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import pdf2text
import re
import os
import pandas as pd
import spacy
import pattern as pt
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from face_recognize import face_recognize
import sys
# if code not working
# import os
# java_path = "C:/Program Files/Java/jdk1.8.0_221/bin/java.exe"
# os.environ['JAVAHOME'] = java_path
def main(file_path):
# file_path = '/home/thongtran/projects/cv-extraction/pdf/Elliot-Alderson-Resume-Software-Developer-1.pdf'
text = ''
for page in pdf2text.extract_text_from_pdf(file_path):
text += ' ' + page
email = extract_email(text)
phone = extract_mobile_number(text)
address = extract_address(text)
person_name = extract_person_name(text)
skills = extract_skills(text)
education = extract_education(text)
list(dict.fromkeys(email))
list(dict.fromkeys(skills))
list(dict.fromkeys(phone))
print('Fullname:')
print(person_name)
print('Email: ')
print(email)
print('Phone: ')
print(phone)
print('Address: ')
print(address)
print('Job Skills: ')
print(skills)
print('Education: ')
print(education)
def extract_person_name(text):
person_name = ''
try:
st = StanfordNERTagger('/home/thongtran/projects/cv-extraction/stanford-ner/english.all.3class.distsim.crf.ser.gz',
'/home/thongtran/projects/cv-extraction/stanford-ner/stanford-ner.jar')
for sent in nltk.sent_tokenize(text):
tokens = nltk.tokenize.word_tokenize(sent)
tags = st.tag(tokens)
for tag in tags:
if tag[1] in ["PERSON"]:
person_name += tag[0] + ' '
except Exception as e:
print(e)
return person_name
def extract_address(text):
address = None
try:
regular_expression = re.compile(r"[0-9]+ [a-z0-9,\.# ]+\bCA\b", re.IGNORECASE)
address = re.search(regular_expression, text)
if address:
address = address.group()
except Exception as e:
print(e)
return address
def extract_email(text):
email = None
try:
pattern = re.compile(r'\S*@\S*')
matches = pattern.findall(text)
email = matches
except Exception as e:
print(e)
return email
def extract_mobile_number(text):
phones = None
try:
pattern = re.compile(r'([+(]?\d+[)\-]?[ \t\r\f\v]*[(]?\d{2,}[()\-]?[ \t\r\f\v]*\d{2,}[()\-]?[ \t\r\f\v]*\d*[ \t\r\f\v]*\d*[ \t\r\f\v]*)')
matchs = pattern.findall(text)
matchs = [re.sub(r'[,.]', '', el) for el in matchs if len(re.sub(r'[()\-.,\s+]', '', el))>6]
matchs = [re.sub(r'\D$', '', el).strip() for el in matchs]
matchs = [el for el in matchs if len(re.sub(r'\D','',el)) <= 15]
try:
for el in list(matchs):
if len(el.split('-')) > 3: continue
for x in el.split("-"):
try:
if x.strip()[-4:].isdigit():
if int(x.strip()[-4:]) in range(1900, 2100):
match.remove(el)
except:
pass
except:
pass
phones = list(dict.fromkeys(matchs))
except Exception as e:
print(e)
return phones
def extract_skills(text):
tokens = nltk.tokenize.word_tokenize(text)
data = pd.read_csv(os.path.join(os.path.dirname(__file__), 'skills.csv'))
skills = list(data.columns.values)
skillset = []
for token in tokens:
if token.lower() in skills:
skillset.append(token)
return skillset
def extract_education(text):
education = []
tokens = nltk.tokenize.word_tokenize(text)
for index, token in enumerate(tokens):
if token.upper() in pt.EDUCATION and token not in pt.STOPWORDS:
temp = ""
for i in range(0,4):
temp += tokens[index + i] + ' '
education.append(temp)
return education
def string_found(string1, string2):
if re.search(r"\b" + re.escape(string1) + r"\b", string2):
return True
return False
# def extract_text(file_path, extension):
# '''
# Wrapper function to detect the file extension and call text extraction function accordingly
# :param file_path: path of file of which text is to be extracted
# :param extension: extension of file `file_name`
# '''
# text = ''
# if extension == '.pdf':
# for page in extract_text_from_pdf(file_path):
# text += ' ' + page
# elif extension == '.docx':
# text = extract_text_from_docx(file_path)
# return text
if __name__ == '__main__':
file_path = sys.argv[1]
main(file_path)