-
Notifications
You must be signed in to change notification settings - Fork 12
/
common.py
146 lines (128 loc) · 4.37 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import re
import json
import string
class IdCardStraight:
    """Extract structured fields from OCR text lines of a Chinese ID card front.

    The output layout imitates the Aliyun ID-card OCR response (work in
    progress, per the original author's note)::

        {"Data": {"FrontResult": {"Name": ..., "IDNumber": ..., "Address": ...,
                                  "Gender": ..., "Nationality": ...}}}

    Attributes:
        result: The OCR lines with spaces and ASCII punctuation removed.
            Lines consumed as the name or as part of the address are
            overwritten with the placeholder ``"temp"`` while parsing, so an
            instance is meant to be parsed once.
        out: The full response dictionary.
        res: Shortcut to ``out["Data"]["FrontResult"]``.
    """

    # Deletes spaces and ASCII punctuation in a single pass; built once
    # instead of once per OCR line.
    _STRIP_TABLE = str.maketrans("", "", string.punctuation + " ")

    # 17 digits followed by a digit or the check character X/x.
    _ID_NUMBER_RE = re.compile(r"\d{17}[\dXx]")
    # Up to four CJK characters following the "名" of the "姓名" label.
    _NAME_RE = re.compile(r"名([\u4e00-\u9fa5]{1,4})")
    # CJK characters following the "民族" label.
    _NATIONALITY_RE = re.compile(r"民族([\u4e00-\u9fa5]+)")
    # A run of two to four CJK characters, used when guessing the name.
    _NAME_GUESS_RE = re.compile(r"[\u4e00-\u9fa5]{2,4}")

    # A line containing any of these characters is treated as address text.
    _ADDRESS_HINTS = (
        "址", "省", "市", "县", "街", "乡", "村", "镇", "区", "城", "组", "号",
    )
    # Field labels that rule a short line out as a bare name.
    _NON_NAME_LABELS = ("性别", "姓名", "民族", "住址", "出生", "号码", "身份")

    def __init__(self, result):
        """Normalise the OCR lines and prepare an empty result structure.

        Args:
            result: Iterable of strings, one per recognised text line.
        """
        self.result = [line.translate(self._STRIP_TABLE) for line in result]
        self.out = {"Data": {"FrontResult": {}}}
        self.res = self.out["Data"]["FrontResult"]
        self.res["Name"] = ""
        self.res["IDNumber"] = ""
        self.res["Address"] = ""
        self.res["Gender"] = ""
        self.res["Nationality"] = ""

    def birth_no(self):
        """Find the 18-character ID number and derive the gender from it.

        The first line containing 17 digits followed by a digit or ``X``/``x``
        wins. ``Gender`` is set from the 17th digit: odd is "男", even is "女".
        Lines without such a sequence are skipped, so a stray "x" or a shorter
        digit run on an earlier line does not hide the real number.
        """
        for txt in self.result:
            match = self._ID_NUMBER_RE.search(txt)
            if match is None:
                continue
            id_number = match.group(0)
            self.res["IDNumber"] = id_number
            self.res["Gender"] = "男" if int(id_number[16]) % 2 else "女"
            return

    def full_name(self):
        """Extract the name from a line carrying the "姓名" label.

        The matched line is replaced by ``"temp"`` so the name cannot leak
        into the address later on.
        """
        for index, txt in enumerate(self.result):
            if "名" not in txt or len(txt) <= 2:
                continue
            match = self._NAME_RE.search(txt)
            if match is None:
                continue
            # Take the captured group rather than splitting on "名", which
            # would cut off names that themselves contain that character.
            self.res["Name"] = match.group(1)
            self.result[index] = "temp"
            return

    def sex(self):
        """Set the gender from any line containing "男" or "女".

        Every line is inspected and the last matching line wins. ``run`` does
        not call this method; it takes the gender from the ID number instead.
        """
        for txt in self.result:
            if "男" in txt:
                self.res["Gender"] = "男"
            elif "女" in txt:
                self.res["Gender"] = "女"

    def national(self):
        """Extract the nationality following the "民族" label.

        Example line: "性别女民族汉" gives "汉".
        """
        for txt in self.result:
            match = self._NATIONALITY_RE.search(txt)
            if match is not None:
                self.res["Nationality"] = match.group(1)
                return

    def address(self):
        """Assemble the address from every line that looks like address text.

        A line qualifies when it contains one of the address hint characters.
        Lines containing "址" or "省" are placed at the front (keeping only the
        text after "址"); the rest are appended in reading order. Lines
        containing "公民" (the ID-number label) are ignored. Every consumed
        line is replaced by ``"temp"`` in ``self.result``.
        """
        parts = []
        for index, raw in enumerate(self.result):
            txt = raw.replace("号码", "")
            if "公民" in txt:
                continue
            if not any(hint in txt for hint in self._ADDRESS_HINTS):
                continue
            if "址" in txt or "省" in txt:
                parts.insert(0, txt.split("址")[-1])
            else:
                parts.append(txt)
            self.result[index] = "temp"
        self.res["Address"] = "".join(parts)

    def predict_name(self):
        """Guess the name when no line had the "姓名" label attached to it.

        Used when the OCR engine returned the name on its own line. The first
        line of two to four characters that carries none of the known field
        labels and contains a run of CJK characters is taken as the name.
        Does nothing if a name was already found. This heuristic is known to
        be rough.
        """
        if self.res["Name"] != "":
            return
        for txt in self.result:
            if not 1 < len(txt) < 5:
                continue
            if any(label in txt for label in self._NON_NAME_LABELS):
                continue
            match = self._NAME_GUESS_RE.search(txt)
            if match is not None:
                self.res["Name"] = match.group(0)
                return

    def run(self):
        """Run all extraction steps and return the result as a JSON string.

        The order matters: the labelled name is removed before the address is
        collected, and the name guess runs last so it only sees lines that
        were not consumed by earlier steps.

        Returns:
            str: ``self.out`` serialised with ``json.dumps``.
        """
        self.full_name()
        self.national()
        self.birth_no()
        self.address()
        self.predict_name()
        return json.dumps(self.out)