-
Notifications
You must be signed in to change notification settings - Fork 0
/
readData.py
151 lines (130 loc) · 4.52 KB
/
readData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import PIL.Image
import cv2
import xml.etree.ElementTree as ET
import os
import numpy as np
from seqlearn.datasets import load_conll
from seqlearn.hmm import MultinomialHMM
from seqlearn.evaluation import bio_f_score
# Recursively extracts filenames from a directory and its subdirectory
def list_files(dir):
r = []
subdirs = [x[0] for x in os.walk(dir)]
for subdir in subdirs:
files = os.walk(subdir).next()[2]
if (len(files) > 0):
for file in files:
r.append(subdir + "/" + file)
return r
# Gets the features from test.txt
def features(sequence, i):
split_sequence = sequence[i].split(" ")
# print i
print len(split_sequence)
yield str(sequence)
# An image directory
IMAGE_DIRECTORY = "Datasets/sentences/a01/a01-000x/"
my_images = []
# Reads image from dataset and stores it as a numpy array
for i in range(0, 2):
for j in range(0, 3):
myimage = cv2.imread(
IMAGE_DIRECTORY + "a01-000x-s0" + str(i) +
"-0" + str(j) + ".png", 0)
(thresh, im_bw) = cv2.threshold(myimage, 128,
255, cv2.THRESH_BINARY |
cv2.THRESH_OTSU)
my_images.append(im_bw / 255)
myimage = cv2.imread(
IMAGE_DIRECTORY + "a01-000x-s01-03.png", 0)
(thresh, im_bw) = cv2.threshold(myimage, 128,
255, cv2.THRESH_BINARY |
cv2.THRESH_OTSU)
my_images.append(im_bw / 255)
# cv2.imwrite("bw.jpg", im_bw)
print len(my_images)
# Captures the jpeg file and extracts METADATA
jpgfile = PIL.Image.open(IMAGE_DIRECTORY + "/a01-000x-s01-00.png")
# The METADATA is an XML, hence parsing
myxml = ET.fromstring(jpgfile.info["Description"])
print jpgfile.info["Description"]
# Gets the text corresponding to the file a01-000u-s00-00.png
sentence = myxml.find("handwritten-part")[0].attrib["text"]
# Gets <lines from the xml
lines = myxml.findall("./handwritten-part/line")
# Initialize image iterator
i = 0
# Maps the line to a word
line_mapping = [[], [], [], [], [], [], [], [], []]
for line in lines:
lower = line.find("./lower-contour")
upper = line.find("./upper-contour")
my_image = my_images[i]
print "image shape: ", my_image.shape
x1 = []
x2 = []
if lower is not None:
#get lower contour values
for point in lower.findall("./point"):
x1.append(int(point.attrib["x"]))
if upper is not None:
for point in upper.findall("./point"):
x2.append(int(point.attrib["x"]))
# Get words in a line
words = line.findall("./word")
for word in words:
print "The Word: ****", word.attrib["text"], "****"
word_width = 0
x = int(word.findall("./cmp")[0].attrib["x"])
print "starts at ", x
for ch in word.findall("./cmp"):
word_width += int(ch.attrib["width"])
print "word width: ", word_width
if len(x1) == 0:
print "minimum = ", x2[0]
offset = x - x2[0]
else:
print "minimum = ", x1[0]
offset = x - x1[0]
if offset <= 0:
offset = 0
print "offset ", offset
word_image = my_image[
:, offset:offset + word_width]
my_word = {
"matrix": word_image,
"word": word.attrib["text"]
}
line_mapping[i].append(my_word)
# cv2.imwrite("line" + str(i) + "-" +
# str(len(line_mapping[i]) - 1) + "-" +
# my_word["word"] + ".jpg",
# my_word["matrix"] * 255)
print my_word["matrix"].shape
print '\n'
# Go to the next image
i += 1
# Saves the image into a TXT file
for line in line_mapping:
for word in line:
if word["matrix"].shape[1] == 0:
print "Zero matrix... Skipping..."
else:
f_handle = file('test.txt', 'a')
np.savetxt(f_handle, word['matrix'], delimiter=" ",
fmt="%i", newline=" ",
header='', footer="" + word["word"] + "\n\n",
comments='')
f_handle.close()
# # Extracts features from the datasets
X_train, y_train, lengths_train = load_conll("test.txt", features)
# # Models it as an HMM
clf = MultinomialHMM()
clf.fit(X_train, y_train, lengths_train)
print X_train, y_train
# Validation after training
X_test, y_test, lengths_test = load_conll("test.txt", features)
y_pred = clf.predict(X_test, lengths_test)
print y_pred
# # Final score
# print(bio_f_score(y_test, y_pred))