forked from awslabs/rekognition-image-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
indexer.py
120 lines (92 loc) · 2.83 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
'''
Local Image Search Engine Indexer
@Author: Sunil Mallya
'''
import boto3
import glob
import json
import pickle
from multiprocessing.dummy import Pool as ThreadPool
from threading import Lock
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# PHOTOS Directory: local folder scanned for images to index
# (relative path; trailing slash is assumed by the globs below).
photo_dir = 'photos/'
# Rekognition client, pinned to us-east-1 with an explicit endpoint URL.
rekognition = boto3.client('rekognition', region_name='us-east-1',
endpoint_url='https://rekognition.us-east-1.amazonaws.com')
# Document Index: one "image document" vector per file,
# mapping image path -> {label: confidence} (filled in by get_labels).
d_index = {} # doc => [{l,c}....]
# Guards writes to the shared structures from the thread pool below.
lock = Lock()
# List of images to index — both lower- and upper-case .jpg extensions.
files = glob.glob(photo_dir + "*.jpg")
files.extend(glob.glob(photo_dir + "*.JPG"))
# Per-label frequency counts, consumed later for word-cloud generation.
label_counts = {}
## Extract features helper
def get_labels(f):
    """Label one image with Rekognition and record the result.

    Reads the file at path `f`, calls detect_labels on its bytes, and
    keeps every label with confidence >= 70. Side effects: stores the
    {label: confidence} dict in the shared `d_index` under key `f` and
    bumps per-label counts in the shared `label_counts`. Both shared
    structures are updated under `lock`, since this runs on a thread
    pool. Any error is printed and swallowed so one bad image does not
    abort the whole indexing run.
    """
    try:
        with open(f, 'rb') as image:
            resp = rekognition.detect_labels(Image={'Bytes': image.read()})
        # TODO: if person/human found, then get face features as well
        # TODO: Index exif data
        dt = {}
        seen = []
        for v in resp['Labels']:
            l = v['Name'].lower()
            c = v['Confidence']
            # Choose an appropriate confidence level based on your
            # application; low-confidence labels add noise to the index.
            if c < 70:
                continue
            # insert feature
            dt[l] = c
            seen.append(l)
        # Publish under the lock. label_counts is shared across pool
        # threads too, so its increments must also be serialized
        # (the original mutated it without the lock).
        with lock:
            d_index[f] = dt
            for l in seen:
                label_counts[l] = label_counts.get(l, 0) + 1
    except Exception as e:
        # Best-effort: log and continue with the remaining images.
        print(e)
# Fan the Rekognition calls out over a thread pool: labeling is
# network-bound, so threads overlap the per-request latency.
N_THREADS = 25
worker_pool = ThreadPool(N_THREADS)
results = worker_pool.map(get_labels, files)
# map() blocks until every image is processed; shut the pool down.
worker_pool.close()
worker_pool.join()
### Lets save the data ###
# pickle output is bytes, so the files must be opened in binary mode
# ('w' text mode breaks under Python 3).
with open('myphotoindex.dvect', 'wb') as f:
    pickle.dump(d_index, f)
# Generate Search Index / TF-IDF Model: vectorize each image's
# {label: confidence} dict into a column-aligned matrix, then weight
# with TF-IDF so ubiquitous labels contribute less to similarity.
vec = DictVectorizer()
counts = vec.fit_transform(d_index.values()).toarray()
transformer = TfidfTransformer(smooth_idf=True)
tfidf = transformer.fit_transform(counts)
# TF-IDF Model, persisted for the query-time ranker.
with open('tfidf_model.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
# Feature (label) names, in the same column order as the model.
# get_feature_names() was removed in scikit-learn 1.2; fall back for
# older releases. list() normalizes the ndarray the new API returns.
try:
    features = list(vec.get_feature_names_out())
except AttributeError:
    features = vec.get_feature_names()
with open('features.pkl', 'wb') as f:
    pickle.dump(features, f)
### Generate word tag cloud ###
# Imports kept local, as in the original, so the indexing above does
# not depend on the visualization packages being installed.
from wordcloud import WordCloud

# Modern wordcloud releases require a {word: frequency} mapping here;
# the old sorted list-of-tuples form raises. Passing the raw counts
# dict is equivalent — WordCloud normalizes and sorts internally.
wordcloud = WordCloud()
wordcloud.generate_from_frequencies(label_counts)

# Render the cloud and save it as an image for the web UI.
import matplotlib.pyplot as plt
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('web/photo_tags')