-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild_search_index.py
95 lines (73 loc) · 2.6 KB
/
build_search_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json
import os
import os.path
import subprocess
import urllib2
from pprint import pprint

import cv2
from fuzzywuzzy import process
# Local cache of the mtgjson card database, plus the version string that
# was current when that cache was downloaded.
DATA_FILE = 'data/all-sets.json'
VERSION_FILE = 'data/version.txt'

# Output base names handed to tesseract; the recognised text is read back
# from TEMP_FILE + '.txt'.  TEMP_FULL_TEXT_FILE is only referenced by the
# currently disabled full-card OCR step.
TEMP_FILE = 'tmp'
TEMP_FULL_TEXT_FILE = 'tmp-full-text'

# Directory listed for card images, and directory the cropped strips are
# written to before OCR.
INPUT_DIRECTORY = 'input'
CROPPED_DIRECTORY = 'cropped'
def check_output(tmp_file):
    """Read an OCR output file and report whether any text was recognised.

    Args:
        tmp_file: Path of the text file to read.

    Returns:
        A ``(success, found_text)`` tuple.  ``found_text`` is the file's
        contents with surrounding whitespace stripped.  ``success`` is
        False when that text is empty or when ``tmp_file`` does not exist.
    """
    # A missing file means the OCR step wrote nothing at all.  Report that
    # as a failed read so the caller's retry logic runs instead of the
    # whole script dying with an IOError.
    if not os.path.isfile(tmp_file):
        return (False, '')

    with open(tmp_file, 'r') as f:
        found_text = f.read().strip()

    # Single-argument parenthesised form: prints the same line whether it
    # is parsed as a Python 2 print statement or a print() call.
    print("Found text:" + found_text)

    # Empty (whitespace-only) output is the error condition.
    return (bool(found_text), found_text)
# ---------------------------------------------------------------------------
# Refresh the cached card data when mtgjson publishes a newer version.
# ---------------------------------------------------------------------------
new_version = urllib2.urlopen('http://mtgjson.com/json/version.json').read()
old_version = ""
if os.path.isfile(VERSION_FILE):
    with open(VERSION_FILE, 'r') as f:
        old_version = f.read()

# Also re-download when the data file itself is missing; otherwise a
# matching version stamp would leave nothing to load below.
if new_version != old_version or not os.path.isfile(DATA_FILE):
    print("Fetching version " + new_version +
          " of the json data. Please wait.")
    new_data = urllib2.urlopen('http://mtgjson.com/json/AllSets-x.json')
    try:
        downloaded = new_data.read()
    finally:
        new_data.close()
    with open(DATA_FILE, 'w') as f:
        f.write(downloaded)
    # Record the version only after the data is on disk.  Writing the
    # stamp first would mark the cache as current even if the download
    # failed, and later runs would never retry it.
    with open(VERSION_FILE, 'w') as f:
        f.write(new_version)

with open(DATA_FILE, 'r') as f:
    all_cards = json.load(f)

# pprint(all_cards["LEA"]["cards"][0])
# Collect every distinct card name across all sets.
all_unique_card_names = set()
for card_set in all_cards.values():
    for card in card_set["cards"]:
        all_unique_card_names.add(card["name"])

# The fuzzy matcher is given a list of choices; build it once instead of
# once per scanned image.
card_name_choices = list(all_unique_card_names)

# set the language location
# NOTE(review): machine-specific path; it overrides any TESSDATA_PREFIX
# already present in the environment.
os.environ['TESSDATA_PREFIX'] = '/home/brian/dev/tesseract-ocr-3.02'


def _run_tesseract(image_path, extra_args=()):
    """Run tesseract on ``image_path`` and return ``(success, text)``.

    ``extra_args`` are inserted before the image path.  The command is
    passed as an argument list (no shell), so file names containing
    spaces or shell metacharacters are handled literally.  Raises OSError
    if the ``tesseract`` executable cannot be started.
    """
    output_file = TEMP_FILE + '.txt'
    # Remove the previous result first: if this run writes nothing we must
    # not mistake the last card's text for this card's.
    if os.path.isfile(output_file):
        os.remove(output_file)
    subprocess.call(['tesseract'] + list(extra_args) + [image_path, TEMP_FILE])
    if not os.path.isfile(output_file):
        return (False, '')
    return check_output(output_file)


failed_to_identify = []

# attempt to identify each card
for card in os.listdir(INPUT_DIRECTORY):
    if card == '.gitignore':
        continue
    card_path = INPUT_DIRECTORY + '/' + card
    print(card_path)

    # cv2.imread signals an unreadable / non-image file by returning None
    # rather than raising, so check before slicing.
    image = cv2.imread(card_path)
    if image is None:
        failed_to_identify.append([card_path, ''])
        continue

    # Rows 100-149, columns 50-1299 of the scan -- presumably the card's
    # title strip; the fixed offsets assume a particular scan size
    # (TODO confirm).
    crop_image = image[100:150, 50:1300]
    if crop_image.size == 0:
        # Image is smaller than the crop window; nothing to OCR.
        failed_to_identify.append([card_path, ''])
        continue
    cropped_path = CROPPED_DIRECTORY + '/' + card
    cv2.imwrite(cropped_path, crop_image)

    result = _run_tesseract(cropped_path)
    if not result[0]:  # error condition, retry
        # Page segmentation mode 7 treats the image as a single text line.
        result = _run_tesseract(cropped_path, ['-psm', '7'])

    # TODO: a full-card OCR pass (output base TEMP_FULL_TEXT_FILE) was
    # sketched here but is currently disabled.

    possible = process.extract(result[1], card_name_choices, limit=3)
    # Best match below a score of 70 (or no candidates at all) counts as
    # a failure to identify the card.
    if not possible or possible[0][1] < 70:
        failed_to_identify.append([cropped_path, result[1]])
    else:
        pprint(possible[0])

print("Failures:")
pprint(failed_to_identify)