-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnook-dictionary.py
128 lines (101 loc) · 3.5 KB
/
nook-dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/python
# -*- coding: utf-8 -*-
# by Jiri Orsag, 2014
# https://github.com/geoRG77/nook-dictionary
# Many thanks to Homeless Ghost for his script 'createRenateNSTdictionaryfromnookdictionarydb.py'
# which was a great source of ideas for my work
import sqlite3, sys, zipfile, zlib, os
# config
# Input dictionary. It is read line by line and every line is split on tab
# characters; the first column is the headword.  The file must exist.
# (The misspelled constant name is kept because the conversion code refers
# to it by exactly this name.)
DICTIONATY_FILE = 'slovnik.txt' # input file (needed)
# SQLite database the converted entries are written to.  Its
# android_metadata and tblWords tables are dropped and recreated on every run.
OUTPUT_DB = 'test.db' # output file
# Scratch directory for the intermediate per-entry HTML files.  It is created
# when missing and removed with os.rmdir() at the end of the run.  Paths are
# built by plain string concatenation, so the trailing slash is required.
TEMP_DIRECTORY = './temp/' # will be deleted after successful run
# A progress message is printed every STEP input lines.
STEP = 10000 # for print message
########################################################
# Part-of-speech markers found in the dictionary file, mapped to the marker
# followed by its Czech caption.
_WORD_TYPE_LABELS = {
    'n:': 'n: podstatné jméno',
    'v:': 'v: sloveso',
    'adj:': 'adj: přídavné jméno',
    'adv:': 'adv: příslovce',
    'prep:': 'prep: předložka',
    'conj:': 'conj: spojka',
    'interj:': 'interj: citoslovce',
    'num:': 'num: číslovka',
}


def wordType(s):
    """Expand a part-of-speech marker into its captioned form.

    A known marker such as 'n:' or 'adj:' is returned together with its
    caption (e.g. 'n: podstatné jméno').  Any other value is handed back
    unchanged, so unknown markers pass straight through to the output.
    """
    return _WORD_TYPE_LABELS.get(s, s)
# Scratch files reused for every entry.  They are deliberately NOT named
# after the headword: headwords may be empty, very long, or contain
# characters that are not valid in a file name.
_SCRATCH_HTML = TEMP_DIRECTORY + '_entry'
_SCRATCH_ZIP = '_temp'


def _entry_html(term, fields):
    """Render one dictionary entry as the HTML snippet stored in the database.

    term   -- the headword (first tab-separated column of the input line)
    fields -- the remaining columns of the line, in file order

    Empty columns are skipped.  Column 1 is expanded through wordType(),
    column 3 is rendered as the author note, and every other column is
    emitted stripped and followed by a line break.
    """
    parts = ['<div class="entry"><b><span class="searchterm-headword">' + term + '</span></b><br/>']
    for position, field in enumerate(fields):
        if field == '':
            continue
        text = field.strip()
        if position == 1:
            parts.append(wordType(text) + '<br/>')
        elif position == 3:
            parts.append('[<small>AUTOR</small>: ' + text + ']')
        else:
            parts.append(text + '<br/>')
    parts.append('</div>')
    return ''.join(parts)


def _zipped_entry(entry_name, html):
    """Return the bytes of a zip archive whose single member holds *html*.

    entry_name -- headword-derived name for the archive member
    html       -- the entry's HTML

    The HTML goes through a fixed scratch file, but the archive member is
    still named as if the file TEMP_DIRECTORY + entry_name had been added
    (ZipFile.write normalises an explicit arcname the same way it
    normalises the file name), so the stored member name does not depend
    on what the file system accepts as a file name.
    """
    # On Python 2 the HTML already is a byte string; on Python 3 it is text
    # and has to be encoded before it can be written to a binary file.
    if not isinstance(html, bytes):
        html = html.encode('utf-8')
    with open(_SCRATCH_HTML, 'wb') as scratch:
        scratch.write(html)
    with zipfile.ZipFile(_SCRATCH_ZIP, mode='w') as archive:
        archive.write(_SCRATCH_HTML, arcname=TEMP_DIRECTORY + entry_name)
    with open(_SCRATCH_ZIP, 'rb') as packed:
        return packed.read()


def _remove_scratch_files():
    """Delete the scratch files (if any were created) and the temp directory."""
    for leftover in (_SCRATCH_HTML, _SCRATCH_ZIP):
        # Nothing is created when the input file has no lines.
        if os.path.exists(leftover):
            os.remove(leftover)
    os.rmdir(TEMP_DIRECTORY)


def _convert():
    """Rebuild OUTPUT_DB from DICTIONATY_FILE.

    Returns the number of input lines read.  Every line becomes one row of
    tblWords whose _id is the 1-based line number.  Consecutive lines with
    the same headword are stored as 'term[1]', 'term[2]', ...

    Raises whatever open(), sqlite3 or zipfile raise; in that case the
    database connection is still closed, and the scratch files are left
    in place (they are only removed after a successful run).
    """
    line_count = 0
    con = sqlite3.connect(OUTPUT_DB)
    try:
        con.text_factory = str
        cur = con.cursor()

        if not os.path.exists(TEMP_DIRECTORY):
            os.makedirs(TEMP_DIRECTORY)

        # The input is opened before any table is dropped, so a missing
        # dictionary file does not wipe an existing database.
        with open(DICTIONATY_FILE, 'r') as source:
            # delete previous tables
            cur.execute('DROP TABLE IF EXISTS android_metadata')
            cur.execute('DROP TABLE IF EXISTS tblWords')

            # create tables
            cur.execute('CREATE TABLE "android_metadata"("locale" TEXT)')
            cur.execute('CREATE TABLE "tblWords"(_id INTEGER PRIMARY KEY AUTOINCREMENT, "term" TEXT, "description" BLOB)')

            duplicate_count = 1
            # None (not '') so that an empty first headword is not mistaken
            # for a repetition of a previous line.
            previous_term = None

            for line in source:
                line_count += 1

                # NOTE(review): the line is split as-is, so a line without
                # any tab yields a headword that still carries the newline.
                fields = line.split('\t')
                term = fields.pop(0)
                html = _entry_html(term, fields)

                # number consecutive duplicates: term[2], term[3], ...
                if term == previous_term:
                    duplicate_count += 1
                    stored_term = term + '[' + str(duplicate_count) + ']'
                else:
                    duplicate_count = 1
                    stored_term = term

                blob = _zipped_entry(stored_term.replace('/', ''), html)
                cur.execute(
                    'INSERT INTO tblWords (_id, term, description) VALUES(?, ?, ?)',
                    (line_count, stored_term, sqlite3.Binary(blob)))

                # On the first repetition the previous row (the first
                # occurrence) is renamed to term[1].  Bound parameters keep
                # headwords containing quotes from breaking the statement.
                if duplicate_count == 2:
                    cur.execute(
                        'UPDATE tblWords SET term=? WHERE _id=?',
                        (term + '[1]', line_count - 1))

                previous_term = term

                if line_count % STEP == 0:
                    print('# current line = %d' % line_count)

        # create term_index
        cur.execute('CREATE INDEX term_index on tblWords (term ASC)')
        # Commit explicitly instead of relying on the sqlite3 module
        # committing implicitly before a DDL statement.
        con.commit()
    finally:
        con.close()

    _remove_scratch_files()
    return line_count


def main():
    """Run the conversion and report how many lines were processed."""
    print('Converting dictionary...')
    converted = _convert()
    print('Done. ' + str(converted) + ' lines converted.')


if __name__ == '__main__':
    main()