-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert.py
executable file
·194 lines (174 loc) · 6.61 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python
import datetime
import re
import sys
from xml.sax.saxutils import escape
# Define a MalformedLine exception
class MalformedLine(Exception):
    """Raised when a line is not in any format the converter understands.

    Deriving from Exception makes the class a proper exception type, so it
    can be raised and caught on every Python version instead of relying on
    Python 2's legacy support for raising old-style classes.
    """
# Some helper methods
def make_header(title):
    """Return the opening part of a KVTML 2.0 document as a string.

    The header consists of the XML declaration, the doctype, the
    <information> block (generator, title and today's date), two
    identifiers named "Term" and "Definition", and the opening <entries>
    tag.  The caller appends the entries and the closing tags.

    title -- plain-text title of the collection.  XML special characters
             (&, <, >) are escaped so the result stays well-formed.
    """
    # ISO 8601 (YYYY-MM-DD) date of the conversion
    time_string = datetime.date.today().isoformat()
    header_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        # The doctype name has to match the root element, which is <kvtml>
        '<!DOCTYPE kvtml PUBLIC "kvtml2.dtd" "http://edu.kde.org/kvtml/kvtml2.dtd">',
        '<kvtml version="2.0">',
        '<information>',
        '<generator>wikicode-to-kvtml-convertor (NEEDS A NAME)</generator>',
        '<title>' + escape(title) + '</title>',
        '<date>' + time_string + '</date>',
        '</information>',
        '<identifiers>',
        '<identifier id="0">',
        '<name>Term</name>',
        '<locale>en</locale>',
        '</identifier>',
        '<identifier id="1">',
        '<name>Definition</name>',
        '<locale>en</locale>',
        '</identifier>',
        '</identifiers>',
        '<entries>',
    ]
    # Every line, including the last one, is newline-terminated
    return '\n'.join(header_lines) + '\n'
# Give it a line, will return the term and definition (tuple)
# If it can't, it will raise a MalformedLine exception
def format_line(line):
    """Split one wikicode list line into a (term, definition) tuple.

    SUPPORTED FORMATS: (whitespace around the term and around the
    definition is not significant)
        *'''term''': definition   (' or '' instead of ''' also possible)
        *'''term''' definition    (same as above)
        ; term : definition
        *term: definition

    The colon formats are tried first, and the first ':' on the line is
    the separator; any later colons belong to the definition.  In the
    colon-less format the closing quotes act as the separator, so the
    term itself must not contain single quotes.

    line -- the line to parse; it must start with the '*' or ';' marker.

    Returns the tuple (term, definition).
    Raises MalformedLine if the line is in none of the formats above.
    """
    # "*term: definition" and "; term : definition" only differ in their
    # marker character, so both are split at the first colon.
    if re.match(r"^\*[^\*:]+:", line) or re.match(r"^;[^;:]+:", line):
        head, _, tail = line[1:].partition(':')
        # Drop surrounding whitespace first, then any wiki quote marks
        term = head.strip().strip("'")
        return term, tail.strip()

    # "*'''term''' definition": one to three quotes on each side of the
    # term, everything after the closing quotes is the definition.
    quoted = re.match(r"^\*\s*'{1,3}([^']+)'{1,3}(.*)$", line)
    if quoted:
        return quoted.group(1).strip(), quoted.group(2).strip()

    # Not in any of the supported formats
    raise MalformedLine
# Command line: ./convert.py inputfile outputfile
# Both names are required; anything after them is ignored.
# (The .kvtml extension of the output name is enforced further down.)
if len(sys.argv) <= 2:
    print('Usage: ./convert.py inputfile outputfile')
    print('Check the readme file for more info and where to get help')
    sys.exit(1)
# Remember which file to read and which one to write
input_filename, output_filename = sys.argv[1:3]
# Read the whole input up front.  The handle is closed as soon as the
# lines are in memory, and a failure while opening or reading the file
# ends the program with an error message.
try:
    with open(input_filename, 'r') as input_file:
        lines = input_file.readlines()
except IOError:
    sys.exit("The input file could not be read!")
# If the title is not saved in the file, prompt the user for it
# For the title to be saved, it must be the first thing in the file
# Like this: =Title= with that being the only thing on the line
# ==Title== or ===Title=== is also valid or even ==Title= etc
# An empty file has no first line at all; treat it as "no title given".
# Stripping first keeps a trailing "\r" or space from hiding the title.
first_line = lines[0].strip() if lines else ''
if re.match(r"^=+[^=]+=+$", first_line):
    # The title is whatever is left once the = markers are removed
    title = first_line.strip('=')
    need_first_line = False  # First line is the title
else:
    print('The input file had no associated title.')
    print("Please enter a title (enter nothing for 'Untitled').")
    title = raw_input('Desired title: ')
    if not title:
        # Default title - Untitled if none entered
        title = 'Untitled'
    need_first_line = True  # As the first line is not the title
# The result always goes to a *.kvtml file: append the extension when the
# name given on the command line lacks it.
if not output_filename.endswith('.kvtml'):
    output_filename += '.kvtml'
# Create the output file; once it is open, the header is the first thing
# written to it.
try:
    output_file = open(output_filename, 'w')
except IOError:
    sys.exit("The output file could not be opened for writing!")
else:
    output_file.write(make_header(title))
# Convert the file line by line.  Every line is matched against the
# supported formats on its own, so formats may be mixed within one file.
# Lines that cannot be parsed are remembered (0-based index) and reported
# once the rest of the file has been converted.
bad_lines = []
for i, line in enumerate(lines):
    # Strip it of whitespace characters
    line = line.strip()
    if i == 0 and not need_first_line:
        # The first line held the title, it is not an entry
        continue
    if not line:
        # Blank lines, ignore
        continue
    try:
        term, definition = format_line(line)
    except MalformedLine:
        bad_lines.append(i)
        continue
    # The entry id is the 0-based index of the source line.  Both texts
    # are XML-escaped so that &, < and > cannot break the document.
    output_file.write(
        '<entry id="%d">\n'
        '<translation id="0">\n'
        '<text>%s</text>\n'
        '</translation>\n'
        '<translation id="1">\n'
        '<text>%s</text>\n'
        '</translation>\n'
        '</entry>\n' % (i, escape(term), escape(definition)))
# Now write the footer to the output file
output_file.write('</entries>\n</kvtml>')
# Done with both files ...
input_file.close()
output_file.close()
print("Proccesing complete!")
# Conversion is best-effort: the lines that could be converted have been
# written, the ones that could not are listed here.
if bad_lines:
    # bad_lines holds 0-based indexes; people count lines from 1.
    # Joining the numbers puts a comma between them but none at the end.
    bad_line_numbers = ', '.join([str(index + 1) for index in bad_lines])
    print("However, there were %d lines that were not in a supported format: lines %s"
          % (len(bad_lines), bad_line_numbers))