txt2tags-0.9.py

#!/usr/bin/python
import re, string, os, sys, getopt
from time import strftime,time,localtime

# --- program identity -------------------------------------------------------
my_version = '0.9'
my_url = 'http://txt2tags.sourceforge.net'
my_email = 'aurelio@verde666.org'

# Supported targets. The position of a target in this list is the column
# index used to pick its entry from every TAG* table.
tags = ['txt', 'sgml', 'html', 'pm6', 'mgp', 'moin', 'man']

# --- option defaults (overridden by the command line) -----------------------
f_noheaders = 0
f_enumtitle = 0
f_maskemail = 0
f_toconly = 0
f_toc = 0
splitlevel = ''
lang = 'english'
doctype = ''
outfile = ''
pipefileid = '-'   # file name meaning "use STDIN/STDOUT"

# --- user-visible texts -----------------------------------------------------
versionstr = "txt2tags version %s <%s>" % (my_version, my_url)
# The second placeholder receives the target names as "a, b, c".
usage = """
%s

usage: txt2tags -t <type> [OPTIONS] file.t2t
       txt2tags -t html -s <split level> -l <lang> file.t2t

  -t, --type       target document type. actually supported:
                   %s

      --stdout     by default, the output is written to file.<type>
                   with this option, STDOUT is used (no files written)
      --noheaders  suppress header, title and footer information
      --enumtitle  enumerate all title lines as 1, 1.1, 1.1.1, etc
      --maskemail  hide email from spam robots. x@y.z turns to <x (a) y z>

      --toc        add TOC (Table of Contents) to target document
      --toconly    print document TOC and exit

  -h, --help       print this help information and exit
  -V, --version    print program version and exit

extra options for HTML target (needs sgml-tools):
  -s, --split      split documents. values: 0, 1, 2 (default 0)
  -l, --lang       document language (default english)
""" % (versionstr, ', '.join(tags))

def Quit(msg, exitcode=0):
	"""Print msg on standard output and terminate the program.

	exitcode is handed to sys.exit(); it defaults to 0 (success).
	"""
	print(msg)
	sys.exit(exitcode)

# get cmdline options
errormsg = 'ERROR: bad option or missing argument (try --help)'
# 's:' and 'l:' are the short forms of --split and --lang shown in the
# usage text; both take an argument.
try:
	(opt, args) = getopt.getopt(sys.argv[1:], 'hVt:s:l:',
	      ['help', 'version', 'stdout', 'type=', 'split=', 'lang=',
	       'noheaders', 'enumtitle', 'maskemail', 'toc', 'toconly'])
except getopt.GetoptError: Quit(errormsg, 1)
for o in opt:
	if   o[0] == '--noheaders': f_noheaders = 1
	elif o[0] == '--enumtitle': f_enumtitle = 1
	elif o[0] == '--maskemail': f_maskemail = 1
	elif o[0] == '--toc'      : f_toc = 1
	elif o[0] == '--toconly'  : outfile = pipefileid ; f_toconly = 1
	elif o[0] == '--stdout'   : outfile = pipefileid
	elif o[0] in ['-s','--split']: splitlevel = o[1]
	elif o[0] in ['-l','--lang']: lang = o[1]
	elif o[0] in ['-t','--type']: doctype = o[1]
	elif o[0] in ['-h','--help']: Quit(usage)
	elif o[0] in ['-V','--version']: Quit(versionstr)

if not doctype and f_toconly: doctype = 'txt' # toconly defaults to txt
if not args or not doctype: Quit(usage, 1)    # no filename/doctype
infile = args[0]   # set source txt file name

# sanity check: validate target type
# T is the column index used to read every TAG* table
try: T = tags.index(doctype)
except ValueError:
	Quit("ERROR: invalid document type '%s' (try --help)"%(doctype),1)

# sanity check: file exists?
if infile != pipefileid and not os.path.isfile(infile):
	Quit('ERROR: file not found: %s'%infile, 1)

# sanity check: validate split level
if doctype != 'html': splitlevel = '' # only valid for HTML target
if splitlevel:
	# check splitlevel value: the whole string must be one of the allowed
	# levels (it is later pasted into the sgml2html command line), and a
	# bad value is an error, so exit with a non-zero status
	if splitlevel not in ('0', '1', '2'):
		Quit('ERROR: option --split must be 0, 1 or 2', 1)
	doctype = 'sgml'  # 1st do a sgml, then sgml2html
	# keep the tag index in sync with the new target, otherwise the body
	# would be written with HTML tags under an SGML header
	T = tags.index(doctype)
	outfile = ''      # --stdout is forbidden
	# check for sgml-tools
	#TODO how to test (in a clever way) if an executable is in path?
	# Quit("Sorry, you must have 'sgml2html' program to use --split")

# set outfile name
if not outfile:
	# file.t2t / file.txt -> file.<doctype>, written to the current directory
	outfile = re.sub(r'\.(txt|t2t)$','',os.path.basename(infile))
	outfile = "%s.%s"%(outfile,doctype)
if infile == pipefileid: outfile = pipefileid

# sanity check: source loss!
if infile != pipefileid and infile == outfile:
	warn = 'SUICIDE WARNING!!!   (try --stdout)\n  '
	Quit("%ssource and target files has the same name: %s"%(warn,outfile),1)
### yes, i've got my sample.t2t file deleted before add this test... :/


### all the registered tags
# Every TAG* list has one entry per target, in the same order as 'tags':
#   [txt, sgml, html, pm6, mgp, moin, man]
# so TAGname[T] is the markup for the current target. The BEL character
# (\a) inside a tag is a placeholder that is later replaced by the real
# text through re_x.sub(). An empty string means the target has no tag
# for that element.
TAGparagraph = ['', '<p>', '<P>', '<@Normal:>', '%font "normal", size 5\n', '', '.P']
TAGtitle1 = ['  \a'      , '<sect>\a<p>' , '<H1>\a</H1>', '\n<@Title1:>\a', '%page\n\n\a', '= \a =', '.SH \a']
TAGtitle2 = ['\t\a'      , '<sect1>\a<p>', '<H2>\a</H2>', '\n<@Title2:>\a', '%page\n\n\a', '== \a ==', '.SS \a']
TAGtitle3 = ['\t\t\a'    , '<sect2>\a<p>', '<H3>\a</H3>', '\n<@Title3:>\a', '%page\n\n\a', '=== \a ===', '.SS \a']
TAGtitle4 = ['\t\t\t\a'  , '<sect3>\a<p>', '<H4>\a</H4>', '\n<@Title4:>\a', '%page\n\n\a', '==== \a ====', '.SS \a']
TAGtitle5 = ['\t\t\t\t\a', '<sect4>\a<p>', '<H5>\a</H5>', '\n<@Title5:>\a', '%page\n\n\a', '===== \a =====', '.SS \a']
TAGareaPreOpen = ['',  '<tscreen><verb>',   '<PRE>', '<@PreFormat:>', '\n%font "mono"', '{{{', '.nf']
TAGareaPreClose = ['', '</verb></tscreen>', '</PRE>', '', '%font "normal"', '}}}', '.fi\n']
TAGareaQuoteOpen = ['    ',   '<quote>', '<BLOCKQUOTE>', '<@Quote:>', '%prefix "       "', ' ', '\n']
TAGareaQuoteClose = ['', '</quote>', '</BLOCKQUOTE>', '', '%prefix "  "', '', '\n']
TAGfontMonoOpen  = ['',  '<tt>',  '<CODE>', '<FONT "Lucida Console"><SIZE 9>', '\n%cont, font "mono"\n', '{{{', '']
TAGfontMonoClose = ['', '</tt>', '</CODE>', '<SIZE$><FONT$>', '\n%cont, font "normal"\n', '}}}', '']
TAGfontBoldOpen  = ['',  '<bf>',  '<B>', '<B>', '\n%cont, font "normal-b"\n', "'''", r'\\fB']
TAGfontBoldClose = ['', '</bf>', '</B>', '<P>', '\n%cont, font "normal"\n', "'''", r'\\fP']
TAGfontItalicOpen  = ['',  '<em>',  '<I>', '<I>', '\n%cont, font "normal-i"\n', "''", r'\\fI']
TAGfontItalicClose = ['', '</em>', '</I>', '<P>', '\n%cont, font "normal"\n', "''", r'\\fP']
TAGfontBoldItalicOpen  = ['',  '<bf><em>',   '<B><I>', '<B><I>', '\n%cont, font "normal-bi"\n', "'''''", '\n.BI ']
TAGfontBoldItalicClose = ['', '</em></bf>', '</I></B>', '<P>',   '\n%cont, font "normal"\n', "'''''", '\n\\&']
# sgml and moin have no underline: they reuse the bold-italic tags
TAGfontUnderlineOpen = ['', TAGfontBoldItalicOpen[1], '<U>', '<U>', '\n%cont, fore "cyan"\n', TAGfontBoldItalicOpen[5], '']
TAGfontUnderlineClose = ['', TAGfontBoldItalicClose[1], '</U>', '<P>', '\n%cont, fore "white"\n', TAGfontBoldItalicClose[5], '']
# man lists are wrapped in the man preformatted-area tags
TAGlistOpen     = ['', '<itemize>', '<UL>', '<@Bullet:>', '', '', '\n'+TAGareaPreOpen[6]]
TAGlistClose    = ['', '</itemize>', '</UL>', '', '', '', TAGareaPreClose[6]]
TAGlistItem     = ['- ', '<item>', '<LI>', '•	', '', '* ', '* ']
TAGnumlistOpen  = ['', '<enum>', '<OL>', '<@Bullet:>', '', '', '\n'+TAGareaPreOpen[6]]
TAGnumlistClose = ['', '</enum>', '</OL>', '', '', '', TAGareaPreClose[6]]
TAGnumlistItem  = ['\a. ', '<item>', '<LI>', '~U    ', '\a. ', '\a. ', '\a. ']
TAGdeflistOpen  = ['', '', '<DL>'       , '', '', '', '']
TAGdeflistItem1 = ['', '', '<DT>\a</DT>', '', '', '', '']
TAGdeflistItem2 = ['', '', '<DD>'       , '', '', '', ''] #TODO must close?
TAGdeflistClose = ['', '', '</DL>'      , '', '', '', '']
TAGbar1 = ['\a', '<!-- \a -->', '<HR NOSHADE SIZE=1>', '\a', '%bar "white" 5', '----', '\n\n']
TAGbar2 = ['\a', '<!-- \a -->', '<HR NOSHADE SIZE=5>', '\a', '%pause', '----', '\n\n']
# links: the *Mark variants are for named links and carry two placeholders
TAGurl = ['\a', '<htmlurl url="\a" name="\a">', '<A HREF="\a">\a</A>', TAGfontUnderlineOpen[3]+'\a'+TAGfontUnderlineClose[3], '\n%cont, fore "cyan"\n\a\n%cont, fore "white"\n', '[\a]', '\a']
TAGurlMark = ['\a (\a)', TAGurl[1], TAGurl[2], '\a '+TAGurl[3], '\a '+TAGurl[4], '[\a \a]', '\a (\a)']
TAGemail = ['\a', '<htmlurl url="mailto:\a" name="\a">', '<A HREF="mailto:\a">\a</A>', '\a', TAGurl[4], '[\a]', '\a']
TAGemailMark = ['\a (\a)', TAGemail[1], TAGemail[2], '\a '+TAGemail[3], '\a '+TAGemail[4], '[\a \a]', '\a (\a)']
TAGimg = ['[\a]', '<figure><ph vspace=""><img src="\a"></figure>', '<IMG ALIGN="\a" SRC="\a">', '\a', '\n%center\n%newimage "\a", left\n', '[\a]', '\a']
TAGtableOpen     = [ '', '<table><tabular ca="c">', '<table align=center cellpadding=4 border=\a>', '', '', '', '']
TAGtableLineOpen = [ '', '', '<tr>', '', '', '||', '']
TAGtableLineClose = [ '', '<rowsep>', '</tr>', '', '', '', '']
TAGtableCellOpen = [ '', '', '<td>', '', '', '', '']
TAGtableCellClose = [ '', '<colsep>', '</td>', '', '', '||', '']
TAGtableTitleCellOpen = [ '', '', '<th>', '', '', '', '']
TAGtableTitleCellClose = [ '', '<colsep>', '</th>', '', '', '||', '']
TAGtableClose = [ '', '</tabular></table>', '</table>', '', '', '', '']
TAGanchor = ['', '', '<a name="\a">', '', '', '', '']
TAGEOD = ['', '</article>', '</BODY></HTML>', '', '%%EOD', '', '']


### the cool regexes
# All patterns below are applied to one source line at a time.

# title: 1 to 5 '=' on each side (same count, via \1), optional [label]
re_title = re.compile(r'^\s*(?P<tag>={1,5})(?P<txt>[^=].*[^=])\1(\[(?P<label>\w+)\])?$')
# a line holding only '---' both opens and closes a preformatted area
re_areaPreOpen = re_areaPreClose = re.compile(r'^---$')
# quote: one or more leading TABs
re_quote = re.compile(r'^\t+')
re_1linePreOld = re.compile(r'^ {4}([^\s-])')
# one-line preformatted text: line starts with '--- '
re_1linePre = re.compile(r'^--- ')
# inline beautifiers: `mono`, **bold**, //italic//, __underline__, */bolditalic/*
re_mono = re.compile(r'`([^`]+)`')
re_bold = re.compile(r'\*\*([^\s*].*?)\*\*')
# group 1 keeps the char before '//' so that 'proto://' is not taken as italic
re_italic = re.compile(r'(^|[^:])//([^ /].*?)//')
re_underline = re.compile(r'__([^_].*?)__') # underline lead/trailing blank
re_bolditalic = re.compile(r'\*/([^/].*?)/\*')
# list items: groups are (indent)(item mark)(first char / definition term)
re_list    = re.compile(r'^( *)([+-]) ([^ ])')
re_deflist = re.compile(r'^( *)(=) ([^:]+):')
# separator bar: 20 or more of '_', '=' or '-' alone on the line
re_bar =re.compile(r'^\s*([_=-]{20,})\s*$')
# table row: '|' or '||', optional <:> chars, then a blank
re_table = re.compile(r'^ *\|\|?[<:>]*\s')

# link things
# Building blocks for the URL/email patterns. 'proto' and 'guess' are
# complete sub-patterns; the other entries are character-class contents
# meant to be placed between [ and ].
urlskel = {
  'proto' : r'(https?|ftp|news|telnet|gopher|wais)://',
  'guess' : r'(www[23]?|ftp)\.',   # w/out proto, try to guess
  'login' : r'A-Za-z0-9_.-',       # for ftp://login@domain.com
  'pass'  : r'[^ @]*',             # for ftp://login:password@domain.com
  'chars' : r'A-Za-z0-9%._/~:,=-', # %20(space), :80(port)
  'anchor': r'A-Za-z0-9%.-',       # %nn(encoded)
  'form'  : r'A-Za-z0-9/%&=+.@*_-',# .@*_-(as is)
  'punct' : r'.,;:!?'
}
# optional 'login@' or 'login:password@' prefix
patt_url_login = r'([%s]+(:%s)?@)?'%(urlskel['login'],urlskel['pass'])
# full URL: (proto + optional login | guessed host prefix), address chars,
# then an optional #anchor or ?form part
retxt_url = r'\b(%s%s|%s)[%s]+(#[%s]+|\?[%s]+)?(?=[%s]|[^%s]|$)\b'%(
             urlskel['proto'],patt_url_login, urlskel['guess'],
             urlskel['chars'],urlskel['anchor'],
             urlskel['form'] ,urlskel['punct'],urlskel['form'])
# local link: plain address chars, or an address with a trailing #anchor
retxt_url_local = r'[%s]+|[%s]*(#[%s]+)'%(
             urlskel['chars'],urlskel['chars'],urlskel['anchor'])
# email address with an optional ?form part
retxt_email = r'\b[%s]+@([A-Za-z0-9_-]+\.)+[A-Za-z]{2,4}(\?[%s]+)?\b'%(
             urlskel['login'],urlskel['form'])
# bare URL or email anywhere in the line (case-insensitive)
re_link = re.compile(r'%s|%s'%(retxt_url,retxt_email), re.I)
# named link: [label link], group 1 is the label and group 2 the link
re_linkmark = re.compile(r'\[([^]]*) (%s|%s|%s)\]'%(
             retxt_url, retxt_email, retxt_url_local))

# the BEL placeholder used inside the TAG* entries
re_x = re.compile('\a')
re_blankline = re.compile(r'^\s*$')
# comment line: starts with '//'
re_comment = re.compile(r'^//')
# %%date or %%date(format) macro; the 'fmt' group holds the format text
re_date = re.compile(r'%%date\b(\((?P<fmt>.*?)\))?', re.I)
# image mark: [filename.ext] with one of the listed extensions
re_img = re.compile(r'\[([\w_,.+%$#@!?+~/-][\w_,.+%$#@!?+~/ -]+\.(png|jpe?g|gif|eps|bmp))\]', re.L+re.I)


def doHeader(title, author, date):
	"""Build the document header for the current target.

	title, author and date are stripped of surrounding whitespace and
	placed in the header layout selected by the module-level 'doctype'.
	Returns a list of strings; the list is empty for targets that have
	no header layout here.
	"""
	title, author, date = title.strip(), author.strip(), date.strip()

	if doctype == 'txt':
		return ["%s\n%s\n%s"%(title,author,date)]

	if doctype == 'sgml':
		return [
			"<!doctype linuxdoc system>\n<article>",
			"<title>%s\n<author>%s\n<date>%s\n" %(title,author,date),
		]

	if doctype == 'html':
		return [
			'<HTML>\n<HEAD><TITLE>%s</TITLE></HEAD>'%title,
			'<BODY BGCOLOR="white" TEXT="black">',
			'<P ALIGN="center"><CENTER><H1>%s</H1>'%title,
			'<FONT SIZE=4><I>%s</I><BR>'%author,
			'%s</FONT></CENTER>\n'%date,
		]

	if doctype == 'man':
		# TODO man section 1 is hardcoded...
		return ['.TH "%s" 1 %s "%s"'%(title,date,author)]

	if doctype == 'pm6':
		# fixed style sheet; title, author and date are not used here
		# TODO style to <HR>
		# TODO unix2dos before apply
		return ["""\
<PMTags1.0 win><C-COLORTABLE ("Preto" 1 0 0 0)
><@Normal=
  <FONT "Times New Roman"><CCOLOR "Preto"><SIZE 11>
  <HORIZONTAL 100><LETTERSPACE 0><CTRACK 127><CSSIZE 70><C+SIZE 58.3>
  <C-POSITION 33.3><C+POSITION 33.3><P><CBASELINE 0><CNOBREAK 0><CLEADING -0.05>
  <GGRID 0><GLEFT 7.2><GRIGHT 0><GFIRST 0><G+BEFORE 7.2><G+AFTER 0>
  <GALIGNMENT "justify"><GMETHOD "proportional"><G& "ENGLISH">
  <GPAIRS 12><G% 120><GKNEXT 0><GKWIDOW 0><GKORPHAN 0><GTABS $>
  <GHYPHENATION 2 34 0><GWORDSPACE 75 100 150><GSPACE -5 0 25>
><@Bullet=<@-PARENT "Normal"><FONT "Abadi MT Condensed Light">
  <GLEFT 14.4><G+BEFORE 2.15><G% 110><GTABS(25.2 l "")>
><@PreFormat=<@-PARENT "Normal"><FONT "Lucida Console"><SIZE 8><CTRACK 0>
  <GLEFT 0><G+BEFORE 0><GALIGNMENT "left"><GWORDSPACE 100 100 100><GSPACE 0 0 0>
><@Title1=<@-PARENT "Normal"><FONT "Arial"><SIZE 14><B>
  <GCONTENTS><GLEFT 0><G+BEFORE 0><GALIGNMENT "left">
><@Title2=<@-PARENT "Title1"><SIZE 12><G+BEFORE 3.6>
><@Title3=<@-PARENT "Title1"><SIZE 10><GLEFT 7.2><G+BEFORE 7.2>
><@Title4=<@-PARENT "Title3">
><@Title5=<@-PARENT "Title3">
><@Quote=<@-PARENT "Normal"><SIZE 10><I>>
"""]

	if doctype == 'mgp':
		preamble = """\
#!/usr/X11R6/bin/mgp -t 90
%deffont "normal"    xfont "utopia-medium-r", charset "iso8859-1"
%deffont "normal-i"  xfont "utopia-medium-i", charset "iso8859-1"
%deffont "normal-b"  xfont "utopia-bold-r",   charset "iso8859-1"
%deffont "normal-bi" xfont "utopia-bold-i",   charset "iso8859-1"
%deffont "mono"     xfont "courier-medium-r", charset "iso8859-1"
%default 1 size 5
%default 2 size 8, fore "yellow", font "normal-b", center
%default 3 size 5, fore "white",  font "normal", left, prefix "  "
%tab 1 size 4, vgap 30, prefix "     ", icon arc "red" 40, leftfill
%tab 2 prefix "            ", icon arc "orange" 40, leftfill
%tab 3 prefix "                   ", icon arc "brown" 40, leftfill
%tab 4 prefix "                          ", icon arc "darkmagenta" 40, leftfill
%tab 5 prefix "                                ", icon arc "magenta" 40, leftfill
%%%%%%%%%%%%%%%%%%%%%%%%%% end of headers %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%page
"""
		# the preamble is followed by the 1st title page
		return [
			preamble,
			'\n\n\n\n%%size 10, center, fore "yellow"\n%s'%title,
			'\n%%font "normal-i", size 6, fore "white", center',
			'%s\n\n%%font "mono", size 7, center\n%s'%(author,date),
		]

	return []

def doCommentLine(txt):
	"""Return txt wrapped as a comment line for the current target.

	The comment syntax is chosen by the module-level 'doctype'. Targets
	without a comment syntax here get an empty string.
	"""
	if doctype in ('sgml', 'html'):
		return "<!-- %s -->"%txt
	if doctype == 'mgp':
		return "%%%% %s"%txt
	if doctype == 'man':
		return '.\\" %s'%txt
	return ''

def doFooter():
	"""Return the footer lines for the current target.

	The result is a 3-item list: a comment naming the generator (with a
	leading newline), a comment with the command line arguments, and the
	target's end-of-document tag.
	"""
	generator = '%s code generated by txt2tags %s (%s)'%(doctype,my_version,my_url)
	invocation = 'cmdline: txt2tags %s'%(' '.join(sys.argv[1:]))
	return [
		'\n'+doCommentLine(generator),
		doCommentLine(invocation),
		TAGEOD[T],
	]

def doEscape(txt):
	"""Escape the characters of txt that are special for the current target.

	The rules are chosen by the module-level 'doctype'. Targets without
	rules here get txt back unchanged.
	"""
	if doctype in ('html', 'sgml'):
		# '&' goes first, so the entities added next are not escaped again
		for special, entity in (('&','&amp;'), ('<','&lt;'), ('>','&gt;')):
			txt = txt.replace(special, entity)
		if doctype == 'sgml':
			txt = txt.replace('ÿ','&yuml;')
	elif doctype == 'pm6':
		txt = txt.replace('<', r'<\#60>')
	elif doctype == 'mgp':
		# a line starting with a single '%' would be read as a directive
		txt = re.sub('^%([^%])','%prefix ""\n  %\n%cont, prefix "  "\n\\1',txt)
	elif doctype == 'man':
		# a leading dot is the command ID
		if txt.startswith('.'):
			txt = ' '+txt
	return txt

def doEscapeEscape(txt):
	"""Replace every backslash that sits right before '<' by '<\\#92>'.

	One replacement can leave a new backslash next to a '<' (when the
	source had several backslashes in a row), so the replacement is
	repeated until no such pair is left.
	"""
	escaped_open = '\\<'
	while escaped_open in txt:
		txt = txt.replace(escaped_open, r'<\#92><')
	return txt

def addLineBreaks(list):
	"""Return a new list holding every item of 'list' with '\\n' appended."""
	return [item+'\n' for item in list]


################################################################################
###MerryChristmas,IdontwanttofighttonightwithyouImissyourbodyandIneedyourlove###
################################################################################


def doitall(inlines, doctype):
	"""Convert the source lines to the target markup, one line at a time.

	inlines is the list of source lines. doctype is the target name used
	for the target-specific branches of this function; the TAG* tables are
	read with the module-level index T, and helpers such as doEscape()
	and doHeader() read the module-level doctype.

	Besides T, the function reads the module-level settings outfile,
	pipefileid, f_noheaders, f_enumtitle, f_toc and f_maskemail. It prints
	a progress line when the output is not STDOUT.

	Returns the tuple (header, toclist, ret):
	  header   the lines built by doHeader() (an empty list if it was
	           never called)
	  toclist  one list-item line per title found
	  ret      the converted body lines, followed by the footer unless
	           f_noheaders is set
	"""
	# the defaults
	title = 'document title'
	author = 'author name'
	currdate = strftime('%Y%m%d',localtime(time()))    # ISO current date
	date = currdate
	# placeholders that stand for links and inline mono text while the
	# line is escaped and formatted; the real text is put back at the end
	linkmask = '@@_@_@@'
	monomask = '@@_m_@@'
	
	ret = []
	toclist = []
	header = []
	f_tt = 0               # 1 while inside a PRE-formatted area
	listident = []         # indent string of each open (sub)list
	listids = []           # item mark ('-', '+' or '=') of each open (sub)list
	listcount = []         # item counter of each open (sub)list
	titlecount = ['',0,0,0,0,0]   # per-level title counters (index 0 unused)
	f_header = 0           # 1 while the header lines are being read
	f_lastblank = 0        # 1 if the previous line was blank (or a comment)
	holdspace = ''         # text held back to prefix the next output line
	listholdspace = ''
	quotedepth = 0
	istable = 0
	tableborder = 0
	tablealign = []
	
	if outfile != pipefileid:
		print "--- %s..."%doctype
	
	# let's mark it up!
	linenr = 0
	for lineref in range(len(inlines)):
		skip_continue = 0
		urlbank = [] ; emailbank = []
		linkbank = []      # [label, link] pairs masked out of this line
		monobank = []      # escaped inline mono texts masked out of this line
		linenr = lineref +1
		line = string.rstrip(inlines[lineref])
		
		# we need (not really) to mark each paragraph
		if doctype == 'pm6' and f_lastblank:
			if f_tt or f_header or listident: holdspace = ''
			else: holdspace = TAGparagraph[T]+'\n'
	
	# PRE-formatted area
		# we'll never support beautifiers inside pre-formatted
		if f_tt:
			f_lastblank = 0
			line = doEscape(line)
			
			# closing PRE
			if re_areaPreClose.search(line):
				if doctype != 'pm6': ret.append(TAGareaPreClose[T])
				f_tt = 0
				continue
			
			# normal PRE-inside line
			if doctype == 'pm6': line = doEscapeEscape(line)
			elif doctype in ('txt', 'man', 'html'): line = '  '+line # align
			ret.append(line)
			continue
		
		# detecting PRE
		if re_areaPreOpen.search(line):
			line = doEscape(line)
			
			# NOTE(review): this branch looks unreachable, because the
			# 'if f_tt:' block above always ends in 'continue'
			if f_tt:
				warn = "WARNING:%d:opening PRE-formatted tag"%linenr 
				print warn, "without closing previous one"
				ret.append(line)
				continue
			
			ret.append(TAGareaPreOpen[T])
			f_tt = 1
			continue
	
	# one line PRE-formatted text
		if re_1linePre.search(line):
			f_lastblank = 0
			line = doEscape(line)
			line = re_1linePre.sub('',line)
			if   doctype == 'pm6': line = doEscapeEscape(line)
			if doctype in ('txt', 'man', 'html'): line = '  '+line  # align
			ret.append('%s\n%s\n%s'%(TAGareaPreOpen[T],line,TAGareaPreClose[T]))
			continue
	
	# blank lines
		#TODO "holdspace" to save <p> to not show in closelist
		if re_blankline.search(line):
		
			# a blank line ends the table; istableaware was set by the
			# table section below when the table was opened
			if istable:
				if istableaware: ret.append(TAGtableClose[T])
				else: ret.append(TAGareaPreClose[T])
				istable = tableborder = 0
				continue
			
			#TODO generic class or function to close quotes/lists/tables
			#     when entering pre,list,table,etc
			# closing quotes
			while quotedepth:
				quotedepth = quotedepth-1
				ret.append(TAGareaQuoteClose[T])
			
			if f_lastblank:      # 2nd consecutive blank line
				if listident:    # closes list (if any)
					while len(listident):
						if   listids[-1] == '-': tag = TAGlistClose[T]
						elif listids[-1] == '+': tag = TAGnumlistClose[T]
						elif listids[-1] == '=': tag = TAGdeflistClose[T]
						if not tag: tag = TAGlistClose[T] # default
						if tag: # man tags just for mother-list and at ^
							if doctype == 'man':
								if len(listident) == 1: ret.append(tag)
							else: ret.append(listident[-1]+tag)
						del listident[-1]
						del listids[-1]
						# add visual separator line for the mother list
						if not listids and doctype == 'txt': ret.append('\n')
					holdspace = ''
				continue         # consecutive blanks are trash
			
			if f_header or linenr == 1:  # 1st blank after header (if any)
				if not f_noheaders: header = doHeader(title,author,date)
				if doctype != 'pm6': ret.append(TAGparagraph[T])
				f_header = 0     # we're done with header
				continue
			
			# normal blank line
			if doctype != 'pm6':
				# paragraph (if any) is wanted inside lists also
				if listident:
					holdspace = holdspace+TAGparagraph[T]+'\n'
				elif doctype == 'html': ret.append(TAGparagraph[T])
				# sgml: the quote close tag must not be \n\n</quote>
				elif doctype == 'sgml' and quotedepth:
					skip_continue = 1
				# otherwise we just print a blank line
				else: ret.append('')
			
			f_lastblank = 1
			if not skip_continue: continue
		else:
			f_lastblank = 0      # reset blank status
	
	# first line with no header
		if f_noheaders and linenr == 1 and doctype != 'pm6':
			ret.append(TAGparagraph[T])
	
	# comments
		# just skip them
		if re_comment.search(line):
			f_lastblank = 1
			continue
	
	
	# protect pre-formatted font text from escaping and formatting
		if not f_tt:
			while re_mono.search(line):
				txt = re_mono.search(line).group(1)
				monobank.append(doEscape(txt))
				line = re_mono.sub(monomask,line,1)
	
	# protect URLs and emails from escaping and formatting
	# changing them by a mask
		if not f_tt:
			while re_linkmark.search(line):    # search for named link
				m = re_linkmark.search(line)
				# remove quotes from old ["" link] tag
				label = re.sub('^"|"$','',m.group(1))
				link = m.group(2)
				linkbank.append([label, link])
				line = re_linkmark.sub(linkmask,line,1)
			
			while re_link.search(line):        # simple url or email
				link = re_link.search(line).group()
				linkbank.append(['', link])
				line = re_link.sub(linkmask,line,1)
	
	# the target-specific special char escapes 
		line = doEscape(line)
	
	
	# HR line
		if re_bar.search(line):
			txt = re_bar.search(line).group(1)
			if txt[0] == '=': bar = TAGbar2[T]
			else            : bar = TAGbar1[T]
			line = re_bar.sub(bar,line)
			ret.append(re_x.sub(txt,line))
			continue
	
	# quote
		if re_quote.search(line):
			currquotedepth = len(re_quote.search(line).group(0)) # TABs number
			if doctype == 'sgml' and quotedepth and currquotedepth > quotedepth:
				currquotedepth = quotedepth
			# targets without a quote close tag just get the open tag
			# repeated once per TAB at the start of the line
			if not TAGareaQuoteClose[T]:
				line = re_quote.sub(TAGareaQuoteOpen[T]*currquotedepth, line)
			else:
				# new (sub)quote
				if not quotedepth or currquotedepth > quotedepth:
					quotedepth = currquotedepth
					ret.append(TAGareaQuoteOpen[T])
				
				if doctype != 'html'and doctype != 'sgml':
					line = re_quote.sub('', line)
				
				# closing quotes
				while currquotedepth < quotedepth:
					quotedepth = quotedepth-1
					ret.append(TAGareaQuoteClose[T])
		else:
			# closing quotes
			while quotedepth:
				quotedepth = quotedepth-1
				ret.append(TAGareaQuoteClose[T])
	
	
	# title
		#TODO set next blank and set f_lastblank or f_lasttitle
		if re_title.search(line) and not listident:
			m = re_title.search(line)
			tag = m.group('tag')
			level = len(tag)
			# level is 1 to 5 (limited by re_title), so this reads one of
			# TAGtitle1 .. TAGtitle5
			tag = eval('TAGtitle%s[T]'%level)
			
			txt = string.strip(m.group('txt'))
			# initpos = string.find(line,txt)
			if doctype == 'sgml':
				txt = re.sub(r'\[', r'&lsqb;', txt)
				txt = re.sub(r'\\', r'&bsol;', txt)
			
			if f_enumtitle:                       ### numbered title
				id = '' ; n = level               #
				titlecount[n] = titlecount[n] +1  # add count
				if n < len(titlecount)-1:         # reset sublevels count
					for i in range(n+1, len(titlecount)): titlecount[i] = 0
				for i in range(n):                # compose id from hierarchy
					id = "%s%d."%(id,titlecount[i+1])
				txt = "%s %s"%(id, txt)           # add id to title
			
			anchorid = '#toc%d'%(len(toclist)+1)
			if TAGanchor[T] and f_toc:
				ret.append(re_x.sub(anchorid,TAGanchor[T]))
			
			line = re_title.sub(tag,line)
			ret.append(re_x.sub(txt,line))
			
			# let's do some TOC!
			# each TOC entry is itself a source list item, indented by the
			# title level (a named link when the target has anchors)
			if TAGanchor[T]:
				# NOTE(review): tocitemid is assigned but not used
				# afterwards in this function
				tocitemid = '#toc%d'%(len(toclist)+1)
				tocitem = '%s- [%s %s]'%(' '*level,txt,anchorid)
			else:
				tocitem = '%s- %s'%(' '*level,txt)
				if doctype in ['txt', 'man']:
					tocitem = '%s%s' %('  '*level,txt)
			toclist.append(tocitem)
			
			# add "underline" to text titles
			if doctype == 'txt': ret.append(re_x.sub('='*len(txt),tag))
			
			continue
	
	#TODO!		
	#		labeltxt = ''
	#		label = m.group('label')
	#		if label: labeltxt = '<label id="%s">' %label
	
	
	# list
		if re_list.search(line) or re_deflist.search(line):
			if re_list.search(line): rgx = re_list
			else                   : rgx = re_deflist
			
			m = rgx.search(line)
			listitemident = m.group(1)
			listtype = m.group(2)
			extra = m.group(3)        # regex anchor char
			
			# for definition lists group 3 is the term, not an anchor char
			if listtype == '=':
				listdefterm = m.group(3)
				extra = ''
			
			# new sublist
			if not listident or len(listitemident) > len(listident[-1]):
				listident.append(listitemident)
				listids.append(listtype)
				if   listids[-1] == '-': tag = TAGlistOpen[T]
				elif listids[-1] == '+': tag = TAGnumlistOpen[T]
				elif listids[-1] == '=': tag = TAGdeflistOpen[T]
				if not tag: tag = TAGlistOpen[T] # default
				# no need to reopen <pre> tag on man sublists
				if doctype == 'man' and len(listident) != 1: tag = ''
				openlist = listident[-1]+tag
				if doctype == 'pm6': listholdspace = openlist
				else:
					if string.strip(openlist): ret.append(openlist)
				# reset item manual count
				listcount.append(0)
			
			# closing sublists
			while len(listitemident) < len(listident[-1]):
				if   listids[-1] == '-': tag = TAGlistClose[T]
				elif listids[-1] == '+': tag = TAGnumlistClose[T]
				elif listids[-1] == '=': tag = TAGdeflistClose[T]
				if not tag: tag = TAGlistClose[T] # default
				if tag: # man list is just a <pre> text, closed at mother-list
					if doctype != 'man': ret.append(listident[-1]+tag)
				del listident[-1]
				del listids[-1]
				if listcount: del listcount[-1]
			
			# normal item
			listid = listident[-1]
			if listids[-1] == '-':
				tag = TAGlistItem[T]
			elif listids[-1] == '+':
				tag = TAGnumlistItem[T]
				listcount[-1] = listcount[-1] +1
				# these targets carry the item number in the tag itself
				if doctype in ['txt', 'man', 'moin', 'mgp']:
					tag = re_x.sub(str(listcount[-1]), tag)
			elif listids[-1] == '=':
				if not TAGdeflistItem1[T]:
					# emulate def list, with <li><b>def</b>:
					tag = TAGlistItem[T] +TAGfontBoldOpen[T] +listdefterm
					tag = tag +TAGfontBoldClose[T] +':'
				else:
					tag = re_x.sub(listdefterm, TAGdeflistItem1[T])
				tag = tag + TAGdeflistItem2[T]  # open <DD>
			if doctype == 'mgp': listid = len(listident)*'\t'
			
			line = rgx.sub(listid+tag+extra,line)
			if listholdspace:
				line = listholdspace+line
				listholdspace = ''
			if doctype == 'sgml': line = re.sub(r'\[', r'&lsqb;', line)
	
	
	# table
	#TODO escape undesired format inside table
	#TODO not rstrip if table line (above)
	#TODO add man, pm6 targets
		if re_table.search(line): # only HTML for now
			
			closingbar = re.compile(r'\| *$')
			# the blank char that ends the table mark (space or TAB)
			tableid = line[re_table.search(line).end()-1]
			
			if not istable:  # table header
				if doctype in ['sgml', 'html', 'moin']:
					istableaware = 1
					if tableid == '\t': tableborder = 1
					if closingbar.search(line): tableborder = 1
					# add border=1
					ret.append(re_x.sub(`tableborder`, TAGtableOpen[T]))
				else:
					# other targets show the table as preformatted text
					istableaware = 0 ; ret.append(TAGareaPreOpen[T])
			
			istable = 1
			
			if istableaware:
				line = re.sub(r'^ *'  , '', line)    # del leading spaces
				line = closingbar.sub('', line)      # del last bar |
				
				# split the row in its mark and the cells text
				tablefmt, tablecel = re.split(r'\s', line, 1)
				tablefmt = tablefmt[1:]  # cut mark off
				tablecel = re.split(r'\t\|?| \|', tablecel)
				line = ''
				
				# setting cell and line tags
				tl1, tl2 = TAGtableLineOpen[T], TAGtableLineClose[T]
				tc1, tc2 = TAGtableCellOpen[T], TAGtableCellClose[T]
				if tablefmt and tablefmt[0] == '|': # title cell
					tc1, tc2 = TAGtableTitleCellOpen[T], TAGtableTitleCellClose[T]
				if doctype == 'html': tc2 = tc2+'\n' ; tl1 = tl1+'\n'
				
				if tablecel:
					while tablecel:
						cel = tablecel.pop(0)
						if not cel and doctype == 'html':
							cel = '&nbsp;'
						else:
							# user escaped (not delim!)
							cel = string.replace(cel,'\|', '|')
						if not tablecel and doctype == 'sgml':
							tc2 = '' # last cell
						line = '%s%s%s%s'%(line,tc1,string.strip(cel),tc2)
				line = '%s%s%s'%(tl1,line,tl2)
	
	
	### BEGIN of at-any-part-of-the-line/various-per-line TAGs.
	
	# date
		while re_date.search(line):
			m = re_date.search(line)
			fmt = m.group('fmt') or ''
			dateme = currdate
			if fmt: dateme = strftime(fmt,localtime(time()))
			line = re_date.sub(dateme,line,1)
	
	# bold
		if re_bold.search(line):
			txt = r'%s\1%s'%(TAGfontBoldOpen[T],TAGfontBoldClose[T])
			line = re_bold.sub(txt,line)
	
	# italic
		if re_italic.search(line):
			# \1 is the char matched before the opening //, put back as is
			txt = r'\1%s\2%s'%(TAGfontItalicOpen[T],TAGfontItalicClose[T])
			line = re_italic.sub(txt,line)
	
	# bolditalic
		if re_bolditalic.search(line):
			txt = r'%s\1%s'%(TAGfontBoldItalicOpen[T],TAGfontBoldItalicClose[T])
			line = re_bolditalic.sub(txt,line)
	
	# underline
		if re_underline.search(line):
			txt = r'%s\1%s'%(TAGfontUnderlineOpen[T],TAGfontUnderlineClose[T])
			line = re_underline.sub(txt,line)
	
	# image
		# first store blanks to detect image at ^
		try: leadingblanks = re.match(' +',line).end()
		except: leadingblanks = 0
		# moin and txt tags are the same as the mark
		while re_img.search(line) and doctype not in ['moin','txt']:
			m = re_img.search(line)
			txt = m.group(1)
			ini = m.start() ; head = leadingblanks 
			end = m.end()   ; tail = len(line)
			tag = TAGimg[T]
			
			if doctype == 'html': # do img align
				
				align = 'center'  # default align         # text + img + text
				if   ini == head and end == tail:
					tag = '<P ALIGN="center">%s</P>'%tag  # ^img$
				elif ini == head: align = 'left'          # ^img + text$
				elif end == tail: align = 'right'         # ^text + img$
				tag = re_x.sub(align, tag, 1)             # add align on tag
			
			line = re_img.sub(tag,line,1)
			line = re_x.sub(txt,line,1)
			
			if doctype == 'sgml': line = re.sub(r'\[', r'&lsqb;', line)
		line = '%s%s'%(' '*leadingblanks,line) # put blanks back
	
	# font PRE
		# put the protected mono texts back, one mask per stored text
		for mono in monobank:
			line = string.replace(line, monomask, "%s%s%s"%(
			       TAGfontMonoOpen[T],mono,TAGfontMonoClose[T]),1)
	
	# URL & email
		# put the protected links back, one mask per stored link
		for link in linkbank:
			linktype = 'url'; label = link[0]; url = link[1]
			if re.match(retxt_email, url): linktype = 'email'
			
			guessurl = ''                    # adding protocol to guessed link
			if linktype == 'url' and re.match(urlskel['guess'], url):
				if url[0] == 'w': guessurl = 'http://' +url
				else: guessurl = 'ftp://' +url
			
			if not label and not guessurl:   # simple link
				if f_maskemail and linktype == 'email':
					url = string.replace(url,'@',' (a) ')
					url = string.replace(url,'.',' ')
					url = doEscape("<%s>"%url)
					line = string.replace(line, linkmask, url, 1)
				else:
					# linktype is 'url' or 'email': reads TAGurl or TAGemail
					line = eval('string.replace(line,linkmask,TAG%s[T],1)'%linktype)
					line = re_x.sub(url,line)
			else:                            # named link!
				if not label: label = url
				if guessurl: url = guessurl
				# putting data on the right appearance order
				urlorder = [label, url]                 # label before link
				if doctype in ('html', 'sgml', 'moin'): # link before label
					urlorder = [url, label]
				
				# replace mask with tag
				# linktype is 'url' or 'email': reads TAGurlMark or TAGemailMark
				line = eval('string.replace(line,linkmask,TAG%sMark[T],1)'%linktype)
				for data in urlorder:        # fill \a from tag with data
					line = re_x.sub(data,line,1)
	
	# header
		# unless headers are off, line 1 is the title and, while no blank
		# line was found, lines 2 and 3 are the author and the date
		if not f_noheaders:
			if linenr == 1:
				title = line
				f_header = 1
				continue
			if f_header:
				if   linenr == 2: author = line ; continue
				elif linenr == 3: date   = line ; continue
				else:
					header = doHeader(title,author,date)
					f_header = 0
	
		# FINAL escapes. TODO function for it
		# convert all \ before <...> to tag
		if doctype == 'pm6': line = doEscapeEscape(line)
		elif doctype == 'man' : line = re.sub('-',r'\-',line)
		
		ret.append(holdspace+line)
		holdspace = ''
	
	if not f_noheaders: ret.extend(doFooter())
	return header,toclist,ret


################################################################################
################################################################################


# reading our source file
if infile == pipefileid:
	lines = sys.stdin.readlines()
else:
	f = open(infile, 'r')
	# close the file even if the read fails
	try: lines = f.readlines()
	finally: f.close()

# let's do it!
header,toc,doc = doitall(lines, doctype)

# deal with the TOC options
if f_toc or f_toconly:
	# format TOC lines
	#TODO list is piggy, try QUOTE+BR
	### here we do toc as a valid t2t marked text (list type)
	# headers are turned off so the TOC lines are not taken as
	# title/author/date and no footer is added to the TOC
	f_noheaders = 1
	x,y,toc = doitall(['']+toc+['',''], doctype)
	
	# TOC between bars (not for --toconly)
	if f_toc:
		para = TAGparagraph[T]
		tocbar = [para, re_x.sub('-'*72,TAGbar1[T]), para]
		toc = tocbar + toc + tocbar
	
	doc = toc + doc

outlist = header + doc
if f_toconly: outlist = toc[:]      # TOC only!

# writing output to screen or file
if outfile == pipefileid:
	for line in outlist: print(line)
else:
	f = open(outfile, 'w')
	# close (and so flush) the file even if the write fails
	try: f.writelines(addLineBreaks(outlist))
	finally: f.close()
	print('wrote %s'%(outfile))

if splitlevel:
	print("--- html...")
	# NOTE(review): this command line goes through the shell and is built
	# from the --lang and --split option values and the output file name
	# without any quoting. Make sure those values are validated, or build
	# the command in a way that does not involve the shell.
	os.system('sgml2html --language=%s --split=%s %s'%(lang,splitlevel,outfile))

sys.exit(0)

###  RESOURCES
# html: http://www.w3.org/TR/WD-html-lex
# man: man 7 man
# sgml: www.linuxdoc.org
# moin: http://twistedmatrix.com/users/jh.twistd/moin/moin.cgi/WikiSandBox
# moin: http://moin.sf.net
# pm6: <font$> turn all formatting to the style's default
# pm6: <#comments#> <font #comment# $>
#  pagemaker table
#  1 = 0,55
#  2 = 1,10
#  3 = 1,65
#  4 = 2,20
#
#        |__1_|    |    |    |    |    |
#        |_______2_|    |    |    |    |
#        |____________3_|    |    |    |