#!/bin/env python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
#
# xPml2XHtml.py
#
# This is a Python script. You need a Python interpreter to run it.
# For example, ActiveState Python, which exists for Windows.
#
# Based on Code, Input and Ideas from:
#    The Dark Reverser (original author)
#    Kevin Hendricks
#    Logan Kennelly
#    John Schember (Calibre project)
#    WayneD's (perl pml2html.pl)

# Changelog
#  0.02 - tried to greatly improve html conversion especially with \t tags
#  0.03 - more cleanup, a few fixes, and add in use of tidy to output xhtml
#  0.04 - incorporate more cleanups
#  0.05 - add check to fix block elements nested in inline elements which are not allowed
#  0.07 - handle clean up for remains left over from fixing nesting issues rampant in pml
#  0.08 - deal with inline style tags nesting issues in new way using a style tag list
#  0.09 - add in support for wrapping all text not in a block in
tags # 0.10 - treat links effectively as block elements for style markup # 0.11 - add in various paragraphs indentations to handle leading spaces that html would ignore or compress # 0.12 - add in support for handling xml based pml footnotes and sidebars - using pseudo pml tags # 0.14 - add in full header info parsing and remove need for bookinfo.txt # 0.15 - cleanup high chars better handled, optional use of tidy with command line switch # 0.16 - use proper and safe temporary file when passing things to tidy # 0.17 - add support for tidy.exe under windows # 0.18 - fix corner case of lines that start with \axxx or \Uxxxx tags # 0.19 - change to use auto flushed stdout, and use proper return values __version__='0.19' class Unbuffered: def __init__(self, stream): self.stream = stream def write(self, data): self.stream.write(data) self.stream.flush() def __getattr__(self, attr): return getattr(self.stream, attr) import sys sys.stdout=Unbuffered(sys.stdout) import struct, binascii, zlib, os, getopt, os.path, urllib, re, tempfile import logging from subprocess import Popen, PIPE, STDOUT logging.basicConfig() #logging.basicConfig(level=logging.DEBUG) class PmlConverter(object): def __init__(self, s): def cleanupHighChars(src): # special win1252 chars 0x80 - 0xa0 properly handled src = re.sub('[\x80-\xff]', lambda x: '\\a%03d' % ord(x.group()), src) src = re.sub('[^\x00-\xff]', lambda x: '\\U%04x' % ord(x.group()), src) return src def convertFootnoteXMLtoPseudoPML(src): # creates pseudo tag \Ft="id"footnote text\Ft p = re.compile(r'', '
\n'), # pseudo tag indicating a paragraph (imputed from pml file contents) #'x' : ('' % nb return_s += applyStyles(False) return_s += line return_s += applyStyles(True) return_s += '
\n' else: return_s += '\n' elif inParaNow(): # text is a continuation of a previously started paragraph return_s += line return_s += applyStyles(True) return_s += '\n' j = len(in_tags) del in_tags[j-1] else: if len(line) > 0: return_s += line + '
' % nb return_s += applyStyles(False) return_s += linefrag ppair = ('P', None) in_tags.append(ppair) else: return_s += linefrag return return_s while True: r = self.next() if not r: break text, cmd, attr = r if text: final += makeText(text) if cmd: # handle pseudo paragraph P tags # close if starting a new block element if cmd in self.html_block_tags or cmd == 'w': j = len(in_tags) if j > 0: if in_tags[j-1][0] == 'P': final += applyStyles(True) final += getTag(in_tags[j-1],True) del in_tags[j-1] if cmd in self.html_block_tags: pair = (cmd, attr) if cmd not in [a for (a,b) in in_tags]: # starting a new block tag final += getTag(pair, False) final += applyStyles(False) in_tags.append(pair) else: # process ending tag for a tag pair # ending tag should be for the most recently added start tag j = len(in_tags) if cmd == in_tags[j-1][0]: final += applyStyles(True) final += getTag(in_tags[j-1], True) del in_tags[j-1] else: # ow: things are not properly nested # process ending tag for block # ending tag **should** be for the most recently added block tag # but in too many cases it is not so we must fix this by # closing all open tags up to the current one and then # reopen all of the tags we had to close due to improper nesting of styles print 'Warning: Improperly Nested Block Tags: expected %s found %s' % (cmd, in_tags[j-1][0]) print 'after processing %s' % final[-40:] j = len(in_tags) while True: j = j - 1 final += applyStyles(True) final += getTag(in_tags[j], True) if in_tags[j][0] == cmd: break del in_tags[j] # now create new block start tags if they were previously open while j < len(st_tags): final += getTag(in_tags[j], False) final += applyStyles(False) j = j + 1 self.skipNewLine() elif cmd in self.html_link_tags: pair = (cmd, attr) if cmd not in [a for (a,b) in in_tags]: # starting a new link tag # first close out any still open styles if inBlock(): final += applyStyles(True) # output start tag and styles needed final += getTag(pair, False) final += 
applyStyles(False) in_tags.append(pair) else: # process ending tag for a tag pair # ending tag should be for the most recently added start tag j = len(in_tags) if cmd == in_tags[j-1][0]: j = len(in_tags) # apply closing styles and tag final += applyStyles(True) final += getTag(in_tags[j-1], True) # if needed reopen any style tags if inBlock(): final += applyStyles(False) del in_tags[j-1] else: # ow: things are not properly nested print 'Error: Improperly Nested Link Tags: expected %s found %s' % (cmd, in_tags[j-1][0]) print 'after processing %s' % final[-40:] elif cmd in self.html_style_tags: spair = (cmd, attr) if cmd not in [a for (a,b) in st_tags]: # starting a new style if inBlock() or inLink(): final += getSTag(spair,False) st_tags.append(spair) else: # process ending tag for style # ending tag **should** be for the most recently added style tag # but in too many cases it is not so we must fix this by # closing all open tags up to the current one and then # reopen all of the tags we had to close due to improper nesting of styles j = len(st_tags) while True: j = j - 1 if inBlock() or inLink(): final += getSTag(st_tags[j], True) if st_tags[j][0] == cmd: break del st_tags[j] # now create new style start tags if they were previously open while j < len(st_tags): if inBlock() or inLink(): final += getSTag(st_tags[j], False) j = j + 1 elif cmd in self.html_one_tags: final += self.html_one_tags[cmd] elif cmd == 'p': # create page breaks at the
level so # they can be easily used for safe html file segmentation breakpoints # first close any open tags j = len(in_tags) if j > 0: while True: j = j - 1 if in_tags[j][0] in self.html_block_tags: final += applyStyles(True) final += getTag(in_tags[j], True) if j == 0: break # insert the page break tag final += '\n\n' if sigil_breaks: if (len(final) - lastbreaksize) > 3000: final += '' final += applyStyles(False) final += self.pml_chars.get(attr, '%d;' % attr) ppair = ('P', None) in_tags.append(ppair) else: final += self.pml_chars.get(attr, '%d;' % attr) elif cmd == 'U': if not inBlock() and not inLink() and not inComment(): final += '
' final += applyStyles(False) final += '%d;' % attr ppair = ('P', None) in_tags.append(ppair) else: final += makeText('%d;' % attr) elif cmd == 'w': # hr width and align parameters are not allowed in strict xhtml but style widths are possible final += '\n
' % attr final += applyStyles(False) ppair = ('P', None) in_tags.append(ppair) else: logging.warning("Unknown tag: %s-%s", cmd, attr) # handle file ending condition for imputed P tags j = len(in_tags) if (j > 0): if in_tags[j-1][0] == 'P': final += '
def tidy(rawhtmlfile):
    """Run an html file through the command line ``tidy`` tool.

    The file is handed to tidy on its stdin and the converted document is
    collected from tidy's stdout through a pipe.

    Args:
        rawhtmlfile: path of the html file to clean up.

    Returns:
        Everything tidy wrote to its stdout.

    Raises:
        IOError: if ``rawhtmlfile`` cannot be opened.
        OSError: if the tidy executable cannot be started.
    """
    # -w 120              wrap text at column 120
    # -i                  indent to show the level of nesting
    # -q                  quiet
    # -asxhtml            convert to xhtml
    # -win1252            sets the input encoding of pml files
    # --enclose-text yes  put text that is not inside a block element into
    #                     its own block to meet the xhtml spec
    # --doctype strict    force strict dtd checking
    executable = 'tidy.exe' if sys.platform.startswith('win') else 'tidy'
    cmd = [executable, '-w', '120', '-i', '-q', '-asxhtml', '-win1252',
           '--enclose-text', 'yes', '--doctype', 'strict']
    # An argument list (no shell) makes a missing tidy raise OSError instead
    # of silently yielding empty output; the with-block closes the input.
    with open(rawhtmlfile, 'rb') as rawfobj:
        p2 = Popen(cmd, stdin=rawfobj, stdout=PIPE, stderr=PIPE, close_fds=False)
        # stderr carries tidy's conversion warnings and errors; it is
        # captured so that it does not clutter the console.
        stdout, _ = p2.communicate()
    return stdout


def usage():
    """Print the command line help text to stdout."""
    print("Converts PML file to XHTML")
    print("Usage:")
    print(" xpml2xhtml [options] infile.pml outfile.html ")
    print(" ")
    print("Options: ")
    print(" -h prints this message")
    print(" --sigil-breaks insert Sigil chapter breaks")
    print(" --use-tidy use tidy to further clean up the html ")
    print(" ")


def main(argv=None):
    """Command line entry point: convert a PML file to (x)html.

    Args:
        argv: full argument vector including the program name at index 0.
            Defaults to ``sys.argv`` when omitted.

    Returns:
        0 on success or when help was requested with ``-h``; 1 on a usage
        error, a ``ValueError`` during conversion, or an I/O / OS error.

    Side effects:
        Sets the module globals ``bookname``, ``footnote_ids``,
        ``sidebar_ids`` and ``sigil_breaks`` (presumably consumed by
        PmlConverter - verify against the class), writes the output file
        and prints progress messages.
    """
    global bookname
    global footnote_ids
    global sidebar_ids
    global sigil_breaks
    import time

    if argv is None:
        argv = sys.argv
    try:
        opts, args = getopt.getopt(argv[1:], "h", ["sigil-breaks", "use-tidy"])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        return 1

    # Options are examined before the positional arguments are counted so
    # that a lone "-h" is reported as success rather than as a usage error.
    sigil_breaks = False
    use_tidy = False
    for opt, _ in opts:
        if opt == "-h":
            usage()
            return 0
        elif opt == "--sigil-breaks":
            sigil_breaks = True
        elif opt == "--use-tidy":
            use_tidy = True

    if len(args) != 2:
        usage()
        return 1

    infile, outfile = args
    bookname = os.path.splitext(os.path.basename(infile))[0]
    footnote_ids = {}
    sidebar_ids = {}

    try:
        print("Processing...")
        start_time = time.time()
        print(" Converting pml to raw html")
        with open(infile, 'rb') as pml_file:
            pml_string = pml_file.read()
        pml = PmlConverter(pml_string)
        html_src = pml.process()
        if use_tidy:
            print(" Tidying html to xhtml")
            fobj = tempfile.NamedTemporaryFile(mode='w+b', suffix=".html", delete=False)
            tempname = fobj.name
            try:
                try:
                    fobj.write(html_src)
                finally:
                    fobj.close()
                html_src = tidy(tempname)
            finally:
                # never leave the temporary file behind, even if tidy fails
                os.remove(tempname)
        with open(outfile, 'wb') as html_file:
            html_file.write(html_src)
        end_time = time.time()
        convert_time = end_time - start_time
        print('elapsed time: %.2f seconds' % (convert_time, ))
        print('output is in file %s' % outfile)
        print("Finished Processing")
    except ValueError as e:
        print("Error: %s" % e)
        return 1
    except (IOError, OSError) as e:
        # unreadable input, unwritable output or tidy could not be started
        print("Error: %s" % e)
        return 1

    return 0


if __name__ == "__main__":
    #import cProfile
    #command = """sys.exit(main())"""
    #cProfile.runctx( command, globals(), locals(), filename="cprofile.profile" )
    sys.exit(main())