From 24f001c61e0e7b313d0ac461315e07a4c2e62e57 Mon Sep 17 00:00:00 2001 From: some_updates Date: Sun, 24 Jan 2010 12:19:20 +0000 Subject: [PATCH] topazscripts 1.8 --- Topaz_Tools/lib/changes.txt | 20 ++- Topaz_Tools/lib/cmbtc_dump.py | 1 + Topaz_Tools/lib/cmbtc_dump_nonK4PC.py | 1 + Topaz_Tools/lib/convert2xml.py | 1 + Topaz_Tools/lib/decode_meta.py | 1 + Topaz_Tools/lib/flatxml2html.py | 204 ++++++++++++++++++++++---- Topaz_Tools/lib/genhtml.py | 10 +- Topaz_Tools/lib/gensvg.py | 83 +++++++++-- Topaz_Tools/lib/genxml.py | 1 + Topaz_Tools/lib/getpagedim.py | 1 + Topaz_Tools/lib/readme.txt | 80 ++++++---- Topaz_Tools/lib/stylexml2css.py | 1 + 12 files changed, 332 insertions(+), 72 deletions(-) diff --git a/Topaz_Tools/lib/changes.txt b/Topaz_Tools/lib/changes.txt index 8d6b4f0..83910c0 100644 --- a/Topaz_Tools/lib/changes.txt +++ b/Topaz_Tools/lib/changes.txt @@ -1,3 +1,22 @@ +Changes in version 1.8 + + - gensvg.py now builds wonderful xhtml pages with embedded svg + that can be easily paged through as if reading a book! + (tested in Safari for Mac and Win and Firefox) + (requires javascript to be enabled) + + - genhtml.py now REQUIRES that gensvg.py be run FIRST + this allows creation of images on the fly from glyphs + + - genhtml.py now automatically makes tables of words into svg + based images and will handle glyph based ornate first + letters of words + + - cmbtc_dump_mac_linux.py has been renamed to be + cmbtc_dump_nonK4PC.py to make it clearer + when it needs to be used + + Changes in version 1.7 - gensvg.py has been improved so that the glyphs render exactly (ClarkNova) - gensvg.py has fixed a render order "bug" that allowed some images to cover or hide text. 
(ClarkNova) @@ -5,7 +24,6 @@ Changes in version 1.7 - add missing tag - make xhtml compliant doctype and minor changes to write correct xhtml - make divs that act as anchors be hidden visually and to take up 0 height and 0 width to prevent any impact on layout - - added support for new version of the <_span> tag called <span> Changes in version 1.6 - support for books whose paragraphs have no styles diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py index 9cd32de..ac7e33c 100644 --- a/Topaz_Tools/lib/cmbtc_dump.py +++ b/Topaz_Tools/lib/cmbtc_dump.py @@ -1,4 +1,5 @@ #! /usr/bin/python +# For use in Topaz Scripts version 1.8 """ diff --git a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py index 8a03a3a..ed7ff87 100644 --- a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py +++ b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py @@ -1,4 +1,5 @@ #! /usr/bin/python +# For use with Topaz Scripts Version 1.8 from __future__ import with_statement diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py index 06bd89f..12ca934 100644 --- a/Topaz_Tools/lib/convert2xml.py +++ b/Topaz_Tools/lib/convert2xml.py @@ -1,5 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# For use with Topaz Scripts Version 1.8 from __future__ import with_statement import csv diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py index f038310..ba831ec 100644 --- a/Topaz_Tools/lib/decode_meta.py +++ b/Topaz_Tools/lib/decode_meta.py @@ -1,5 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# For use with Topaz Scripts Version 1.8 from __future__ import with_statement import csv diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py index 0b28468..4182502 100644 --- a/Topaz_Tools/lib/flatxml2html.py +++ b/Topaz_Tools/lib/flatxml2html.py @@ -1,21 +1,27 @@ #! 
/usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# For use with Topaz Scripts Version 1.8 from __future__ import with_statement import csv import sys import os +import math import getopt from struct import pack from struct import unpack class DocParser(object): - def __init__(self, flatxml, classlst, fileid): + def __init__(self, flatxml, classlst, fileid, bookDir): self.id = os.path.basename(fileid).replace('.dat','') + self.svgcount = 0 self.docList = flatxml.split('\n') self.docSize = len(self.docList) self.classList = {} + self.bookDir = bookDir + self.glyphPaths = { } + self.numPaths = 0 tmpList = classlst.split('\n') for pclass in tmpList: if pclass != '': @@ -30,6 +36,107 @@ class DocParser(object): self.paracont_stemid = [] self.parastems_stemid = [] + + def getGlyph(self, gid): + result = '' + id='gl%d' % gid + return self.glyphPaths[id] + + + def glyphs_to_image(self, glyphList): + + def extract(path, key): + b = path.find(key) + len(key) + e = path.find(' ',b) + return int(path[b:e]) + + def extractID(path, key): + b = path.find(key) + len(key) + e = path.find('"',b) + return path[b:e] + + + svgDir = os.path.join(self.bookDir,'svg') + glyfile = os.path.join(svgDir,'glyphs.svg') + + imgDir = os.path.join(self.bookDir,'img') + imgname = self.id + '_%04d.svg' % self.svgcount + imgfile = os.path.join(imgDir,imgname) + + # build hash table of glyph paths keyed by glyph id + if self.numPaths == 0: + gfile = open(glyfile, 'r') + while True: + path = gfile.readline() + if (path == ''): break + glyphid = extractID(path,'id="') + self.glyphPaths[glyphid] = path + self.numPaths += 1 + gfile.close() + + + # get glyph information + gxList = self.getData('info.glyph.x',0,-1) + gyList = self.getData('info.glyph.y',0,-1) + gidList = self.getData('info.glyph.glyphID',0,-1) + + gids = [] + maxws = [] + maxhs = [] + xs = [] + ys = [] + gdefs = [] + + # get path defintions, positions, dimensions for ecah glyph + # that makes up the image, and find min x and 
min y to reposition origin + minx = -1 + miny = -1 + for j in glyphList: + gid = gidList[j] + gids.append(gid) + + xs.append(gxList[j]) + if minx == -1: minx = gxList[j] + else : minx = min(minx, gxList[j]) + + ys.append(gyList[j]) + if miny == -1: miny = gyList[j] + else : miny = min(miny, gyList[j]) + + path = self.getGlyph(gid) + gdefs.append(path) + + maxws.append(extract(path,'width=')) + maxhs.append(extract(path,'height=')) + + + # change the origin to minx, miny and calc max height and width + maxw = maxws[0] + xs[0] - minx + maxh = maxhs[0] + ys[0] - miny + for j in xrange(0, len(xs)): + xs[j] = xs[j] - minx + ys[j] = ys[j] - miny + maxw = max( maxw, (maxws[j] + xs[j]) ) + maxh = max( maxh, (maxhs[j] + ys[j]) ) + + # open the image file for output + ifile = open(imgfile,'w') + ifile.write('<?xml version="1.0" standalone="no"?>\n') + ifile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n') + ifile.write('<svg width="%dpx" height="%dpx" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (math.floor(maxw/10), math.floor(maxh/10), maxw, maxh)) + ifile.write('<defs>\n') + for j in xrange(0,len(gdefs)): + ifile.write(gdefs[j]) + ifile.write('</defs>\n') + for j in xrange(0,len(gids)): + ifile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (gids[j], xs[j], ys[j])) + ifile.write('</svg>') + ifile.close() + + return 0 + + + # return tag at line pos in document def lineinDoc(self, pos) : if (pos >= 0) and (pos < self.docSize) : @@ -77,6 +184,17 @@ class DocParser(object): return startpos + # returns a vector of integers for the tagpath + def getData(self, tagpath, pos, end): + argres=[] + (foundat, argt) = self.findinDoc(tagpath, pos, end) + if (argt != None) and (len(argt) > 0) : + argList = argt.split('|') + argres = [ int(strval) for strval in argList] + return argres + + + # build a description of the paragraph def 
getParaDescription(self, start, end): @@ -120,6 +238,7 @@ class DocParser(object): # this type of paragrph may be made up of multiple _spans, inline # word monograms (images) and words with semantic meaning # and now a new type "span" versus the old "_span" + # plus glyphs used to form starting letter of first word # need to parse this type line by line line = start + 1 @@ -143,6 +262,21 @@ class DocParser(object): result.append(('ocr', wordnum)) line += 1 + elif name.endswith('word.firstGlyph') : + first = int(argres) + (name, argres) = self.lineinDoc(line+1) + if not name.endswith('word.lastGlyph'): + print 'Error: - incorrect glyph ordering inside word in paragraph' + last = int(argres) + glyphList = [] + for glyphnum in xrange(first, last): + glyphList.append(glyphnum) + num = self.svgcount + self.glyphs_to_image(glyphList) + self.svgcount += 1 + result.append(('svg', num)) + line += 1 + elif name.endswith('word.class'): (cname, space) = argres.split('-',1) if space == '' : space = '0' @@ -241,6 +375,11 @@ class DocParser(object): parares += '<img src="img/img%04d.jpg" alt="" />' % num parares += sep + elif wtype == 'svg' : + sep = '' + parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num + parares += sep + if len(sep) > 0 : parares = parares[0:-1] if (type == 'full') or (type == 'end') : parares += '</p>' @@ -260,10 +399,7 @@ class DocParser(object): if argres : self.ocrtext = argres.split('|') # get information to dehyphenate the text - (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1) - if argres: - argList = argres.split('|') - self.dehyphen_rootid = [ int(strval) for strval in argList] + self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1) # determine if first paragraph is continued from previous page (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1) @@ -274,16 +410,10 @@ class DocParser(object): last_para_continued = (self.paracont_stemid != None) # collect link ids - (pos, argres) = 
self.findinDoc('info.word.link_id',0,-1) - if argres: - argList = argres.split('|') - self.link_id = [ int(strval) for strval in argList] + self.link_id = self.getData('info.word.link_id',0,-1) # collect link destination page numbers - (pos, argres) = self.findinDoc('info.links.page',0,-1) - if argres : - argList = argres.split('|') - self.link_page = [ int(strval) for strval in argList] + self.link_page = self.getData('info.links.page',0,-1) # collect link titles (pos, argres) = self.findinDoc('info.links.title',0,-1) @@ -382,23 +512,45 @@ class DocParser(object): elif (regtype == 'table') : - ptype = 'full' - if first_para_continued : - ptype = 'end' - first_para_continued = False - (pclass, pdesc) = self.getParaDescription(start,end) - htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) - print "Warnings - Table Conversions are notoriously poor" - print "Strongly recommend taking a screen capture image of the " - print "table in %s.svg and using it to replace this attempt at a table" % self.id - + # translate first and last word into first and last glyphs + # and generate table as an image and include a link to it + glyphList = [] + (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) + (pos, slast) = self.findinDoc('paragraph.lastWord',start,end) + firstglyphList = self.getData('word.firstGlyph',0,-1) + gidList = self.getData('info.glyph.glyphID',0,-1) + if (sfirst != None) and (slast != None) : + first = int(sfirst) + last = int(slast) + firstGlyph = firstglyphList[first] + if last < len(firstglyphList): + lastGlyph = firstglyphList[last] + else : + lastGlyph = len(gidList) + for glyphnum in xrange(firstGlyph, lastGlyph): + glyphList.append(glyphnum) + num = self.svgcount + self.glyphs_to_image(glyphList) + self.svgcount += 1 + htmlpage += '<div class="graphic"><img src="img/' + self.id + '_%04d.svg" alt="" /></div>' % num + else : + ptype = 'full' + if first_para_continued : + ptype = 'end' + first_para_continued = False + (pclass, 
pdesc) = self.getParaDescription(start,end) + htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) + print " " + print "Warning: - Table Conversions are notoriously poor" + print " Strongly recommend taking a screen capture image of the " + print " table in %s.svg and using it to replace this attempt at a table" % self.id + print " " elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'): (pos, simgsrc) = self.findinDoc('img.src',start,end) if simgsrc: htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc) - else : print 'Warning: region type', regtype (pos, temp) = self.findinDoc('paragraph',start,end) @@ -437,10 +589,10 @@ class DocParser(object): -def convert2HTML(flatxml, classlst, fileid): +def convert2HTML(flatxml, classlst, fileid, bookDir): # create a document parser - dp = DocParser(flatxml, classlst, fileid) + dp = DocParser(flatxml, classlst, fileid, bookDir) htmlpage = dp.process() diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py index 3333f82..58d9e9a 100644 --- a/Topaz_Tools/lib/genhtml.py +++ b/Topaz_Tools/lib/genhtml.py @@ -1,5 +1,6 @@ #! 
/usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# For use with Topaz Scripts Version 1.8 import os, sys, getopt @@ -65,6 +66,12 @@ def main(argv): print "Can not find image directory in unencrypted book" sys.exit(-1) + svgDir = os.path.join(bookDir,'svg') + if not os.path.exists(svgDir) : + print "Can not find svg directory in unencrypted book" + print "please run gensvg.py before running genhtml.py" + sys.exit(-1) + otherFile = os.path.join(bookDir,'other0000.dat') if not os.path.exists(otherFile) : print "Can not find other0000.dat in unencrypted book" @@ -75,7 +82,6 @@ def main(argv): print "Can not find metadata0000.dat in unencrypted book" sys.exit(-1) - htmlFileName = "book.html" htmlstr = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n' htmlstr += '<html>\n' @@ -133,7 +139,7 @@ def main(argv): print ' ', filename fname = os.path.join(pageDir,filename) flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) - htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname) + htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir) htmlstr += '</body>\n</html>\n' diff --git a/Topaz_Tools/lib/gensvg.py b/Topaz_Tools/lib/gensvg.py index 5db6456..fce15b2 100644 --- a/Topaz_Tools/lib/gensvg.py +++ b/Topaz_Tools/lib/gensvg.py @@ -1,11 +1,11 @@ #! 
/usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# For use with Topaz Scripts Version 1.8 import os, sys, getopt # local routines import convert2xml -import flatxml2html import decode_meta @@ -45,6 +45,13 @@ class GParser(object): argres[j] = int(argres[j]) return result + + def getGlyphDim(self, gly): + maxh = (self.gh[gly] * self.dpi) / self.gdpi[gly] + maxw = (self.gw[gly] * self.dpi) / self.gdpi[gly] + return maxh, maxw + + def getPath(self, gly): path = '' if (gly < 0) or (gly >= self.count): @@ -172,8 +179,10 @@ class PParser(object): def usage(): print 'Usage: ' print ' ' - print ' gensvg.py unencryptedBookDir' + print ' gensvg.py [options] unencryptedBookDir' print ' ' + print ' -x : output browseable XHTML+SVG pages (default)' + print ' -r : output raw SVG images' def main(argv): @@ -185,7 +194,7 @@ def main(argv): argv = argv.split() try: - opts, args = getopt.getopt(argv[1:], "h:") + opts, args = getopt.getopt(argv[1:], "xrh") except getopt.GetoptError, err: print str(err) @@ -196,10 +205,15 @@ def main(argv): usage() sys.exit(2) + raw = 0 for o, a in opts: if o =="-h": usage() sys.exit(0) + if o =="-x": + raw = 0 + if o =="-r": + raw = 1 bookDir = args[0] @@ -264,7 +278,9 @@ def main(argv): gp = GParser(flat_xml) for i in xrange(0, gp.count): path = gp.getPath(i) - glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path)) + maxh, maxw = gp.getGlyphDim(i) + # glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path)) + glyfile.write('<path id="gl%d" d="%s" fill="black" /><!-- width=%d height=%d -->\n' % (counter * 256 + i, path, maxw, maxh )) counter += 1 glyfile.write('</defs>\n') glyfile.write('</svg>\n') @@ -274,7 +290,7 @@ def main(argv): # Books are at 1440 DPI. This is rendering at twice that size for # readability when rendering to the screen. 
- scaledpi = 720 + scaledpi = 1440 filenames = os.listdir(pageDir) filenames = sorted(filenames) counter = 0 @@ -283,11 +299,45 @@ def main(argv): fname = os.path.join(pageDir,filename) flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) pp = PParser(flat_xml) - pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w') + if (raw) : + pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w') + else : + pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w') + pfile.write('<?xml version="1.0" standalone="no"?>\n') - pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n') - pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)) - pfile.write('<title>Page %d - %s by %s\n' % (counter, metadata['Title'],metadata['Authors'])) + if (raw): + pfile.write('\n') + pfile.write('\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)) + pfile.write('Page %d - %s by %s\n' % (counter, metadata['Title'],metadata['Authors'])) + else: + pfile.write('\n'); + pfile.write('\n'); + pfile.write('Page %d - %s by %s\n' % (counter, metadata['Title'],metadata['Authors'])) + pfile.write('\n') + pfile.write('\n') + pfile.write('\n') + pfile.write('
\n') + if (counter == 0) : + pfile.write('\n') + else: + pfile.write('\n') + pfile.write('' % (pp.pw, pp.ph)) + if (pp.gid != None): pfile.write('\n') gdefs = pp.getGlyphs(glyfname) @@ -303,7 +353,18 @@ def main(argv): pfile.write('\n' % (pp.gid[j], pp.gx[j], pp.gy[j])) if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0): pfile.write('This page intentionally left blank.\nUntil this notice unintentionally gave it content. (gensvg.py)\n'); - pfile.write('') + if (raw) : + pfile.write('') + else : + pfile.write('\n') + if (counter == len(filenames) - 1) : + pfile.write('\n') + else : + pfile.write('\n') + pfile.write('
\n') + pfile.write('
zoom in - zoom out
\n') + pfile.write('\n') + pfile.write('\n') pfile.close() counter += 1 @@ -312,4 +373,4 @@ def main(argv): return 0 if __name__ == '__main__': - sys.exit(main('')) \ No newline at end of file + sys.exit(main('')) diff --git a/Topaz_Tools/lib/genxml.py b/Topaz_Tools/lib/genxml.py index c335e88..cfc5325 100644 --- a/Topaz_Tools/lib/genxml.py +++ b/Topaz_Tools/lib/genxml.py @@ -1,5 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# For use with Topaz Scripts Version 1.8 import os, sys, getopt diff --git a/Topaz_Tools/lib/getpagedim.py b/Topaz_Tools/lib/getpagedim.py index dd1071c..33c722a 100644 --- a/Topaz_Tools/lib/getpagedim.py +++ b/Topaz_Tools/lib/getpagedim.py @@ -1,5 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# For use with Topaz Scripts Version 1.8 from __future__ import with_statement import csv diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt index 5d4eadb..eca30f0 100644 --- a/Topaz_Tools/lib/readme.txt +++ b/Topaz_Tools/lib/readme.txt @@ -19,25 +19,16 @@ Here are the steps: 1. Unzip the topazscripts.zip file to get the full set of python scripts. 
The files you should have after unzipping are: -cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files -decode_meta.py - converts metadata0000.dat to human readable text (for the most part) +cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files for Kindle for PC +cmbtc_dump_nonK4PC.py - (author - DiapDealer) for use with standalone Kindle and ipod/iphone topaz books +decode_meta.py - converts metadata0000.dat to make it available convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions flatxml2html.py - converts a "flattened" xml description to html using the ocrtext stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can) getpagedim.py - reads page0000.dat to get the book height and width parameters genxml.py - main program to convert everything to xml genhtml.py - main program to generate "book.html" -gensvg.py - (author: clarknova) main program to create an svg grpahic of each page - - -In addition there is now a new file: - -cmbtc_dump_mac_linux.py - -If you know the pid of your ipod and/or your standalone Kindle and your book -was meant for that device, you can use this program to dump the proper sections -on Mac OSX and Linux (and even Windows if you do not have Kindle4PC installed). -Thank DiapDealer for creating it! 
+gensvg.py - (author: clarknova) main program to create an xhtml page with embedded svg graphics Please note, gensvg.py, genhtml.py, and genxml.py import and use @@ -52,8 +43,20 @@ of its contents as files All Thanks go to CMBTC who broke the DRM for Topaz - without it nothing else would be possible +If you purchased the book for Kindle For PC, you must do the following: + cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE + +However, if you purchased the book for a standalone Kindle or ipod/iphone +and you know your pid (at least the first 8 characters) then you should +instead do the following + + cmbtc_dump_nonK4PC.py -d -o TARGETDIR -p 12345678 YOURTOPAZBOOKNAMEHERE + +where 12345678 should be replaced by the first 8 characters of your PID + + This should create a directory called "TARGETDIR" in your current directory. It should have the following files in it: @@ -64,35 +67,48 @@ page - directory filled with page*.dat files glyphs - directory filled with glyphs*.dat files +3. REQUIRED: Create xhtml page descriptions with embedded svg +that show the exact representation of each page as an image +with proper glyphs and positioning. -3. Convert the files in "TARGETDIR" to their xml descriptions -which can be found in TARGETDIR/xml/ upon completion. +This step must NOW be done BEFORE attempting conversion to html - genxml.py TARGETDIR + gensvg.py TARGETDIR + +When complete, use a web-browser to open the page*.xhtml files +in TARGETDIR/svg/ to see what the book really looks like. + +All thanks go to CLARKNOVA for this program. This program is +needed to actually see the true image of each page and so that +the next step can properly create images from glyphs for +monograms, dropcaps and tables. - -4. Create book.html which can be found in "TARGETDIR" after -completion. 
This html conversion can not fully capture -all of the layouts actually used in the book and needs to -be edited to include special font handling such as bold -or italics that can not be determined from the ocrText -information or the style information. If you want to -see things exactly as they were, see step 5 below. +4. Create "book.html" which can be found in "TARGETDIR" after +completion. genhtml.py TARGETDIR +***IMPORTANT NOTE*** This html conversion can not fully capture +all of the layouts and styles actually used in the book +and the resulting html will need to be edited by hand to +properly set bold and/or italics, handle font size changes, +and to fix the sometimes horrific mistakes in the ocrText +used to create the html. -5. Create an svg description of each page which can -be found in TARGETDIR/svg/ upon completion. +FYI: Sigil is a wonderful, free cross- +platform program that can be used to edit the html and +create an epub if you so desire. -All thanks go to CLARKNOVA for this program. This program is -needed to actually see the true image of each page so that hand -editing of the html created by step 4 can be done. -Or use the resulting svg files to read each page of the book -exactly as it has been laid out originally. +5. Optional Step: Convert the files in "TARGETDIR" to their +xml descriptions which can be found in TARGETDIR/xml/ +upon completion. - gensvg.py TARGETDIR + genxml.py TARGETDIR + + +These conversions are important for allowing future (and better) +conversions to come later. diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py index 0d2739b..791d067 100644 --- a/Topaz_Tools/lib/stylexml2css.py +++ b/Topaz_Tools/lib/stylexml2css.py @@ -1,5 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# For use with Topaz Scripts Version 1.8 from __future__ import with_statement import csv