topazscripts 1.8
parent c93f8e1edd
commit 24f001c61e

@@ -1,3 +1,22 @@
+Changes in version 1.8
+
+- gensvg.py now builds wonderful xhtml pages with embedded svg
+  that can be easily paged through as if reading a book!
+  (tested in Safari for Mac and Win and Firefox)
+  (requires javascript to be enabled)
+
+- genhtml.py now REQUIRES that gensvg.py be run FIRST
+  this allows creation of images on the fly from glyphs
+
+- genhtml.py now automatically makes tables of words into svg
+  based images and will handle glyph based ornate first
+  letters of words
+
+- cmbtc_dump_mac_linux.py has been renamed to be
+  cmbtc_dump_nonK4PC.py to make it clearer
+  when it needs to be used
+
+
 Changes in version 1.7
 - gensvg.py has been improved so that the glyphs render exactly (ClarkNova)
 - gensvg.py has fixed a render order "bug" that allowed some images to cover or hide text. (ClarkNova)

@@ -5,7 +24,6 @@ Changes in version 1.7
 - add missing <title> tag
 - make xhtml compliant doctype and minor changes to write correct xhtml
 - make divs that act as anchors be hidden visually and to take up 0 height and 0 width to prevent any impact on layout
-- added support for new version of the <_span> tag called <span>

 Changes in version 1.6
 - support for books whose paragraphs have no styles
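
A minimal sketch of the resulting 1.8 workflow, assuming TARGETDIR is the directory produced by the dump step (the README changes further below give the full details):

cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE    (use cmbtc_dump_nonK4PC.py for standalone Kindle / ipod books)
gensvg.py TARGETDIR     (must run first: writes TARGETDIR/svg/ including glyphs.svg and the page*.xhtml views)
genhtml.py TARGETDIR    (reads TARGETDIR/svg/ to build glyph-based images for monograms and tables, then writes book.html)
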
@@ -1,4 +1,5 @@
 #! /usr/bin/python
+# For use in Topaz Scripts version 1.8

 """
@@ -1,4 +1,5 @@
 #! /usr/bin/python
+# For use with Topaz Scripts Version 1.8

 from __future__ import with_statement
@@ -1,5 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8

 from __future__ import with_statement
 import csv
@@ -1,5 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8

 from __future__ import with_statement
 import csv
@@ -1,21 +1,27 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8

 from __future__ import with_statement
 import csv
 import sys
 import os
+import math
 import getopt
 from struct import pack
 from struct import unpack


 class DocParser(object):
-    def __init__(self, flatxml, classlst, fileid):
+    def __init__(self, flatxml, classlst, fileid, bookDir):
         self.id = os.path.basename(fileid).replace('.dat','')
+        self.svgcount = 0
         self.docList = flatxml.split('\n')
         self.docSize = len(self.docList)
         self.classList = {}
+        self.bookDir = bookDir
+        self.glyphPaths = { }
+        self.numPaths = 0
         tmpList = classlst.split('\n')
         for pclass in tmpList:
             if pclass != '':

@@ -30,6 +36,107 @@ class DocParser(object):
         self.paracont_stemid = []
         self.parastems_stemid = []


+    def getGlyph(self, gid):
+        result = ''
+        id='gl%d' % gid
+        return self.glyphPaths[id]
+
+
+    def glyphs_to_image(self, glyphList):
+
+        def extract(path, key):
+            b = path.find(key) + len(key)
+            e = path.find(' ',b)
+            return int(path[b:e])
+
+        def extractID(path, key):
+            b = path.find(key) + len(key)
+            e = path.find('"',b)
+            return path[b:e]
+
+
+        svgDir = os.path.join(self.bookDir,'svg')
+        glyfile = os.path.join(svgDir,'glyphs.svg')
+
+        imgDir = os.path.join(self.bookDir,'img')
+        imgname = self.id + '_%04d.svg' % self.svgcount
+        imgfile = os.path.join(imgDir,imgname)
+
+        # build hash table of glyph paths keyed by glyph id
+        if self.numPaths == 0:
+            gfile = open(glyfile, 'r')
+            while True:
+                path = gfile.readline()
+                if (path == ''): break
+                glyphid = extractID(path,'id="')
+                self.glyphPaths[glyphid] = path
+                self.numPaths += 1
+            gfile.close()
+
+        # get glyph information
+        gxList = self.getData('info.glyph.x',0,-1)
+        gyList = self.getData('info.glyph.y',0,-1)
+        gidList = self.getData('info.glyph.glyphID',0,-1)
+
+        gids = []
+        maxws = []
+        maxhs = []
+        xs = []
+        ys = []
+        gdefs = []
+
+        # get path definitions, positions, dimensions for each glyph
+        # that makes up the image, and find min x and min y to reposition origin
+        minx = -1
+        miny = -1
+        for j in glyphList:
+            gid = gidList[j]
+            gids.append(gid)
+
+            xs.append(gxList[j])
+            if minx == -1: minx = gxList[j]
+            else : minx = min(minx, gxList[j])
+
+            ys.append(gyList[j])
+            if miny == -1: miny = gyList[j]
+            else : miny = min(miny, gyList[j])
+
+            path = self.getGlyph(gid)
+            gdefs.append(path)
+
+            maxws.append(extract(path,'width='))
+            maxhs.append(extract(path,'height='))
+
+
+        # change the origin to minx, miny and calc max height and width
+        maxw = maxws[0] + xs[0] - minx
+        maxh = maxhs[0] + ys[0] - miny
+        for j in xrange(0, len(xs)):
+            xs[j] = xs[j] - minx
+            ys[j] = ys[j] - miny
+            maxw = max( maxw, (maxws[j] + xs[j]) )
+            maxh = max( maxh, (maxhs[j] + ys[j]) )

+        # open the image file for output
+        ifile = open(imgfile,'w')
+        ifile.write('<?xml version="1.0" standalone="no"?>\n')
+        ifile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
+        ifile.write('<svg width="%dpx" height="%dpx" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (math.floor(maxw/10), math.floor(maxh/10), maxw, maxh))
+        ifile.write('<defs>\n')
+        for j in xrange(0,len(gdefs)):
+            ifile.write(gdefs[j])
+        ifile.write('</defs>\n')
+        for j in xrange(0,len(gids)):
+            ifile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (gids[j], xs[j], ys[j]))
+        ifile.write('</svg>')
+        ifile.close()
+
+        return 0
+
+
+
     # return tag at line pos in document
     def lineinDoc(self, pos) :
         if (pos >= 0) and (pos < self.docSize) :

@@ -77,6 +184,17 @@ class DocParser(object):
         return startpos


+    # returns a vector of integers for the tagpath
+    def getData(self, tagpath, pos, end):
+        argres=[]
+        (foundat, argt) = self.findinDoc(tagpath, pos, end)
+        if (argt != None) and (len(argt) > 0) :
+            argList = argt.split('|')
+            argres = [ int(strval) for strval in argList]
+        return argres
+
+
     # build a description of the paragraph
     def getParaDescription(self, start, end):

@@ -120,6 +238,7 @@ class DocParser(object):
         # this type of paragraph may be made up of multiple _spans, inline
         # word monograms (images) and words with semantic meaning
         # and now a new type "span" versus the old "_span"
+        # plus glyphs used to form starting letter of first word

         # need to parse this type line by line
         line = start + 1

@@ -143,6 +262,21 @@ class DocParser(object):
                 result.append(('ocr', wordnum))
                 line += 1

+            elif name.endswith('word.firstGlyph') :
+                first = int(argres)
+                (name, argres) = self.lineinDoc(line+1)
+                if not name.endswith('word.lastGlyph'):
+                    print 'Error: - incorrect glyph ordering inside word in paragraph'
+                last = int(argres)
+                glyphList = []
+                for glyphnum in xrange(first, last):
+                    glyphList.append(glyphnum)
+                num = self.svgcount
+                self.glyphs_to_image(glyphList)
+                self.svgcount += 1
+                result.append(('svg', num))
+                line += 1
+
             elif name.endswith('word.class'):
                 (cname, space) = argres.split('-',1)
                 if space == '' : space = '0'

@@ -241,6 +375,11 @@ class DocParser(object):
                 parares += '<img src="img/img%04d.jpg" alt="" />' % num
                 parares += sep

+            elif wtype == 'svg' :
+                sep = ''
+                parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
+                parares += sep
+
         if len(sep) > 0 : parares = parares[0:-1]
         if (type == 'full') or (type == 'end') :
             parares += '</p>'

@@ -260,10 +399,7 @@ class DocParser(object):
         if argres : self.ocrtext = argres.split('|')

         # get information to dehyphenate the text
-        (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
-        if argres:
-            argList = argres.split('|')
-            self.dehyphen_rootid = [ int(strval) for strval in argList]
+        self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)

         # determine if first paragraph is continued from previous page
         (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)

@@ -274,16 +410,10 @@ class DocParser(object):
         last_para_continued = (self.paracont_stemid != None)

         # collect link ids
-        (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
-        if argres:
-            argList = argres.split('|')
-            self.link_id = [ int(strval) for strval in argList]
+        self.link_id = self.getData('info.word.link_id',0,-1)

         # collect link destination page numbers
-        (pos, argres) = self.findinDoc('info.links.page',0,-1)
-        if argres :
-            argList = argres.split('|')
-            self.link_page = [ int(strval) for strval in argList]
+        self.link_page = self.getData('info.links.page',0,-1)

         # collect link titles
         (pos, argres) = self.findinDoc('info.links.title',0,-1)

@@ -382,23 +512,45 @@ class DocParser(object):


             elif (regtype == 'table') :
-                ptype = 'full'
-                if first_para_continued :
-                    ptype = 'end'
-                    first_para_continued = False
-                (pclass, pdesc) = self.getParaDescription(start,end)
-                htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-                print "Warnings - Table Conversions are notoriously poor"
-                print "Strongly recommend taking a screen capture image of the "
-                print "table in %s.svg and using it to replace this attempt at a table" % self.id
+                # translate first and last word into first and last glyphs
+                # and generate table as an image and include a link to it
+                glyphList = []
+                (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
+                (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+                firstglyphList = self.getData('word.firstGlyph',0,-1)
+                gidList = self.getData('info.glyph.glyphID',0,-1)
+                if (sfirst != None) and (slast != None) :
+                    first = int(sfirst)
+                    last = int(slast)
+                    firstGlyph = firstglyphList[first]
+                    if last < len(firstglyphList):
+                        lastGlyph = firstglyphList[last]
+                    else :
+                        lastGlyph = len(gidList)
+                    for glyphnum in xrange(firstGlyph, lastGlyph):
+                        glyphList.append(glyphnum)
+                    num = self.svgcount
+                    self.glyphs_to_image(glyphList)
+                    self.svgcount += 1
+                    htmlpage += '<div class="graphic"><img src="img/' + self.id + '_%04d.svg" alt="" /></div>' % num
+                else :
+                    ptype = 'full'
+                    if first_para_continued :
+                        ptype = 'end'
+                        first_para_continued = False
+                    (pclass, pdesc) = self.getParaDescription(start,end)
+                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
+                    print " "
+                    print "Warning: - Table Conversions are notoriously poor"
+                    print " Strongly recommend taking a screen capture image of the "
+                    print " table in %s.svg and using it to replace this attempt at a table" % self.id
+                    print " "

             elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
                 (pos, simgsrc) = self.findinDoc('img.src',start,end)
                 if simgsrc:
                     htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)


             else :
                 print 'Warning: region type', regtype
                 (pos, temp) = self.findinDoc('paragraph',start,end)

@@ -437,10 +589,10 @@ class DocParser(object):



-def convert2HTML(flatxml, classlst, fileid):
+def convert2HTML(flatxml, classlst, fileid, bookDir):

     # create a document parser
-    dp = DocParser(flatxml, classlst, fileid)
+    dp = DocParser(flatxml, classlst, fileid, bookDir)

     htmlpage = dp.process()
@@ -1,5 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8

 import os, sys, getopt

@@ -65,6 +66,12 @@ def main(argv):
         print "Can not find image directory in unencrypted book"
         sys.exit(-1)

+    svgDir = os.path.join(bookDir,'svg')
+    if not os.path.exists(svgDir) :
+        print "Can not find svg directory in unencrypted book"
+        print "please run gensvg.py before running genhtml.py"
+        sys.exit(-1)
+
     otherFile = os.path.join(bookDir,'other0000.dat')
     if not os.path.exists(otherFile) :
         print "Can not find other0000.dat in unencrypted book"

@@ -75,7 +82,6 @@ def main(argv):
         print "Can not find metadata0000.dat in unencrypted book"
         sys.exit(-1)

-
     htmlFileName = "book.html"
     htmlstr = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
     htmlstr += '<html>\n'

@@ -133,7 +139,7 @@ def main(argv):
         print ' ', filename
         fname = os.path.join(pageDir,filename)
         flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
-        htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname)
+        htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir)

     htmlstr += '</body>\n</html>\n'
@@ -1,11 +1,11 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8

 import os, sys, getopt

 # local routines
 import convert2xml
-import flatxml2html
 import decode_meta


@@ -45,6 +45,13 @@ class GParser(object):
             argres[j] = int(argres[j])
         return result

+
+    def getGlyphDim(self, gly):
+        maxh = (self.gh[gly] * self.dpi) / self.gdpi[gly]
+        maxw = (self.gw[gly] * self.dpi) / self.gdpi[gly]
+        return maxh, maxw
+
+
     def getPath(self, gly):
         path = ''
         if (gly < 0) or (gly >= self.count):

@@ -172,8 +179,10 @@ class PParser(object):
 def usage():
     print 'Usage: '
     print ' '
-    print ' gensvg.py unencryptedBookDir'
+    print ' gensvg.py [options] unencryptedBookDir'
     print ' '
+    print ' -x : output browseable XHTML+SVG pages (default)'
+    print ' -r : output raw SVG images'


 def main(argv):

@@ -185,7 +194,7 @@ def main(argv):
         argv = argv.split()

     try:
-        opts, args = getopt.getopt(argv[1:], "h:")
+        opts, args = getopt.getopt(argv[1:], "xrh")

     except getopt.GetoptError, err:
         print str(err)

@@ -196,10 +205,15 @@ def main(argv):
         usage()
         sys.exit(2)

+    raw = 0
     for o, a in opts:
         if o =="-h":
             usage()
             sys.exit(0)
+        if o =="-x":
+            raw = 0
+        if o =="-r":
+            raw = 1

     bookDir = args[0]

@@ -264,7 +278,9 @@ def main(argv):
         gp = GParser(flat_xml)
         for i in xrange(0, gp.count):
             path = gp.getPath(i)
-            glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
+            maxh, maxw = gp.getGlyphDim(i)
+            # glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
+            glyfile.write('<path id="gl%d" d="%s" fill="black" /><!-- width=%d height=%d -->\n' % (counter * 256 + i, path, maxw, maxh ))
         counter += 1
     glyfile.write('</defs>\n')
     glyfile.write('</svg>\n')

@@ -274,7 +290,7 @@ def main(argv):

     # Books are at 1440 DPI. This is rendering at twice that size for
     # readability when rendering to the screen.
-    scaledpi = 720
+    scaledpi = 1440
     filenames = os.listdir(pageDir)
     filenames = sorted(filenames)
     counter = 0

@@ -283,11 +299,45 @@ def main(argv):
         fname = os.path.join(pageDir,filename)
         flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
         pp = PParser(flat_xml)
-        pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
+        if (raw) :
+            pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
+        else :
+            pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
+
         pfile.write('<?xml version="1.0" standalone="no"?>\n')
-        pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
-        pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
-        pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
+        if (raw):
+            pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
+            pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
+            pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
+        else:
+            pfile.write('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n');
+            pfile.write('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n');
+            pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
+            pfile.write('<script><![CDATA[\n');
+            pfile.write('function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n');
+            pfile.write('var dpi=%d;\n' % scaledpi);
+            if (counter) :
+                pfile.write('var prevpage="page%04d.xhtml";\n' % (counter - 1))
+            if (counter < len(filenames)-1) :
+                pfile.write('var nextpage="page%04d.xhtml";\n' % (counter + 1))
+            pfile.write('var pw=%d;var ph=%d;' % (pp.pw, pp.ph))
+            pfile.write('function zoomin(){dpi=dpi*(2/3);setsize();}\n')
+            pfile.write('function zoomout(){dpi=dpi*1.5;setsize();}\n')
+            pfile.write('function setsize(){var svg=document.getElementById("svgimg");var prev=document.getElementById("prevsvg");var next=document.getElementById("nextsvg");var width=(pw/dpi)+"in";var height=(ph/dpi)+"in";svg.setAttribute("width",width);svg.setAttribute("height",height);prev.setAttribute("height",height);prev.setAttribute("width","50px");next.setAttribute("height",height);next.setAttribute("width","50px");}\n')
+            pfile.write('function ppage(){window.location.href=prevpage+"?dpi="+Math.round(dpi);}\n')
+            pfile.write('function npage(){window.location.href=nextpage+"?dpi="+Math.round(dpi);}\n')
+            pfile.write('var gt=gd();if(gt>0){dpi=gt;}\n')
+            pfile.write('window.onload=setsize;\n')
+            pfile.write(']]></script>\n')
+            pfile.write('</head>\n')
+            pfile.write('<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n')
+            pfile.write('<div style="white-space:nowrap;">\n')
+            if (counter == 0) :
+                pfile.write('<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n')
+            else:
+                pfile.write('<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n')
+            pfile.write('<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph))
+
         if (pp.gid != None):
             pfile.write('<defs>\n')
             gdefs = pp.getGlyphs(glyfname)

@@ -303,7 +353,18 @@ def main(argv):
                 pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j]))
         if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
             pfile.write('<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n');
-        pfile.write('</svg>')
+        if (raw) :
+            pfile.write('</svg>')
+        else :
+            pfile.write('</svg></a>\n')
+            if (counter == len(filenames) - 1) :
+                pfile.write('<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n')
+            else :
+                pfile.write('<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n')
+            pfile.write('</div>\n')
+            pfile.write('<div><a href="javascript:zoomin();">zoom in</a> - <a href="javascript:zoomout();">zoom out</a></div>\n')
+            pfile.write('</body>\n')
+            pfile.write('</html>\n')
         pfile.close()
         counter += 1

@@ -312,4 +373,4 @@ def main(argv):
     return 0

 if __name__ == '__main__':
     sys.exit(main(''))
@@ -1,5 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8

 import os, sys, getopt
@@ -1,5 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8

 from __future__ import with_statement
 import csv
@@ -19,25 +19,16 @@ Here are the steps:
 1. Unzip the topazscripts.zip file to get the full set of python scripts.
 The files you should have after unzipping are:

-cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files
-decode_meta.py - converts metadata0000.dat to human readable text (for the most part)
+cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files for Kindle for PC
+cmbtc_dump_nonK4PC.py - (author - DiapDealer) for use with standalone Kindle and ipod/iphone topaz books
+decode_meta.py - converts metadata0000.dat to make it available
 convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
 flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
 stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
 getpagedim.py - reads page0000.dat to get the book height and width parameters
 genxml.py - main program to convert everything to xml
 genhtml.py - main program to generate "book.html"
-gensvg.py - (author: clarknova) main program to create an svg graphic of each page
+gensvg.py - (author: clarknova) main program to create an xhtml page with embedded svg graphics


-In addition there is now a new file:
-
-cmbtc_dump_mac_linux.py
-
-If you know the pid of your ipod and/or your standalone Kindle and your book
-was meant for that device, you can use this program to dump the proper sections
-on Mac OSX and Linux (and even Windows if you do not have Kindle4PC installed).
-Thank DiapDealer for creating it!

 Please note, gensvg.py, genhtml.py, and genxml.py import and use

@@ -52,8 +43,20 @@ of its contents as files
 All Thanks go to CMBTC who broke the DRM for Topaz - without it nothing else
 would be possible

+If you purchased the book for Kindle For PC, you must do the following:
+
 cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE

+However, if you purchased the book for a standalone Kindle or ipod/iphone
+and you know your pid (at least the first 8 characters) then you should
+instead do the following:
+
+cmbtc_dump_nonK4PC.py -d -o TARGETDIR -p 12345678 YOURTOPAZBOOKNAMEHERE
+
+where 12345678 should be replaced by the first 8 characters of your PID
+

 This should create a directory called "TARGETDIR" in your current directory.
 It should have the following files in it:

@@ -64,35 +67,48 @@ page - directory filled with page*.dat files
 glyphs - directory filled with glyphs*.dat files


-3. Convert the files in "TARGETDIR" to their xml descriptions
-which can be found in TARGETDIR/xml/ upon completion.
+3. REQUIRED: Create xhtml page descriptions with embedded svg
+that show the exact representation of each page as an image
+with proper glyphs and positioning.
+
+This step must NOW be done BEFORE attempting conversion to html

-genxml.py TARGETDIR
+gensvg.py TARGETDIR

+When complete, use a web-browser to open the page*.xhtml files
+in TARGETDIR/svg/ to see what the book really looks like.
+
+All thanks go to CLARKNOVA for this program. This program is
+needed to actually see the true image of each page and so that
+the next step can properly create images from glyphs for
+monograms, dropcaps and tables.
+
+
-4. Create book.html which can be found in "TARGETDIR" after
-completion. This html conversion can not fully capture
-all of the layouts actually used in the book and needs to
-be edited to include special font handling such as bold
-or italics that can not be determined from the ocrText
-information or the style information. If you want to
-see things exactly as they were, see step 5 below.
+4. Create "book.html" which can be found in "TARGETDIR" after
+completion.

 genhtml.py TARGETDIR

+***IMPORTANT NOTE*** This html conversion can not fully capture
+all of the layouts and styles actually used in the book
+and the resulting html will need to be edited by hand to
+properly set bold and/or italics, handle font size changes,
+and to fix the sometimes horrific mistakes in the ocrText
+used to create the html.
+
+FYI: Sigil is a wonderful, free cross-
+platform program that can be used to edit the html and
+create an epub if you so desire.

-5. Create an svg description of each page which can
-be found in TARGETDIR/svg/ upon completion.
-
-All thanks go to CLARKNOVA for this program. This program is
-needed to actually see the true image of each page so that hand
-editing of the html created by step 4 can be done.
-
-Or use the resulting svg files to read each page of the book
-exactly as it has been laid out originally.
+5. Optional Step: Convert the files in "TARGETDIR" to their
+xml descriptions which can be found in TARGETDIR/xml/
+upon completion.

-gensvg.py TARGETDIR
+genxml.py TARGETDIR

+These conversions are important for allowing future (and better)
+conversions to come later.
@@ -1,5 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8

 from __future__ import with_statement
 import csv