From 24f001c61e0e7b313d0ac461315e07a4c2e62e57 Mon Sep 17 00:00:00 2001
From: some_updates
Date: Sun, 24 Jan 2010 12:19:20 +0000
Subject: [PATCH] topazscripts 1.8
---
Topaz_Tools/lib/changes.txt | 20 ++-
Topaz_Tools/lib/cmbtc_dump.py | 1 +
Topaz_Tools/lib/cmbtc_dump_nonK4PC.py | 1 +
Topaz_Tools/lib/convert2xml.py | 1 +
Topaz_Tools/lib/decode_meta.py | 1 +
Topaz_Tools/lib/flatxml2html.py | 204 ++++++++++++++++++++++----
Topaz_Tools/lib/genhtml.py | 10 +-
Topaz_Tools/lib/gensvg.py | 83 +++++++++--
Topaz_Tools/lib/genxml.py | 1 +
Topaz_Tools/lib/getpagedim.py | 1 +
Topaz_Tools/lib/readme.txt | 80 ++++++----
Topaz_Tools/lib/stylexml2css.py | 1 +
12 files changed, 332 insertions(+), 72 deletions(-)
diff --git a/Topaz_Tools/lib/changes.txt b/Topaz_Tools/lib/changes.txt
index 8d6b4f0..83910c0 100644
--- a/Topaz_Tools/lib/changes.txt
+++ b/Topaz_Tools/lib/changes.txt
@@ -1,3 +1,22 @@
+Changes in version 1.8
+
+ - gensvg.py now builds wonderful xhtml pages with embedded svg
+ that can be easily paged through as if reading a book!
+ (tested in Safari for Mac and Win and Firefox)
+ (requires javascript to be enabled)
+
+ - genhtml.py now REQUIRES that gensvg.py be run FIRST
+ this allows creation of images on the fly from glyphs
+
+ - genhtml.py now automatically converts tables of words into
+ svg-based images and handles glyph-based ornate first
+ letters of words
+
+ - cmbtc_dump_mac_linux.py has been renamed to be
+ cmbtc_dump_nonK4PC.py to make it clearer
+ when it needs to be used
+
+
Changes in version 1.7
- gensvg.py has been improved so that the glyphs render exactly (ClarkNova)
- gensvg.py has fixed a render order "bug" that allowed some images to cover or hide text. (ClarkNova)
@@ -5,7 +24,6 @@ Changes in version 1.7
- add missing tag
- make xhtml compliant doctype and minor changes to write correct xhtml
- make divs that act as anchors be hidden visually and to take up 0 height and 0 width to prevent any impact on layout
- - added support for new version of the <_span> tag called
Changes in version 1.6
- support for books whose paragraphs have no styles
diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py
index 9cd32de..ac7e33c 100644
--- a/Topaz_Tools/lib/cmbtc_dump.py
+++ b/Topaz_Tools/lib/cmbtc_dump.py
@@ -1,4 +1,5 @@
#! /usr/bin/python
+# For use with Topaz Scripts Version 1.8
"""
diff --git a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py
index 8a03a3a..ed7ff87 100644
--- a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py
+++ b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py
@@ -1,4 +1,5 @@
#! /usr/bin/python
+# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py
index 06bd89f..12ca934 100644
--- a/Topaz_Tools/lib/convert2xml.py
+++ b/Topaz_Tools/lib/convert2xml.py
@@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv
diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py
index f038310..ba831ec 100644
--- a/Topaz_Tools/lib/decode_meta.py
+++ b/Topaz_Tools/lib/decode_meta.py
@@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv
diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py
index 0b28468..4182502 100644
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@@ -1,21 +1,27 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv
import sys
import os
+import math
import getopt
from struct import pack
from struct import unpack
class DocParser(object):
- def __init__(self, flatxml, classlst, fileid):
+ def __init__(self, flatxml, classlst, fileid, bookDir):
self.id = os.path.basename(fileid).replace('.dat','')
+ self.svgcount = 0
self.docList = flatxml.split('\n')
self.docSize = len(self.docList)
self.classList = {}
+ self.bookDir = bookDir
+ self.glyphPaths = { }
+ self.numPaths = 0
tmpList = classlst.split('\n')
for pclass in tmpList:
if pclass != '':
@@ -30,6 +36,107 @@ class DocParser(object):
self.paracont_stemid = []
self.parastems_stemid = []
+
+ def getGlyph(self, gid):
+ result = ''
+ id='gl%d' % gid
+ return self.glyphPaths[id]
+
+
+ def glyphs_to_image(self, glyphList):
+
+ def extract(path, key):
+ b = path.find(key) + len(key)
+ e = path.find(' ',b)
+ return int(path[b:e])
+
+ def extractID(path, key):
+ b = path.find(key) + len(key)
+ e = path.find('"',b)
+ return path[b:e]
+
+
+ svgDir = os.path.join(self.bookDir,'svg')
+ glyfile = os.path.join(svgDir,'glyphs.svg')
+
+ imgDir = os.path.join(self.bookDir,'img')
+ imgname = self.id + '_%04d.svg' % self.svgcount
+ imgfile = os.path.join(imgDir,imgname)
+
+ # build hash table of glyph paths keyed by glyph id
+ if self.numPaths == 0:
+ gfile = open(glyfile, 'r')
+ while True:
+ path = gfile.readline()
+ if (path == ''): break
+ glyphid = extractID(path,'id="')
+ self.glyphPaths[glyphid] = path
+ self.numPaths += 1
+ gfile.close()
+
+
+ # get glyph information
+ gxList = self.getData('info.glyph.x',0,-1)
+ gyList = self.getData('info.glyph.y',0,-1)
+ gidList = self.getData('info.glyph.glyphID',0,-1)
+
+ gids = []
+ maxws = []
+ maxhs = []
+ xs = []
+ ys = []
+ gdefs = []
+
+ # get path definitions, positions, dimensions for each glyph
+ # that makes up the image, and find min x and min y to reposition origin
+ minx = -1
+ miny = -1
+ for j in glyphList:
+ gid = gidList[j]
+ gids.append(gid)
+
+ xs.append(gxList[j])
+ if minx == -1: minx = gxList[j]
+ else : minx = min(minx, gxList[j])
+
+ ys.append(gyList[j])
+ if miny == -1: miny = gyList[j]
+ else : miny = min(miny, gyList[j])
+
+ path = self.getGlyph(gid)
+ gdefs.append(path)
+
+ maxws.append(extract(path,'width='))
+ maxhs.append(extract(path,'height='))
+
+
+ # change the origin to minx, miny and calc max height and width
+ maxw = maxws[0] + xs[0] - minx
+ maxh = maxhs[0] + ys[0] - miny
+ for j in xrange(0, len(xs)):
+ xs[j] = xs[j] - minx
+ ys[j] = ys[j] - miny
+ maxw = max( maxw, (maxws[j] + xs[j]) )
+ maxh = max( maxh, (maxhs[j] + ys[j]) )
+
+ # open the image file for output
+ ifile = open(imgfile,'w')
+ ifile.write('\n')
+ ifile.write('\n')
+ ifile.write('')
+ ifile.close()
+
+ return 0
+
+
+
# return tag at line pos in document
def lineinDoc(self, pos) :
if (pos >= 0) and (pos < self.docSize) :
@@ -77,6 +184,17 @@ class DocParser(object):
return startpos
+ # returns a vector of integers for the tagpath
+ def getData(self, tagpath, pos, end):
+ argres=[]
+ (foundat, argt) = self.findinDoc(tagpath, pos, end)
+ if (argt != None) and (len(argt) > 0) :
+ argList = argt.split('|')
+ argres = [ int(strval) for strval in argList]
+ return argres
+
+
+
# build a description of the paragraph
def getParaDescription(self, start, end):
@@ -120,6 +238,7 @@ class DocParser(object):
# this type of paragrph may be made up of multiple _spans, inline
# word monograms (images) and words with semantic meaning
# and now a new type "span" versus the old "_span"
+ # plus glyphs used to form starting letter of first word
# need to parse this type line by line
line = start + 1
@@ -143,6 +262,21 @@ class DocParser(object):
result.append(('ocr', wordnum))
line += 1
+ elif name.endswith('word.firstGlyph') :
+ first = int(argres)
+ (name, argres) = self.lineinDoc(line+1)
+ if not name.endswith('word.lastGlyph'):
+ print 'Error: - incorrect glyph ordering inside word in paragraph'
+ last = int(argres)
+ glyphList = []
+ for glyphnum in xrange(first, last):
+ glyphList.append(glyphnum)
+ num = self.svgcount
+ self.glyphs_to_image(glyphList)
+ self.svgcount += 1
+ result.append(('svg', num))
+ line += 1
+
elif name.endswith('word.class'):
(cname, space) = argres.split('-',1)
if space == '' : space = '0'
@@ -241,6 +375,11 @@ class DocParser(object):
parares += '' % num
parares += sep
+ elif wtype == 'svg' :
+ sep = ''
+ parares += '' % num
+ parares += sep
+
if len(sep) > 0 : parares = parares[0:-1]
if (type == 'full') or (type == 'end') :
parares += '
'
@@ -260,10 +399,7 @@ class DocParser(object):
if argres : self.ocrtext = argres.split('|')
# get information to dehyphenate the text
- (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
- if argres:
- argList = argres.split('|')
- self.dehyphen_rootid = [ int(strval) for strval in argList]
+ self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
# determine if first paragraph is continued from previous page
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
@@ -274,16 +410,10 @@ class DocParser(object):
last_para_continued = (self.paracont_stemid != None)
# collect link ids
- (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
- if argres:
- argList = argres.split('|')
- self.link_id = [ int(strval) for strval in argList]
+ self.link_id = self.getData('info.word.link_id',0,-1)
# collect link destination page numbers
- (pos, argres) = self.findinDoc('info.links.page',0,-1)
- if argres :
- argList = argres.split('|')
- self.link_page = [ int(strval) for strval in argList]
+ self.link_page = self.getData('info.links.page',0,-1)
# collect link titles
(pos, argres) = self.findinDoc('info.links.title',0,-1)
@@ -382,23 +512,45 @@ class DocParser(object):
elif (regtype == 'table') :
- ptype = 'full'
- if first_para_continued :
- ptype = 'end'
- first_para_continued = False
- (pclass, pdesc) = self.getParaDescription(start,end)
- htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
- print "Warnings - Table Conversions are notoriously poor"
- print "Strongly recommend taking a screen capture image of the "
- print "table in %s.svg and using it to replace this attempt at a table" % self.id
-
+ # translate first and last word into first and last glyphs
+ # and generate table as an image and include a link to it
+ glyphList = []
+ (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
+ (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+ firstglyphList = self.getData('word.firstGlyph',0,-1)
+ gidList = self.getData('info.glyph.glyphID',0,-1)
+ if (sfirst != None) and (slast != None) :
+ first = int(sfirst)
+ last = int(slast)
+ firstGlyph = firstglyphList[first]
+ if last < len(firstglyphList):
+ lastGlyph = firstglyphList[last]
+ else :
+ lastGlyph = len(gidList)
+ for glyphnum in xrange(firstGlyph, lastGlyph):
+ glyphList.append(glyphnum)
+ num = self.svgcount
+ self.glyphs_to_image(glyphList)
+ self.svgcount += 1
+ htmlpage += '' % num
+ else :
+ ptype = 'full'
+ if first_para_continued :
+ ptype = 'end'
+ first_para_continued = False
+ (pclass, pdesc) = self.getParaDescription(start,end)
+ htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
+ print " "
+ print "Warning: - Table Conversions are notoriously poor"
+ print " Strongly recommend taking a screen capture image of the "
+ print " table in %s.svg and using it to replace this attempt at a table" % self.id
+ print " "
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '' % int(simgsrc)
-
else :
print 'Warning: region type', regtype
(pos, temp) = self.findinDoc('paragraph',start,end)
@@ -437,10 +589,10 @@ class DocParser(object):
-def convert2HTML(flatxml, classlst, fileid):
+def convert2HTML(flatxml, classlst, fileid, bookDir):
# create a document parser
- dp = DocParser(flatxml, classlst, fileid)
+ dp = DocParser(flatxml, classlst, fileid, bookDir)
htmlpage = dp.process()
diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py
index 3333f82..58d9e9a 100644
--- a/Topaz_Tools/lib/genhtml.py
+++ b/Topaz_Tools/lib/genhtml.py
@@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8
import os, sys, getopt
@@ -65,6 +66,12 @@ def main(argv):
print "Can not find image directory in unencrypted book"
sys.exit(-1)
+ svgDir = os.path.join(bookDir,'svg')
+ if not os.path.exists(svgDir) :
+ print "Can not find svg directory in unencrypted book"
+ print "please run gensvg.py before running genhtml.py"
+ sys.exit(-1)
+
otherFile = os.path.join(bookDir,'other0000.dat')
if not os.path.exists(otherFile) :
print "Can not find other0000.dat in unencrypted book"
@@ -75,7 +82,6 @@ def main(argv):
print "Can not find metadata0000.dat in unencrypted book"
sys.exit(-1)
-
htmlFileName = "book.html"
htmlstr = '\n'
htmlstr += '\n'
@@ -133,7 +139,7 @@ def main(argv):
print ' ', filename
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
- htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname)
+ htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir)
htmlstr += '\n