tools v4.8
This commit is contained in:
parent
e95ed1a8ed
commit
93f02c625a
|
@ -19,7 +19,7 @@ class K4DeDRM(FileTypePlugin):
|
|||
description = 'Removes DRM from Mobipocket, Kindle/Mobi, Kindle/Topaz and Kindle/Print Replica files. Provided by the work of many including DiapDealer, SomeUpdates, IHeartCabbages, CMBDTC, Skindle, DarkReverser, ApprenticeAlf, etc.'
|
||||
supported_platforms = ['osx', 'windows', 'linux'] # Platforms this plugin will run on
|
||||
author = 'DiapDealer, SomeUpdates' # The author of this plugin
|
||||
version = (0, 3, 7) # The version number of this plugin
|
||||
version = (0, 3, 8) # The version number of this plugin
|
||||
file_types = set(['prc','mobi','azw','azw1','azw4','tpz']) # The file types that this plugin will be applied to
|
||||
on_import = True # Run this plugin during the import
|
||||
priority = 210 # run this plugin before mobidedrm, k4pcdedrm, k4dedrm
|
||||
|
|
|
@ -20,6 +20,8 @@ import getopt
|
|||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
class TpzDRMError(Exception):
|
||||
pass
|
||||
|
||||
# Get a 7 bit encoded number from string. The most
|
||||
# significant byte comes first and has the high bit (8th) set
|
||||
|
@ -138,7 +140,8 @@ class Dictionary(object):
|
|||
return self.stable[self.pos]
|
||||
else:
|
||||
print "Error - %d outside of string table limits" % val
|
||||
sys.exit(-1)
|
||||
raise TpzDRMError('outside of string table limits')
|
||||
# sys.exit(-1)
|
||||
|
||||
def getSize(self):
|
||||
return self.size
|
||||
|
@ -258,6 +261,11 @@ class PageParser(object):
|
|||
'paragraph.class' : (1, 'scalar_text', 0, 0),
|
||||
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
|
||||
'word_semantic' : (1, 'snippets', 1, 1),
|
||||
'word_semantic.type' : (1, 'scalar_text', 0, 0),
|
||||
|
@ -272,11 +280,17 @@ class PageParser(object):
|
|||
|
||||
'_span' : (1, 'snippets', 1, 0),
|
||||
'_span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'-span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'span' : (1, 'snippets', 1, 0),
|
||||
'span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'extratokens' : (1, 'snippets', 1, 0),
|
||||
'extratokens.type' : (1, 'scalar_text', 0, 0),
|
||||
|
|
|
@ -271,6 +271,9 @@ class DocParser(object):
|
|||
|
||||
pclass = self.getClass(pclass)
|
||||
|
||||
# if paragraph uses extratokens (extra glyphs) then make it fixed
|
||||
(pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
|
||||
|
||||
# build up a description of the paragraph in result and return it
|
||||
# first check for the basic - all words paragraph
|
||||
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
|
||||
|
@ -280,6 +283,7 @@ class DocParser(object):
|
|||
last = int(slast)
|
||||
|
||||
makeImage = (regtype == 'vertical') or (regtype == 'table')
|
||||
makeImage = makeImage or (extraglyphs != None)
|
||||
if self.fixedimage:
|
||||
makeImage = makeImage or (regtype == 'fixed')
|
||||
|
||||
|
@ -353,6 +357,8 @@ class DocParser(object):
|
|||
|
||||
word_class = ''
|
||||
|
||||
word_semantic_type = ''
|
||||
|
||||
while (line < end) :
|
||||
|
||||
(name, argres) = self.lineinDoc(line)
|
||||
|
@ -512,6 +518,72 @@ class DocParser(object):
|
|||
return parares
|
||||
|
||||
|
||||
def buildTOCEntry(self, pdesc) :
|
||||
parares = ''
|
||||
sep =''
|
||||
tocentry = ''
|
||||
handle_links = len(self.link_id) > 0
|
||||
|
||||
lstart = 0
|
||||
|
||||
cnt = len(pdesc)
|
||||
for j in xrange( 0, cnt) :
|
||||
|
||||
(wtype, num) = pdesc[j]
|
||||
|
||||
if wtype == 'ocr' :
|
||||
word = self.ocrtext[num]
|
||||
sep = ' '
|
||||
|
||||
if handle_links:
|
||||
link = self.link_id[num]
|
||||
if (link > 0):
|
||||
linktype = self.link_type[link-1]
|
||||
title = self.link_title[link-1]
|
||||
title = title.rstrip('. ')
|
||||
alt_title = parares[lstart:]
|
||||
alt_title = alt_title.strip()
|
||||
# now strip off the actual printed page number
|
||||
alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
|
||||
alt_title = alt_title.rstrip('. ')
|
||||
# skip over any external links - can't have them in a books toc
|
||||
if linktype == 'external' :
|
||||
title = ''
|
||||
alt_title = ''
|
||||
linkpage = ''
|
||||
else :
|
||||
if len(self.link_page) >= link :
|
||||
ptarget = self.link_page[link-1] - 1
|
||||
linkpage = '%04d' % ptarget
|
||||
else :
|
||||
# just link to the current page
|
||||
linkpage = self.id[4:]
|
||||
if len(alt_title) >= len(title):
|
||||
title = alt_title
|
||||
if title != '' and linkpage != '':
|
||||
tocentry += title + '|' + linkpage + '\n'
|
||||
lstart = len(parares)
|
||||
if word == '_link_' : word = ''
|
||||
elif (link < 0) :
|
||||
if word == '_link_' : word = ''
|
||||
|
||||
if word == '_lb_':
|
||||
word = ''
|
||||
sep = ''
|
||||
|
||||
if num in self.dehyphen_rootid :
|
||||
word = word[0:-1]
|
||||
sep = ''
|
||||
|
||||
parares += word + sep
|
||||
|
||||
else :
|
||||
continue
|
||||
|
||||
return tocentry
|
||||
|
||||
|
||||
|
||||
|
||||
# walk the document tree collecting the information needed
|
||||
# to build an html page using the ocrText
|
||||
|
@ -519,6 +591,7 @@ class DocParser(object):
|
|||
def process(self):
|
||||
|
||||
htmlpage = ''
|
||||
tocinfo = ''
|
||||
|
||||
# get the ocr text
|
||||
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
|
||||
|
@ -644,9 +717,9 @@ class DocParser(object):
|
|||
ptype = 'end'
|
||||
first_para_continued = False
|
||||
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||
tocinfo += self.buildTOCEntry(pdesc)
|
||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||
|
||||
|
||||
elif (regtype == 'vertical') or (regtype == 'table') :
|
||||
ptype = 'full'
|
||||
if inGroup:
|
||||
|
@ -704,12 +777,11 @@ class DocParser(object):
|
|||
htmlpage = htmlpage[0:-4]
|
||||
last_para_continued = False
|
||||
|
||||
return htmlpage
|
||||
|
||||
return htmlpage, tocinfo
|
||||
|
||||
|
||||
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
|
||||
# create a document parser
|
||||
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
|
||||
htmlpage = dp.process()
|
||||
return htmlpage
|
||||
htmlpage, tocinfo = dp.process()
|
||||
return htmlpage, tocinfo
|
||||
|
|
|
@ -10,17 +10,94 @@ from struct import unpack
|
|||
|
||||
|
||||
class PParser(object):
|
||||
def __init__(self, gd, flatxml):
|
||||
def __init__(self, gd, flatxml, meta_array):
|
||||
self.gd = gd
|
||||
self.flatdoc = flatxml.split('\n')
|
||||
self.docSize = len(self.flatdoc)
|
||||
self.temp = []
|
||||
foo = self.getData('page.h') or self.getData('book.h')
|
||||
self.ph = foo[0]
|
||||
foo = self.getData('page.w') or self.getData('book.w')
|
||||
self.pw = foo[0]
|
||||
self.gx = self.getData('info.glyph.x')
|
||||
self.gy = self.getData('info.glyph.y')
|
||||
self.gid = self.getData('info.glyph.glyphID')
|
||||
|
||||
self.ph = -1
|
||||
self.pw = -1
|
||||
startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
|
||||
for p in startpos:
|
||||
(name, argres) = self.lineinDoc(p)
|
||||
self.ph = max(self.ph, int(argres))
|
||||
startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
|
||||
for p in startpos:
|
||||
(name, argres) = self.lineinDoc(p)
|
||||
self.pw = max(self.pw, int(argres))
|
||||
|
||||
if self.ph <= 0:
|
||||
self.ph = int(meta_array.get('pageHeight', '11000'))
|
||||
if self.pw <= 0:
|
||||
self.pw = int(meta_array.get('pageWidth', '8500'))
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.x')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.x', p)
|
||||
res.extend(argres)
|
||||
self.gx = res
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.y')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.y', p)
|
||||
res.extend(argres)
|
||||
self.gy = res
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.glyphID')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.glyphID', p)
|
||||
res.extend(argres)
|
||||
self.gid = res
|
||||
|
||||
|
||||
# return tag at line pos in document
|
||||
def lineinDoc(self, pos) :
|
||||
if (pos >= 0) and (pos < self.docSize) :
|
||||
item = self.flatdoc[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
return name, argres
|
||||
|
||||
# find tag in doc if within pos to end inclusive
|
||||
def findinDoc(self, tagpath, pos, end) :
|
||||
result = None
|
||||
if end == -1 :
|
||||
end = self.docSize
|
||||
else:
|
||||
end = min(self.docSize, end)
|
||||
foundat = -1
|
||||
for j in xrange(pos, end):
|
||||
item = self.flatdoc[j]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
if name.endswith(tagpath) :
|
||||
result = argres
|
||||
foundat = j
|
||||
break
|
||||
return foundat, result
|
||||
|
||||
# return list of start positions for the tagpath
|
||||
def posinDoc(self, tagpath):
|
||||
startpos = []
|
||||
pos = 0
|
||||
res = ""
|
||||
while res != None :
|
||||
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
|
||||
if res != None :
|
||||
startpos.append(foundpos)
|
||||
pos = foundpos + 1
|
||||
return startpos
|
||||
|
||||
def getData(self, path):
|
||||
result = None
|
||||
cnt = len(self.flatdoc)
|
||||
|
@ -39,6 +116,23 @@ class PParser(object):
|
|||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
return result
|
||||
|
||||
def getDataatPos(self, path, pos):
|
||||
result = None
|
||||
item = self.flatdoc[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argt) = item.split('=')
|
||||
argres = argt.split('|')
|
||||
else:
|
||||
name = item
|
||||
argres = []
|
||||
if (len(argres) > 0) :
|
||||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
if (name.endswith(path)):
|
||||
result = argres
|
||||
return result
|
||||
|
||||
def getDataTemp(self, path):
|
||||
result = None
|
||||
cnt = len(self.temp)
|
||||
|
@ -58,6 +152,7 @@ class PParser(object):
|
|||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
return result
|
||||
|
||||
def getImages(self):
|
||||
result = []
|
||||
self.temp = self.flatdoc
|
||||
|
@ -69,6 +164,7 @@ class PParser(object):
|
|||
src = self.getDataTemp('img.src')[0]
|
||||
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
|
||||
return result
|
||||
|
||||
def getGlyphs(self):
|
||||
result = []
|
||||
if (self.gid != None) and (len(self.gid) > 0):
|
||||
|
@ -84,25 +180,25 @@ class PParser(object):
|
|||
return result
|
||||
|
||||
|
||||
def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
|
||||
def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
|
||||
ml = ''
|
||||
pp = PParser(gdict, flat_xml)
|
||||
pp = PParser(gdict, flat_xml, meta_array)
|
||||
ml += '<?xml version="1.0" standalone="no"?>\n'
|
||||
if (raw):
|
||||
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
|
||||
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
|
||||
else:
|
||||
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<script><![CDATA[\n'
|
||||
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
|
||||
ml += 'var dpi=%d;\n' % scaledpi
|
||||
if (counter) :
|
||||
ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
|
||||
if (counter < numfiles-1) :
|
||||
ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
|
||||
if (previd) :
|
||||
ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
|
||||
if (nextid) :
|
||||
ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
|
||||
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
|
||||
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
|
||||
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
|
||||
|
@ -115,10 +211,11 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
|
|||
ml += '</head>\n'
|
||||
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
|
||||
ml += '<div style="white-space:nowrap;">\n'
|
||||
if (counter == 0) :
|
||||
if previd == None:
|
||||
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
|
||||
else:
|
||||
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
|
||||
|
||||
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
|
||||
if (pp.gid != None):
|
||||
ml += '<defs>\n'
|
||||
|
@ -134,12 +231,14 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
|
|||
for j in xrange(0,len(pp.gid)):
|
||||
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
|
||||
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
|
||||
ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
|
||||
xpos = "%d" % (pp.pw // 3)
|
||||
ypos = "%d" % (pp.ph // 3)
|
||||
ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
|
||||
if (raw) :
|
||||
ml += '</svg>'
|
||||
else :
|
||||
ml += '</svg></a>\n'
|
||||
if (counter == numfiles - 1) :
|
||||
if nextid == None:
|
||||
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
|
||||
else :
|
||||
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
|
||||
|
|
|
@ -19,6 +19,8 @@ import getopt
|
|||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
class TpzDRMError(Exception):
|
||||
pass
|
||||
|
||||
# local support routines
|
||||
if 'calibre' in sys.modules:
|
||||
|
@ -114,7 +116,8 @@ class Dictionary(object):
|
|||
return self.stable[self.pos]
|
||||
else:
|
||||
print "Error - %d outside of string table limits" % val
|
||||
sys.exit(-1)
|
||||
raise TpzDRMError('outside or string table limits')
|
||||
# sys.exit(-1)
|
||||
def getSize(self):
|
||||
return self.size
|
||||
def getPos(self):
|
||||
|
@ -371,10 +374,34 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
(ph, pw) = getPageDim(flat_xml)
|
||||
if (ph == '-1') or (ph == '0') : ph = '11000'
|
||||
if (pw == '-1') or (pw == '0') : pw = '8500'
|
||||
meta_array['pageHeight'] = ph
|
||||
meta_array['pageWidth'] = pw
|
||||
if 'fontSize' not in meta_array.keys():
|
||||
meta_array['fontSize'] = fontsize
|
||||
|
||||
# print ' ', 'other0000.dat'
|
||||
# process other.dat for css info and for map of page files to svg images
|
||||
# this map is needed because some pages actually are made up of multiple
|
||||
# pageXXXX.xml files
|
||||
xname = os.path.join(bookDir, 'style.css')
|
||||
flat_xml = convert2xml.fromData(dict, otherFile)
|
||||
|
||||
# extract info.original.pid to get original page information
|
||||
pageIDMap = {}
|
||||
pageidnums = stylexml2css.getpageIDMap(flat_xml)
|
||||
if len(pageidnums) == 0:
|
||||
filenames = os.listdir(pageDir)
|
||||
numfiles = len(filenames)
|
||||
for k in range(numfiles):
|
||||
pageidnums.append(k)
|
||||
# create a map from page ids to list of page file nums to process for that page
|
||||
for i in range(len(pageidnums)):
|
||||
id = pageidnums[i]
|
||||
if id in pageIDMap.keys():
|
||||
pageIDMap[id].append(i)
|
||||
else:
|
||||
pageIDMap[id] = [i]
|
||||
|
||||
# now get the css info
|
||||
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
|
||||
file(xname, 'wb').write(cssstr)
|
||||
xname = os.path.join(xmlDir, 'other0000.xml')
|
||||
|
@ -414,6 +441,9 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
glyfile.close()
|
||||
print " "
|
||||
|
||||
# build up tocentries while processing html
|
||||
tocentries = ''
|
||||
|
||||
# start up the html
|
||||
htmlFileName = "book.html"
|
||||
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
|
@ -436,6 +466,77 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
# readability when rendering to the screen.
|
||||
scaledpi = 1440.0
|
||||
|
||||
filenames = os.listdir(pageDir)
|
||||
filenames = sorted(filenames)
|
||||
numfiles = len(filenames)
|
||||
|
||||
xmllst = []
|
||||
|
||||
for filename in filenames:
|
||||
# print ' ', filename
|
||||
print ".",
|
||||
fname = os.path.join(pageDir,filename)
|
||||
flat_xml = convert2xml.fromData(dict, fname)
|
||||
|
||||
# keep flat_xml for later svg processing
|
||||
xmllst.append(flat_xml)
|
||||
|
||||
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
|
||||
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
|
||||
|
||||
# first get the html
|
||||
pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
|
||||
tocentries += tocinfo
|
||||
htmlstr += pagehtml
|
||||
|
||||
# finish up the html string and output it
|
||||
htmlstr += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
||||
|
||||
print " "
|
||||
print 'Extracting Table of Contents from Amazon OCR'
|
||||
|
||||
# first create a table of contents file for the svg images
|
||||
tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
|
||||
tochtml += '<head>\n'
|
||||
tochtml += '<title>' + meta_array['Title'] + '</title>\n'
|
||||
tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
|
||||
tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
|
||||
if 'ASIN' in meta_array:
|
||||
tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
|
||||
if 'GUID' in meta_array:
|
||||
tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
|
||||
tochtml += '</head>\n'
|
||||
tochtml += '<body>\n'
|
||||
|
||||
tochtml += '<h2>Table of Contents</h2>\n'
|
||||
start = pageidnums[0]
|
||||
if (raw):
|
||||
startname = 'page%04d.svg' % start
|
||||
else:
|
||||
startname = 'page%04d.xhtml' % start
|
||||
|
||||
tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
|
||||
# build up a table of contents for the svg xhtml output
|
||||
toclst = tocentries.split('\n')
|
||||
toclst.pop()
|
||||
for entry in toclst:
|
||||
print entry
|
||||
title, pagenum = entry.split('|')
|
||||
id = pageidnums[int(pagenum)]
|
||||
if (raw):
|
||||
fname = 'page%04d.svg' % id
|
||||
else:
|
||||
fname = 'page%04d.xhtml' % id
|
||||
tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
|
||||
tochtml += '</body>\n'
|
||||
tochtml += '</html>\n'
|
||||
file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
|
||||
|
||||
|
||||
# now create index_svg.xhtml that points to all required files
|
||||
svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
|
||||
|
@ -450,50 +551,42 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
svgindex += '</head>\n'
|
||||
svgindex += '<body>\n'
|
||||
|
||||
filenames = os.listdir(pageDir)
|
||||
filenames = sorted(filenames)
|
||||
numfiles = len(filenames)
|
||||
counter = 0
|
||||
|
||||
for filename in filenames:
|
||||
# print ' ', filename
|
||||
print ".",
|
||||
|
||||
fname = os.path.join(pageDir,filename)
|
||||
flat_xml = convert2xml.fromData(dict, fname)
|
||||
|
||||
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
|
||||
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
|
||||
|
||||
# first get the html
|
||||
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
|
||||
|
||||
# now get the svg image of the page
|
||||
svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
|
||||
|
||||
print "Building svg images of each book page"
|
||||
svgindex += '<h2>List of Pages</h2>\n'
|
||||
svgindex += '<div>\n'
|
||||
idlst = sorted(pageIDMap.keys())
|
||||
numids = len(idlst)
|
||||
cnt = len(idlst)
|
||||
previd = None
|
||||
for j in range(cnt):
|
||||
pageid = idlst[j]
|
||||
if j < cnt - 1:
|
||||
nextid = idlst[j+1]
|
||||
else:
|
||||
nextid = None
|
||||
print '.',
|
||||
pagelst = pageIDMap[pageid]
|
||||
flat_svg = ''
|
||||
for page in pagelst:
|
||||
flat_svg += xmllst[page]
|
||||
svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
|
||||
if (raw) :
|
||||
pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
|
||||
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
|
||||
pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
|
||||
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
|
||||
else :
|
||||
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
|
||||
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
|
||||
|
||||
|
||||
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
|
||||
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
|
||||
previd = pageid
|
||||
pfile.write(svgxml)
|
||||
pfile.close()
|
||||
|
||||
counter += 1
|
||||
|
||||
print " "
|
||||
|
||||
# finish up the html string and output it
|
||||
htmlstr += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
||||
|
||||
# finish up the svg index string and output it
|
||||
svgindex += '</div>\n'
|
||||
svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
|
||||
svgindex += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
|
||||
|
||||
print " "
|
||||
|
||||
# build the opf file
|
||||
opfname = os.path.join(bookDir, 'book.opf')
|
||||
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
|
@ -573,7 +666,7 @@ def main(argv):
|
|||
return 1
|
||||
|
||||
raw = 0
|
||||
fixedimage = False
|
||||
fixedimage = True
|
||||
for o, a in opts:
|
||||
if o =="-h":
|
||||
usage()
|
||||
|
|
|
@ -17,7 +17,7 @@ from __future__ import with_statement
|
|||
# and many many others
|
||||
|
||||
|
||||
__version__ = '3.7'
|
||||
__version__ = '3.9'
|
||||
|
||||
class Unbuffered:
|
||||
def __init__(self, stream):
|
||||
|
@ -32,6 +32,7 @@ import sys
|
|||
import os, csv, getopt
|
||||
import string
|
||||
import re
|
||||
import traceback
|
||||
|
||||
class DrmException(Exception):
|
||||
pass
|
||||
|
@ -95,8 +96,14 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
|
|||
print "Processing Book: ", title
|
||||
filenametitle = cleanup_name(title)
|
||||
outfilename = bookname
|
||||
if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
|
||||
if len(outfilename)<=8 or len(filenametitle)<=8:
|
||||
outfilename = outfilename + "_" + filenametitle
|
||||
elif outfilename[:8] != filenametitle[:8]:
|
||||
outfilename = outfilename[:8] + "_" + filenametitle
|
||||
|
||||
# avoid excessively long file names
|
||||
if len(outfilename)>150:
|
||||
outfilename = outfilename[:150]
|
||||
|
||||
# build pid list
|
||||
md1, md2 = mb.getPIDMetaInfo()
|
||||
|
@ -128,8 +135,8 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
|
|||
zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
|
||||
mb.getHTMLZip(zipname)
|
||||
|
||||
print " Creating SVG HTMLZ Archive"
|
||||
zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
|
||||
print " Creating SVG ZIP Archive"
|
||||
zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
|
||||
mb.getSVGZip(zipname)
|
||||
|
||||
print " Creating XML ZIP Archive"
|
||||
|
|
|
@ -81,6 +81,14 @@ class DocParser(object):
|
|||
pos = foundpos + 1
|
||||
return startpos
|
||||
|
||||
# returns a vector of integers for the tagpath
|
||||
def getData(self, tagpath, pos, end):
|
||||
argres=[]
|
||||
(foundat, argt) = self.findinDoc(tagpath, pos, end)
|
||||
if (argt != None) and (len(argt) > 0) :
|
||||
argList = argt.split('|')
|
||||
argres = [ int(strval) for strval in argList]
|
||||
return argres
|
||||
|
||||
def process(self):
|
||||
|
||||
|
@ -237,7 +245,11 @@ def convert2CSS(flatxml, fontsize, ph, pw):
|
|||
|
||||
# create a document parser
|
||||
dp = DocParser(flatxml, fontsize, ph, pw)
|
||||
|
||||
csspage = dp.process()
|
||||
|
||||
return csspage
|
||||
|
||||
|
||||
def getpageIDMap(flatxml):
|
||||
dp = DocParser(flatxml, 0, 0, 0)
|
||||
pageidnumbers = dp.getData('info.original.pid', 0, -1)
|
||||
return pageidnumbers
|
||||
|
|
|
@ -140,6 +140,7 @@ class TopazBook:
|
|||
def __init__(self, filename):
|
||||
self.fo = file(filename, 'rb')
|
||||
self.outdir = tempfile.mkdtemp()
|
||||
# self.outdir = 'rawdat'
|
||||
self.bookPayloadOffset = 0
|
||||
self.bookHeaderRecords = {}
|
||||
self.bookMetadata = {}
|
||||
|
@ -370,7 +371,8 @@ class TopazBook:
|
|||
|
||||
def cleanup(self):
|
||||
if os.path.isdir(self.outdir):
|
||||
shutil.rmtree(self.outdir, True)
|
||||
pass
|
||||
# shutil.rmtree(self.outdir, True)
|
||||
|
||||
def usage(progname):
|
||||
print "Removes DRM protection from Topaz ebooks and extract the contents"
|
||||
|
@ -438,7 +440,7 @@ def main(argv=sys.argv):
|
|||
tb.getHTMLZip(zipname)
|
||||
|
||||
print " Creating SVG ZIP Archive"
|
||||
zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
|
||||
zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
|
||||
tb.getSVGZip(zipname)
|
||||
|
||||
print " Creating XML ZIP Archive"
|
||||
|
@ -450,12 +452,12 @@ def main(argv=sys.argv):
|
|||
|
||||
except TpzDRMError, e:
|
||||
print str(e)
|
||||
tb.cleanup()
|
||||
# tb.cleanup()
|
||||
return 1
|
||||
|
||||
except Exception, e:
|
||||
print str(e)
|
||||
tb.cleanup
|
||||
# tb.cleanup
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
|
Binary file not shown.
|
@ -47,7 +47,7 @@ class eRdrDeDRM(FileTypePlugin):
|
|||
Credit given to The Dark Reverser for the original standalone script.'
|
||||
supported_platforms = ['linux', 'osx', 'windows'] # Platforms this plugin will run on
|
||||
author = 'DiapDealer' # The author of this plugin
|
||||
version = (0, 0, 5) # The version number of this plugin
|
||||
version = (0, 0, 6) # The version number of this plugin
|
||||
file_types = set(['pdb']) # The file types that this plugin will be applied to
|
||||
on_import = True # Run this plugin during the import
|
||||
minimum_calibre_version = (0, 7, 55)
|
||||
|
@ -114,7 +114,7 @@ class eRdrDeDRM(FileTypePlugin):
|
|||
|
||||
print " Decoding File"
|
||||
sect = erdr2pml.Sectionizer(infile, 'PNRdPPrs')
|
||||
er = erdr2pml.EreaderProcessor(sect.loadSection, name, cc)
|
||||
er = erdr2pml.EreaderProcessor(sect, name, cc)
|
||||
|
||||
if er.getNumImages() > 0:
|
||||
print " Extracting images"
|
||||
|
|
|
@ -59,8 +59,11 @@
|
|||
# 0.18 - on Windows try PyCrypto first and OpenSSL next
|
||||
# 0.19 - Modify the interface to allow use of import
|
||||
# 0.20 - modify to allow use inside new interface for calibre plugins
|
||||
# 0.21 - Support eReader (drm) version 11.
|
||||
# - Don't reject dictionary format.
|
||||
# - Ignore sidebars for dictionaries (different format?)
|
||||
|
||||
__version__='0.20'
|
||||
__version__='0.21'
|
||||
|
||||
class Unbuffered:
|
||||
def __init__(self, stream):
|
||||
|
@ -140,12 +143,18 @@ logging.basicConfig()
|
|||
|
||||
|
||||
class Sectionizer(object):
|
||||
bkType = "Book"
|
||||
|
||||
def __init__(self, filename, ident):
|
||||
self.contents = file(filename, 'rb').read()
|
||||
self.header = self.contents[0:72]
|
||||
self.num_sections, = struct.unpack('>H', self.contents[76:78])
|
||||
# Dictionary or normal content (TODO: Not hard-coded)
|
||||
if self.header[0x3C:0x3C+8] != ident:
|
||||
raise ValueError('Invalid file format')
|
||||
if self.header[0x3C:0x3C+8] == "PDctPPrs":
|
||||
self.bkType = "Dict"
|
||||
else:
|
||||
raise ValueError('Invalid file format')
|
||||
self.sections = []
|
||||
for i in xrange(self.num_sections):
|
||||
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
|
||||
|
@ -182,15 +191,15 @@ def deXOR(text, sp, table):
|
|||
return r
|
||||
|
||||
class EreaderProcessor(object):
|
||||
def __init__(self, section_reader, username, creditcard):
|
||||
self.section_reader = section_reader
|
||||
data = section_reader(0)
|
||||
def __init__(self, sect, username, creditcard):
|
||||
self.section_reader = sect.loadSection
|
||||
data = self.section_reader(0)
|
||||
version, = struct.unpack('>H', data[0:2])
|
||||
self.version = version
|
||||
logging.info('eReader file format version %s', version)
|
||||
if version != 272 and version != 260 and version != 259:
|
||||
raise ValueError('incorrect eReader version %d (error 1)' % version)
|
||||
data = section_reader(1)
|
||||
data = self.section_reader(1)
|
||||
self.data = data
|
||||
des = Des(fixKey(data[0:8]))
|
||||
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
|
||||
|
@ -219,11 +228,17 @@ class EreaderProcessor(object):
|
|||
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
|
||||
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
|
||||
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
|
||||
# Default values
|
||||
self.num_footnote_pages = 0
|
||||
self.num_sidebar_pages = 0
|
||||
self.first_footnote_page = -1
|
||||
self.first_sidebar_page = -1
|
||||
if self.version == 272:
|
||||
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
|
||||
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
|
||||
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
|
||||
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
|
||||
if (sect.bkType == "Book"):
|
||||
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
|
||||
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
|
||||
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
|
||||
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
|
||||
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
|
||||
|
@ -239,10 +254,8 @@ class EreaderProcessor(object):
|
|||
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
|
||||
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
|
||||
else:
|
||||
self.num_footnote_pages = 0
|
||||
self.num_sidebar_pages = 0
|
||||
self.first_footnote_page = -1
|
||||
self.first_sidebar_page = -1
|
||||
# Nothing needs to be done
|
||||
pass
|
||||
# self.num_bookinfo_pages = 0
|
||||
# self.num_chapter_pages = 0
|
||||
# self.num_link_pages = 0
|
||||
|
@ -267,10 +280,14 @@ class EreaderProcessor(object):
|
|||
encrypted_key_sha = r[44:44+20]
|
||||
encrypted_key = r[64:64+8]
|
||||
elif version == 260:
|
||||
if drm_sub_version != 13:
|
||||
if drm_sub_version != 13 and drm_sub_version != 11:
|
||||
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
|
||||
encrypted_key = r[44:44+8]
|
||||
encrypted_key_sha = r[52:52+20]
|
||||
if drm_sub_version == 13:
|
||||
encrypted_key = r[44:44+8]
|
||||
encrypted_key_sha = r[52:52+20]
|
||||
else:
|
||||
encrypted_key = r[64:64+8]
|
||||
encrypted_key_sha = r[44:44+20]
|
||||
elif version == 272:
|
||||
encrypted_key = r[172:172+8]
|
||||
encrypted_key_sha = r[56:56+20]
|
||||
|
@ -356,6 +373,12 @@ class EreaderProcessor(object):
|
|||
r += fmarker
|
||||
fnote_ids = fnote_ids[id_len+4:]
|
||||
|
||||
# TODO: Handle dictionary index (?) pages - which are also marked as
|
||||
# sidebar_pages (?). For now dictionary sidebars are ignored
|
||||
# For dictionaries - record 0 is null terminated strings, followed by
|
||||
# blocks of around 62000 bytes and a final block. Not sure of the
|
||||
# encoding
|
||||
|
||||
# now handle sidebar pages
|
||||
if self.num_sidebar_pages > 0:
|
||||
r += '\n'
|
||||
|
@ -368,7 +391,7 @@ class EreaderProcessor(object):
|
|||
id_len = ord(sbar_ids[2])
|
||||
id = sbar_ids[3:3+id_len]
|
||||
smarker = '<sidebar id="%s">\n' % id
|
||||
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
|
||||
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
|
||||
smarker += '\n</sidebar>\n'
|
||||
r += smarker
|
||||
sbar_ids = sbar_ids[id_len+4:]
|
||||
|
@ -389,7 +412,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
|
|||
bookname = os.path.splitext(os.path.basename(infile))[0]
|
||||
print " Decoding File"
|
||||
sect = Sectionizer(infile, 'PNRdPPrs')
|
||||
er = EreaderProcessor(sect.loadSection, name, cc)
|
||||
er = EreaderProcessor(sect, name, cc)
|
||||
|
||||
if er.getNumImages() > 0:
|
||||
print " Extracting images"
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -24,17 +24,17 @@
|
|||
<key>CFBundleExecutable</key>
|
||||
<string>droplet</string>
|
||||
<key>CFBundleGetInfoString</key>
|
||||
<string>DeDRM 3.0, Written 2010–2011 by Apprentice Alf and others.</string>
|
||||
<string>DeDRM 3.1, Written 2010–2011 by Apprentice Alf and others.</string>
|
||||
<key>CFBundleIconFile</key>
|
||||
<string>droplet</string>
|
||||
<key>CFBundleInfoDictionaryVersion</key>
|
||||
<string>6.0</string>
|
||||
<key>CFBundleName</key>
|
||||
<string>DeDRM 3.0</string>
|
||||
<string>DeDRM 3.1</string>
|
||||
<key>CFBundlePackageType</key>
|
||||
<string>APPL</string>
|
||||
<key>CFBundleShortVersionString</key>
|
||||
<string>3.0</string>
|
||||
<string>3.1</string>
|
||||
<key>CFBundleSignature</key>
|
||||
<string>dplt</string>
|
||||
<key>LSMinimumSystemVersion</key>
|
||||
|
@ -43,14 +43,18 @@
|
|||
<true/>
|
||||
<key>WindowState</key>
|
||||
<dict>
|
||||
<key>dividerCollapsed</key>
|
||||
<false/>
|
||||
<key>eventLogLevel</key>
|
||||
<integer>-1</integer>
|
||||
<key>name</key>
|
||||
<string>ScriptWindowState</string>
|
||||
<key>positionOfDivider</key>
|
||||
<real>274</real>
|
||||
<real>460</real>
|
||||
<key>savedFrame</key>
|
||||
<string>39 376 439 476 0 0 1440 878 </string>
|
||||
<string>39 106 1316 746 0 0 1440 878 </string>
|
||||
<key>selectedTabView</key>
|
||||
<string>result</string>
|
||||
<string>event log</string>
|
||||
</dict>
|
||||
</dict>
|
||||
</plist>
|
||||
|
|
Binary file not shown.
|
@ -20,6 +20,8 @@ import getopt
|
|||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
class TpzDRMError(Exception):
|
||||
pass
|
||||
|
||||
# Get a 7 bit encoded number from string. The most
|
||||
# significant byte comes first and has the high bit (8th) set
|
||||
|
@ -138,7 +140,8 @@ class Dictionary(object):
|
|||
return self.stable[self.pos]
|
||||
else:
|
||||
print "Error - %d outside of string table limits" % val
|
||||
sys.exit(-1)
|
||||
raise TpzDRMError('outside of string table limits')
|
||||
# sys.exit(-1)
|
||||
|
||||
def getSize(self):
|
||||
return self.size
|
||||
|
@ -258,6 +261,11 @@ class PageParser(object):
|
|||
'paragraph.class' : (1, 'scalar_text', 0, 0),
|
||||
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
|
||||
'word_semantic' : (1, 'snippets', 1, 1),
|
||||
'word_semantic.type' : (1, 'scalar_text', 0, 0),
|
||||
|
@ -272,11 +280,17 @@ class PageParser(object):
|
|||
|
||||
'_span' : (1, 'snippets', 1, 0),
|
||||
'_span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'-span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'span' : (1, 'snippets', 1, 0),
|
||||
'span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'extratokens' : (1, 'snippets', 1, 0),
|
||||
'extratokens.type' : (1, 'scalar_text', 0, 0),
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
{\rtf1\ansi\ansicpg1252\cocoartf949\cocoasubrtf540
|
||||
{\rtf1\ansi\ansicpg1252\cocoartf1038\cocoasubrtf360
|
||||
{\fonttbl}
|
||||
{\colortbl;\red255\green255\blue255;}
|
||||
}
|
|
@ -59,8 +59,11 @@
|
|||
# 0.18 - on Windows try PyCrypto first and OpenSSL next
|
||||
# 0.19 - Modify the interface to allow use of import
|
||||
# 0.20 - modify to allow use inside new interface for calibre plugins
|
||||
# 0.21 - Support eReader (drm) version 11.
|
||||
# - Don't reject dictionary format.
|
||||
# - Ignore sidebars for dictionaries (different format?)
|
||||
|
||||
__version__='0.20'
|
||||
__version__='0.21'
|
||||
|
||||
class Unbuffered:
|
||||
def __init__(self, stream):
|
||||
|
@ -140,12 +143,18 @@ logging.basicConfig()
|
|||
|
||||
|
||||
class Sectionizer(object):
|
||||
bkType = "Book"
|
||||
|
||||
def __init__(self, filename, ident):
|
||||
self.contents = file(filename, 'rb').read()
|
||||
self.header = self.contents[0:72]
|
||||
self.num_sections, = struct.unpack('>H', self.contents[76:78])
|
||||
# Dictionary or normal content (TODO: Not hard-coded)
|
||||
if self.header[0x3C:0x3C+8] != ident:
|
||||
raise ValueError('Invalid file format')
|
||||
if self.header[0x3C:0x3C+8] == "PDctPPrs":
|
||||
self.bkType = "Dict"
|
||||
else:
|
||||
raise ValueError('Invalid file format')
|
||||
self.sections = []
|
||||
for i in xrange(self.num_sections):
|
||||
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
|
||||
|
@ -182,15 +191,15 @@ def deXOR(text, sp, table):
|
|||
return r
|
||||
|
||||
class EreaderProcessor(object):
|
||||
def __init__(self, section_reader, username, creditcard):
|
||||
self.section_reader = section_reader
|
||||
data = section_reader(0)
|
||||
def __init__(self, sect, username, creditcard):
|
||||
self.section_reader = sect.loadSection
|
||||
data = self.section_reader(0)
|
||||
version, = struct.unpack('>H', data[0:2])
|
||||
self.version = version
|
||||
logging.info('eReader file format version %s', version)
|
||||
if version != 272 and version != 260 and version != 259:
|
||||
raise ValueError('incorrect eReader version %d (error 1)' % version)
|
||||
data = section_reader(1)
|
||||
data = self.section_reader(1)
|
||||
self.data = data
|
||||
des = Des(fixKey(data[0:8]))
|
||||
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
|
||||
|
@ -219,11 +228,17 @@ class EreaderProcessor(object):
|
|||
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
|
||||
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
|
||||
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
|
||||
# Default values
|
||||
self.num_footnote_pages = 0
|
||||
self.num_sidebar_pages = 0
|
||||
self.first_footnote_page = -1
|
||||
self.first_sidebar_page = -1
|
||||
if self.version == 272:
|
||||
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
|
||||
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
|
||||
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
|
||||
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
|
||||
if (sect.bkType == "Book"):
|
||||
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
|
||||
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
|
||||
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
|
||||
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
|
||||
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
|
||||
|
@ -239,10 +254,8 @@ class EreaderProcessor(object):
|
|||
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
|
||||
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
|
||||
else:
|
||||
self.num_footnote_pages = 0
|
||||
self.num_sidebar_pages = 0
|
||||
self.first_footnote_page = -1
|
||||
self.first_sidebar_page = -1
|
||||
# Nothing needs to be done
|
||||
pass
|
||||
# self.num_bookinfo_pages = 0
|
||||
# self.num_chapter_pages = 0
|
||||
# self.num_link_pages = 0
|
||||
|
@ -267,10 +280,14 @@ class EreaderProcessor(object):
|
|||
encrypted_key_sha = r[44:44+20]
|
||||
encrypted_key = r[64:64+8]
|
||||
elif version == 260:
|
||||
if drm_sub_version != 13:
|
||||
if drm_sub_version != 13 and drm_sub_version != 11:
|
||||
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
|
||||
encrypted_key = r[44:44+8]
|
||||
encrypted_key_sha = r[52:52+20]
|
||||
if drm_sub_version == 13:
|
||||
encrypted_key = r[44:44+8]
|
||||
encrypted_key_sha = r[52:52+20]
|
||||
else:
|
||||
encrypted_key = r[64:64+8]
|
||||
encrypted_key_sha = r[44:44+20]
|
||||
elif version == 272:
|
||||
encrypted_key = r[172:172+8]
|
||||
encrypted_key_sha = r[56:56+20]
|
||||
|
@ -356,6 +373,12 @@ class EreaderProcessor(object):
|
|||
r += fmarker
|
||||
fnote_ids = fnote_ids[id_len+4:]
|
||||
|
||||
# TODO: Handle dictionary index (?) pages - which are also marked as
|
||||
# sidebar_pages (?). For now dictionary sidebars are ignored
|
||||
# For dictionaries - record 0 is null terminated strings, followed by
|
||||
# blocks of around 62000 bytes and a final block. Not sure of the
|
||||
# encoding
|
||||
|
||||
# now handle sidebar pages
|
||||
if self.num_sidebar_pages > 0:
|
||||
r += '\n'
|
||||
|
@ -368,7 +391,7 @@ class EreaderProcessor(object):
|
|||
id_len = ord(sbar_ids[2])
|
||||
id = sbar_ids[3:3+id_len]
|
||||
smarker = '<sidebar id="%s">\n' % id
|
||||
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
|
||||
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
|
||||
smarker += '\n</sidebar>\n'
|
||||
r += smarker
|
||||
sbar_ids = sbar_ids[id_len+4:]
|
||||
|
@ -389,7 +412,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
|
|||
bookname = os.path.splitext(os.path.basename(infile))[0]
|
||||
print " Decoding File"
|
||||
sect = Sectionizer(infile, 'PNRdPPrs')
|
||||
er = EreaderProcessor(sect.loadSection, name, cc)
|
||||
er = EreaderProcessor(sect, name, cc)
|
||||
|
||||
if er.getNumImages() > 0:
|
||||
print " Extracting images"
|
||||
|
|
|
@ -271,6 +271,9 @@ class DocParser(object):
|
|||
|
||||
pclass = self.getClass(pclass)
|
||||
|
||||
# if paragraph uses extratokens (extra glyphs) then make it fixed
|
||||
(pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
|
||||
|
||||
# build up a description of the paragraph in result and return it
|
||||
# first check for the basic - all words paragraph
|
||||
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
|
||||
|
@ -280,6 +283,7 @@ class DocParser(object):
|
|||
last = int(slast)
|
||||
|
||||
makeImage = (regtype == 'vertical') or (regtype == 'table')
|
||||
makeImage = makeImage or (extraglyphs != None)
|
||||
if self.fixedimage:
|
||||
makeImage = makeImage or (regtype == 'fixed')
|
||||
|
||||
|
@ -353,6 +357,8 @@ class DocParser(object):
|
|||
|
||||
word_class = ''
|
||||
|
||||
word_semantic_type = ''
|
||||
|
||||
while (line < end) :
|
||||
|
||||
(name, argres) = self.lineinDoc(line)
|
||||
|
@ -512,6 +518,72 @@ class DocParser(object):
|
|||
return parares
|
||||
|
||||
|
||||
def buildTOCEntry(self, pdesc) :
|
||||
parares = ''
|
||||
sep =''
|
||||
tocentry = ''
|
||||
handle_links = len(self.link_id) > 0
|
||||
|
||||
lstart = 0
|
||||
|
||||
cnt = len(pdesc)
|
||||
for j in xrange( 0, cnt) :
|
||||
|
||||
(wtype, num) = pdesc[j]
|
||||
|
||||
if wtype == 'ocr' :
|
||||
word = self.ocrtext[num]
|
||||
sep = ' '
|
||||
|
||||
if handle_links:
|
||||
link = self.link_id[num]
|
||||
if (link > 0):
|
||||
linktype = self.link_type[link-1]
|
||||
title = self.link_title[link-1]
|
||||
title = title.rstrip('. ')
|
||||
alt_title = parares[lstart:]
|
||||
alt_title = alt_title.strip()
|
||||
# now strip off the actual printed page number
|
||||
alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
|
||||
alt_title = alt_title.rstrip('. ')
|
||||
# skip over any external links - can't have them in a books toc
|
||||
if linktype == 'external' :
|
||||
title = ''
|
||||
alt_title = ''
|
||||
linkpage = ''
|
||||
else :
|
||||
if len(self.link_page) >= link :
|
||||
ptarget = self.link_page[link-1] - 1
|
||||
linkpage = '%04d' % ptarget
|
||||
else :
|
||||
# just link to the current page
|
||||
linkpage = self.id[4:]
|
||||
if len(alt_title) >= len(title):
|
||||
title = alt_title
|
||||
if title != '' and linkpage != '':
|
||||
tocentry += title + '|' + linkpage + '\n'
|
||||
lstart = len(parares)
|
||||
if word == '_link_' : word = ''
|
||||
elif (link < 0) :
|
||||
if word == '_link_' : word = ''
|
||||
|
||||
if word == '_lb_':
|
||||
word = ''
|
||||
sep = ''
|
||||
|
||||
if num in self.dehyphen_rootid :
|
||||
word = word[0:-1]
|
||||
sep = ''
|
||||
|
||||
parares += word + sep
|
||||
|
||||
else :
|
||||
continue
|
||||
|
||||
return tocentry
|
||||
|
||||
|
||||
|
||||
|
||||
# walk the document tree collecting the information needed
|
||||
# to build an html page using the ocrText
|
||||
|
@ -519,6 +591,7 @@ class DocParser(object):
|
|||
def process(self):
|
||||
|
||||
htmlpage = ''
|
||||
tocinfo = ''
|
||||
|
||||
# get the ocr text
|
||||
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
|
||||
|
@ -644,9 +717,9 @@ class DocParser(object):
|
|||
ptype = 'end'
|
||||
first_para_continued = False
|
||||
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||
tocinfo += self.buildTOCEntry(pdesc)
|
||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||
|
||||
|
||||
elif (regtype == 'vertical') or (regtype == 'table') :
|
||||
ptype = 'full'
|
||||
if inGroup:
|
||||
|
@ -704,12 +777,11 @@ class DocParser(object):
|
|||
htmlpage = htmlpage[0:-4]
|
||||
last_para_continued = False
|
||||
|
||||
return htmlpage
|
||||
|
||||
return htmlpage, tocinfo
|
||||
|
||||
|
||||
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
|
||||
# create a document parser
|
||||
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
|
||||
htmlpage = dp.process()
|
||||
return htmlpage
|
||||
htmlpage, tocinfo = dp.process()
|
||||
return htmlpage, tocinfo
|
||||
|
|
|
@ -10,17 +10,94 @@ from struct import unpack
|
|||
|
||||
|
||||
class PParser(object):
|
||||
def __init__(self, gd, flatxml):
|
||||
def __init__(self, gd, flatxml, meta_array):
|
||||
self.gd = gd
|
||||
self.flatdoc = flatxml.split('\n')
|
||||
self.docSize = len(self.flatdoc)
|
||||
self.temp = []
|
||||
foo = self.getData('page.h') or self.getData('book.h')
|
||||
self.ph = foo[0]
|
||||
foo = self.getData('page.w') or self.getData('book.w')
|
||||
self.pw = foo[0]
|
||||
self.gx = self.getData('info.glyph.x')
|
||||
self.gy = self.getData('info.glyph.y')
|
||||
self.gid = self.getData('info.glyph.glyphID')
|
||||
|
||||
self.ph = -1
|
||||
self.pw = -1
|
||||
startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
|
||||
for p in startpos:
|
||||
(name, argres) = self.lineinDoc(p)
|
||||
self.ph = max(self.ph, int(argres))
|
||||
startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
|
||||
for p in startpos:
|
||||
(name, argres) = self.lineinDoc(p)
|
||||
self.pw = max(self.pw, int(argres))
|
||||
|
||||
if self.ph <= 0:
|
||||
self.ph = int(meta_array.get('pageHeight', '11000'))
|
||||
if self.pw <= 0:
|
||||
self.pw = int(meta_array.get('pageWidth', '8500'))
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.x')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.x', p)
|
||||
res.extend(argres)
|
||||
self.gx = res
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.y')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.y', p)
|
||||
res.extend(argres)
|
||||
self.gy = res
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.glyphID')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.glyphID', p)
|
||||
res.extend(argres)
|
||||
self.gid = res
|
||||
|
||||
|
||||
# return tag at line pos in document
|
||||
def lineinDoc(self, pos) :
|
||||
if (pos >= 0) and (pos < self.docSize) :
|
||||
item = self.flatdoc[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
return name, argres
|
||||
|
||||
# find tag in doc if within pos to end inclusive
|
||||
def findinDoc(self, tagpath, pos, end) :
|
||||
result = None
|
||||
if end == -1 :
|
||||
end = self.docSize
|
||||
else:
|
||||
end = min(self.docSize, end)
|
||||
foundat = -1
|
||||
for j in xrange(pos, end):
|
||||
item = self.flatdoc[j]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
if name.endswith(tagpath) :
|
||||
result = argres
|
||||
foundat = j
|
||||
break
|
||||
return foundat, result
|
||||
|
||||
# return list of start positions for the tagpath
|
||||
def posinDoc(self, tagpath):
|
||||
startpos = []
|
||||
pos = 0
|
||||
res = ""
|
||||
while res != None :
|
||||
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
|
||||
if res != None :
|
||||
startpos.append(foundpos)
|
||||
pos = foundpos + 1
|
||||
return startpos
|
||||
|
||||
def getData(self, path):
|
||||
result = None
|
||||
cnt = len(self.flatdoc)
|
||||
|
@ -39,6 +116,23 @@ class PParser(object):
|
|||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
return result
|
||||
|
||||
def getDataatPos(self, path, pos):
|
||||
result = None
|
||||
item = self.flatdoc[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argt) = item.split('=')
|
||||
argres = argt.split('|')
|
||||
else:
|
||||
name = item
|
||||
argres = []
|
||||
if (len(argres) > 0) :
|
||||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
if (name.endswith(path)):
|
||||
result = argres
|
||||
return result
|
||||
|
||||
def getDataTemp(self, path):
|
||||
result = None
|
||||
cnt = len(self.temp)
|
||||
|
@ -58,6 +152,7 @@ class PParser(object):
|
|||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
return result
|
||||
|
||||
def getImages(self):
|
||||
result = []
|
||||
self.temp = self.flatdoc
|
||||
|
@ -69,6 +164,7 @@ class PParser(object):
|
|||
src = self.getDataTemp('img.src')[0]
|
||||
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
|
||||
return result
|
||||
|
||||
def getGlyphs(self):
|
||||
result = []
|
||||
if (self.gid != None) and (len(self.gid) > 0):
|
||||
|
@ -84,25 +180,25 @@ class PParser(object):
|
|||
return result
|
||||
|
||||
|
||||
def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
|
||||
def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
|
||||
ml = ''
|
||||
pp = PParser(gdict, flat_xml)
|
||||
pp = PParser(gdict, flat_xml, meta_array)
|
||||
ml += '<?xml version="1.0" standalone="no"?>\n'
|
||||
if (raw):
|
||||
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
|
||||
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
|
||||
else:
|
||||
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<script><![CDATA[\n'
|
||||
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
|
||||
ml += 'var dpi=%d;\n' % scaledpi
|
||||
if (counter) :
|
||||
ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
|
||||
if (counter < numfiles-1) :
|
||||
ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
|
||||
if (previd) :
|
||||
ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
|
||||
if (nextid) :
|
||||
ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
|
||||
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
|
||||
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
|
||||
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
|
||||
|
@ -115,10 +211,11 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
|
|||
ml += '</head>\n'
|
||||
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
|
||||
ml += '<div style="white-space:nowrap;">\n'
|
||||
if (counter == 0) :
|
||||
if previd == None:
|
||||
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
|
||||
else:
|
||||
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
|
||||
|
||||
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
|
||||
if (pp.gid != None):
|
||||
ml += '<defs>\n'
|
||||
|
@ -134,12 +231,14 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
|
|||
for j in xrange(0,len(pp.gid)):
|
||||
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
|
||||
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
|
||||
ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
|
||||
xpos = "%d" % (pp.pw // 3)
|
||||
ypos = "%d" % (pp.ph // 3)
|
||||
ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
|
||||
if (raw) :
|
||||
ml += '</svg>'
|
||||
else :
|
||||
ml += '</svg></a>\n'
|
||||
if (counter == numfiles - 1) :
|
||||
if nextid == None:
|
||||
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
|
||||
else :
|
||||
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
|
||||
|
|
|
@ -19,6 +19,8 @@ import getopt
|
|||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
class TpzDRMError(Exception):
|
||||
pass
|
||||
|
||||
# local support routines
|
||||
if 'calibre' in sys.modules:
|
||||
|
@ -114,7 +116,8 @@ class Dictionary(object):
|
|||
return self.stable[self.pos]
|
||||
else:
|
||||
print "Error - %d outside of string table limits" % val
|
||||
sys.exit(-1)
|
||||
raise TpzDRMError('outside or string table limits')
|
||||
# sys.exit(-1)
|
||||
def getSize(self):
|
||||
return self.size
|
||||
def getPos(self):
|
||||
|
@ -371,10 +374,34 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
(ph, pw) = getPageDim(flat_xml)
|
||||
if (ph == '-1') or (ph == '0') : ph = '11000'
|
||||
if (pw == '-1') or (pw == '0') : pw = '8500'
|
||||
meta_array['pageHeight'] = ph
|
||||
meta_array['pageWidth'] = pw
|
||||
if 'fontSize' not in meta_array.keys():
|
||||
meta_array['fontSize'] = fontsize
|
||||
|
||||
# print ' ', 'other0000.dat'
|
||||
# process other.dat for css info and for map of page files to svg images
|
||||
# this map is needed because some pages actually are made up of multiple
|
||||
# pageXXXX.xml files
|
||||
xname = os.path.join(bookDir, 'style.css')
|
||||
flat_xml = convert2xml.fromData(dict, otherFile)
|
||||
|
||||
# extract info.original.pid to get original page information
|
||||
pageIDMap = {}
|
||||
pageidnums = stylexml2css.getpageIDMap(flat_xml)
|
||||
if len(pageidnums) == 0:
|
||||
filenames = os.listdir(pageDir)
|
||||
numfiles = len(filenames)
|
||||
for k in range(numfiles):
|
||||
pageidnums.append(k)
|
||||
# create a map from page ids to list of page file nums to process for that page
|
||||
for i in range(len(pageidnums)):
|
||||
id = pageidnums[i]
|
||||
if id in pageIDMap.keys():
|
||||
pageIDMap[id].append(i)
|
||||
else:
|
||||
pageIDMap[id] = [i]
|
||||
|
||||
# now get the css info
|
||||
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
|
||||
file(xname, 'wb').write(cssstr)
|
||||
xname = os.path.join(xmlDir, 'other0000.xml')
|
||||
|
@ -414,6 +441,9 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
glyfile.close()
|
||||
print " "
|
||||
|
||||
# build up tocentries while processing html
|
||||
tocentries = ''
|
||||
|
||||
# start up the html
|
||||
htmlFileName = "book.html"
|
||||
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
|
@ -436,6 +466,77 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
# readability when rendering to the screen.
|
||||
scaledpi = 1440.0
|
||||
|
||||
filenames = os.listdir(pageDir)
|
||||
filenames = sorted(filenames)
|
||||
numfiles = len(filenames)
|
||||
|
||||
xmllst = []
|
||||
|
||||
for filename in filenames:
|
||||
# print ' ', filename
|
||||
print ".",
|
||||
fname = os.path.join(pageDir,filename)
|
||||
flat_xml = convert2xml.fromData(dict, fname)
|
||||
|
||||
# keep flat_xml for later svg processing
|
||||
xmllst.append(flat_xml)
|
||||
|
||||
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
|
||||
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
|
||||
|
||||
# first get the html
|
||||
pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
|
||||
tocentries += tocinfo
|
||||
htmlstr += pagehtml
|
||||
|
||||
# finish up the html string and output it
|
||||
htmlstr += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
||||
|
||||
print " "
|
||||
print 'Extracting Table of Contents from Amazon OCR'
|
||||
|
||||
# first create a table of contents file for the svg images
|
||||
tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
|
||||
tochtml += '<head>\n'
|
||||
tochtml += '<title>' + meta_array['Title'] + '</title>\n'
|
||||
tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
|
||||
tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
|
||||
if 'ASIN' in meta_array:
|
||||
tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
|
||||
if 'GUID' in meta_array:
|
||||
tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
|
||||
tochtml += '</head>\n'
|
||||
tochtml += '<body>\n'
|
||||
|
||||
tochtml += '<h2>Table of Contents</h2>\n'
|
||||
start = pageidnums[0]
|
||||
if (raw):
|
||||
startname = 'page%04d.svg' % start
|
||||
else:
|
||||
startname = 'page%04d.xhtml' % start
|
||||
|
||||
tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
|
||||
# build up a table of contents for the svg xhtml output
|
||||
toclst = tocentries.split('\n')
|
||||
toclst.pop()
|
||||
for entry in toclst:
|
||||
print entry
|
||||
title, pagenum = entry.split('|')
|
||||
id = pageidnums[int(pagenum)]
|
||||
if (raw):
|
||||
fname = 'page%04d.svg' % id
|
||||
else:
|
||||
fname = 'page%04d.xhtml' % id
|
||||
tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
|
||||
tochtml += '</body>\n'
|
||||
tochtml += '</html>\n'
|
||||
file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
|
||||
|
||||
|
||||
# now create index_svg.xhtml that points to all required files
|
||||
svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
|
||||
|
@ -450,50 +551,42 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
svgindex += '</head>\n'
|
||||
svgindex += '<body>\n'
|
||||
|
||||
filenames = os.listdir(pageDir)
|
||||
filenames = sorted(filenames)
|
||||
numfiles = len(filenames)
|
||||
counter = 0
|
||||
|
||||
for filename in filenames:
|
||||
# print ' ', filename
|
||||
print ".",
|
||||
|
||||
fname = os.path.join(pageDir,filename)
|
||||
flat_xml = convert2xml.fromData(dict, fname)
|
||||
|
||||
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
|
||||
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
|
||||
|
||||
# first get the html
|
||||
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
|
||||
|
||||
# now get the svg image of the page
|
||||
svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
|
||||
|
||||
print "Building svg images of each book page"
|
||||
svgindex += '<h2>List of Pages</h2>\n'
|
||||
svgindex += '<div>\n'
|
||||
idlst = sorted(pageIDMap.keys())
|
||||
numids = len(idlst)
|
||||
cnt = len(idlst)
|
||||
previd = None
|
||||
for j in range(cnt):
|
||||
pageid = idlst[j]
|
||||
if j < cnt - 1:
|
||||
nextid = idlst[j+1]
|
||||
else:
|
||||
nextid = None
|
||||
print '.',
|
||||
pagelst = pageIDMap[pageid]
|
||||
flat_svg = ''
|
||||
for page in pagelst:
|
||||
flat_svg += xmllst[page]
|
||||
svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
|
||||
if (raw) :
|
||||
pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
|
||||
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
|
||||
pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
|
||||
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
|
||||
else :
|
||||
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
|
||||
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
|
||||
|
||||
|
||||
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
|
||||
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
|
||||
previd = pageid
|
||||
pfile.write(svgxml)
|
||||
pfile.close()
|
||||
|
||||
counter += 1
|
||||
|
||||
print " "
|
||||
|
||||
# finish up the html string and output it
|
||||
htmlstr += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
||||
|
||||
# finish up the svg index string and output it
|
||||
svgindex += '</div>\n'
|
||||
svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
|
||||
svgindex += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
|
||||
|
||||
print " "
|
||||
|
||||
# build the opf file
|
||||
opfname = os.path.join(bookDir, 'book.opf')
|
||||
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
|
@ -573,7 +666,7 @@ def main(argv):
|
|||
return 1
|
||||
|
||||
raw = 0
|
||||
fixedimage = False
|
||||
fixedimage = True
|
||||
for o, a in opts:
|
||||
if o =="-h":
|
||||
usage()
|
||||
|
|
|
@ -17,7 +17,7 @@ from __future__ import with_statement
|
|||
# and many many others
|
||||
|
||||
|
||||
__version__ = '3.7'
|
||||
__version__ = '3.9'
|
||||
|
||||
class Unbuffered:
|
||||
def __init__(self, stream):
|
||||
|
@ -32,6 +32,7 @@ import sys
|
|||
import os, csv, getopt
|
||||
import string
|
||||
import re
|
||||
import traceback
|
||||
|
||||
class DrmException(Exception):
|
||||
pass
|
||||
|
@ -95,8 +96,14 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
|
|||
print "Processing Book: ", title
|
||||
filenametitle = cleanup_name(title)
|
||||
outfilename = bookname
|
||||
if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
|
||||
if len(outfilename)<=8 or len(filenametitle)<=8:
|
||||
outfilename = outfilename + "_" + filenametitle
|
||||
elif outfilename[:8] != filenametitle[:8]:
|
||||
outfilename = outfilename[:8] + "_" + filenametitle
|
||||
|
||||
# avoid excessively long file names
|
||||
if len(outfilename)>150:
|
||||
outfilename = outfilename[:150]
|
||||
|
||||
# build pid list
|
||||
md1, md2 = mb.getPIDMetaInfo()
|
||||
|
@ -128,8 +135,8 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
|
|||
zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
|
||||
mb.getHTMLZip(zipname)
|
||||
|
||||
print " Creating SVG HTMLZ Archive"
|
||||
zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
|
||||
print " Creating SVG ZIP Archive"
|
||||
zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
|
||||
mb.getSVGZip(zipname)
|
||||
|
||||
print " Creating XML ZIP Archive"
|
||||
|
|
|
@ -81,6 +81,14 @@ class DocParser(object):
|
|||
pos = foundpos + 1
|
||||
return startpos
|
||||
|
||||
# returns a vector of integers for the tagpath
|
||||
def getData(self, tagpath, pos, end):
|
||||
argres=[]
|
||||
(foundat, argt) = self.findinDoc(tagpath, pos, end)
|
||||
if (argt != None) and (len(argt) > 0) :
|
||||
argList = argt.split('|')
|
||||
argres = [ int(strval) for strval in argList]
|
||||
return argres
|
||||
|
||||
def process(self):
|
||||
|
||||
|
@ -237,7 +245,11 @@ def convert2CSS(flatxml, fontsize, ph, pw):
|
|||
|
||||
# create a document parser
|
||||
dp = DocParser(flatxml, fontsize, ph, pw)
|
||||
|
||||
csspage = dp.process()
|
||||
|
||||
return csspage
|
||||
|
||||
|
||||
def getpageIDMap(flatxml):
|
||||
dp = DocParser(flatxml, 0, 0, 0)
|
||||
pageidnumbers = dp.getData('info.original.pid', 0, -1)
|
||||
return pageidnumbers
|
||||
|
|
|
@ -140,6 +140,7 @@ class TopazBook:
|
|||
def __init__(self, filename):
|
||||
self.fo = file(filename, 'rb')
|
||||
self.outdir = tempfile.mkdtemp()
|
||||
# self.outdir = 'rawdat'
|
||||
self.bookPayloadOffset = 0
|
||||
self.bookHeaderRecords = {}
|
||||
self.bookMetadata = {}
|
||||
|
@ -370,7 +371,8 @@ class TopazBook:
|
|||
|
||||
def cleanup(self):
|
||||
if os.path.isdir(self.outdir):
|
||||
shutil.rmtree(self.outdir, True)
|
||||
pass
|
||||
# shutil.rmtree(self.outdir, True)
|
||||
|
||||
def usage(progname):
|
||||
print "Removes DRM protection from Topaz ebooks and extract the contents"
|
||||
|
@ -438,7 +440,7 @@ def main(argv=sys.argv):
|
|||
tb.getHTMLZip(zipname)
|
||||
|
||||
print " Creating SVG ZIP Archive"
|
||||
zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
|
||||
zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
|
||||
tb.getSVGZip(zipname)
|
||||
|
||||
print " Creating XML ZIP Archive"
|
||||
|
@ -450,12 +452,12 @@ def main(argv=sys.argv):
|
|||
|
||||
except TpzDRMError, e:
|
||||
print str(e)
|
||||
tb.cleanup()
|
||||
# tb.cleanup()
|
||||
return 1
|
||||
|
||||
except Exception, e:
|
||||
print str(e)
|
||||
tb.cleanup
|
||||
# tb.cleanup
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
|
|
@ -20,6 +20,8 @@ import getopt
|
|||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
class TpzDRMError(Exception):
|
||||
pass
|
||||
|
||||
# Get a 7 bit encoded number from string. The most
|
||||
# significant byte comes first and has the high bit (8th) set
|
||||
|
@ -138,7 +140,8 @@ class Dictionary(object):
|
|||
return self.stable[self.pos]
|
||||
else:
|
||||
print "Error - %d outside of string table limits" % val
|
||||
sys.exit(-1)
|
||||
raise TpzDRMError('outside of string table limits')
|
||||
# sys.exit(-1)
|
||||
|
||||
def getSize(self):
|
||||
return self.size
|
||||
|
@ -258,6 +261,11 @@ class PageParser(object):
|
|||
'paragraph.class' : (1, 'scalar_text', 0, 0),
|
||||
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
|
||||
'word_semantic' : (1, 'snippets', 1, 1),
|
||||
'word_semantic.type' : (1, 'scalar_text', 0, 0),
|
||||
|
@ -272,11 +280,17 @@ class PageParser(object):
|
|||
|
||||
'_span' : (1, 'snippets', 1, 0),
|
||||
'_span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'-span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'span' : (1, 'snippets', 1, 0),
|
||||
'span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'extratokens' : (1, 'snippets', 1, 0),
|
||||
'extratokens.type' : (1, 'scalar_text', 0, 0),
|
||||
|
|
|
@ -59,8 +59,11 @@
|
|||
# 0.18 - on Windows try PyCrypto first and OpenSSL next
|
||||
# 0.19 - Modify the interface to allow use of import
|
||||
# 0.20 - modify to allow use inside new interface for calibre plugins
|
||||
# 0.21 - Support eReader (drm) version 11.
|
||||
# - Don't reject dictionary format.
|
||||
# - Ignore sidebars for dictionaries (different format?)
|
||||
|
||||
__version__='0.20'
|
||||
__version__='0.21'
|
||||
|
||||
class Unbuffered:
|
||||
def __init__(self, stream):
|
||||
|
@ -140,12 +143,18 @@ logging.basicConfig()
|
|||
|
||||
|
||||
class Sectionizer(object):
|
||||
bkType = "Book"
|
||||
|
||||
def __init__(self, filename, ident):
|
||||
self.contents = file(filename, 'rb').read()
|
||||
self.header = self.contents[0:72]
|
||||
self.num_sections, = struct.unpack('>H', self.contents[76:78])
|
||||
# Dictionary or normal content (TODO: Not hard-coded)
|
||||
if self.header[0x3C:0x3C+8] != ident:
|
||||
raise ValueError('Invalid file format')
|
||||
if self.header[0x3C:0x3C+8] == "PDctPPrs":
|
||||
self.bkType = "Dict"
|
||||
else:
|
||||
raise ValueError('Invalid file format')
|
||||
self.sections = []
|
||||
for i in xrange(self.num_sections):
|
||||
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
|
||||
|
@ -182,15 +191,15 @@ def deXOR(text, sp, table):
|
|||
return r
|
||||
|
||||
class EreaderProcessor(object):
|
||||
def __init__(self, section_reader, username, creditcard):
|
||||
self.section_reader = section_reader
|
||||
data = section_reader(0)
|
||||
def __init__(self, sect, username, creditcard):
|
||||
self.section_reader = sect.loadSection
|
||||
data = self.section_reader(0)
|
||||
version, = struct.unpack('>H', data[0:2])
|
||||
self.version = version
|
||||
logging.info('eReader file format version %s', version)
|
||||
if version != 272 and version != 260 and version != 259:
|
||||
raise ValueError('incorrect eReader version %d (error 1)' % version)
|
||||
data = section_reader(1)
|
||||
data = self.section_reader(1)
|
||||
self.data = data
|
||||
des = Des(fixKey(data[0:8]))
|
||||
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
|
||||
|
@ -219,11 +228,17 @@ class EreaderProcessor(object):
|
|||
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
|
||||
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
|
||||
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
|
||||
# Default values
|
||||
self.num_footnote_pages = 0
|
||||
self.num_sidebar_pages = 0
|
||||
self.first_footnote_page = -1
|
||||
self.first_sidebar_page = -1
|
||||
if self.version == 272:
|
||||
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
|
||||
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
|
||||
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
|
||||
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
|
||||
if (sect.bkType == "Book"):
|
||||
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
|
||||
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
|
||||
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
|
||||
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
|
||||
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
|
||||
|
@ -239,10 +254,8 @@ class EreaderProcessor(object):
|
|||
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
|
||||
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
|
||||
else:
|
||||
self.num_footnote_pages = 0
|
||||
self.num_sidebar_pages = 0
|
||||
self.first_footnote_page = -1
|
||||
self.first_sidebar_page = -1
|
||||
# Nothing needs to be done
|
||||
pass
|
||||
# self.num_bookinfo_pages = 0
|
||||
# self.num_chapter_pages = 0
|
||||
# self.num_link_pages = 0
|
||||
|
@ -267,10 +280,14 @@ class EreaderProcessor(object):
|
|||
encrypted_key_sha = r[44:44+20]
|
||||
encrypted_key = r[64:64+8]
|
||||
elif version == 260:
|
||||
if drm_sub_version != 13:
|
||||
if drm_sub_version != 13 and drm_sub_version != 11:
|
||||
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
|
||||
encrypted_key = r[44:44+8]
|
||||
encrypted_key_sha = r[52:52+20]
|
||||
if drm_sub_version == 13:
|
||||
encrypted_key = r[44:44+8]
|
||||
encrypted_key_sha = r[52:52+20]
|
||||
else:
|
||||
encrypted_key = r[64:64+8]
|
||||
encrypted_key_sha = r[44:44+20]
|
||||
elif version == 272:
|
||||
encrypted_key = r[172:172+8]
|
||||
encrypted_key_sha = r[56:56+20]
|
||||
|
@ -356,6 +373,12 @@ class EreaderProcessor(object):
|
|||
r += fmarker
|
||||
fnote_ids = fnote_ids[id_len+4:]
|
||||
|
||||
# TODO: Handle dictionary index (?) pages - which are also marked as
|
||||
# sidebar_pages (?). For now dictionary sidebars are ignored
|
||||
# For dictionaries - record 0 is null terminated strings, followed by
|
||||
# blocks of around 62000 bytes and a final block. Not sure of the
|
||||
# encoding
|
||||
|
||||
# now handle sidebar pages
|
||||
if self.num_sidebar_pages > 0:
|
||||
r += '\n'
|
||||
|
@ -368,7 +391,7 @@ class EreaderProcessor(object):
|
|||
id_len = ord(sbar_ids[2])
|
||||
id = sbar_ids[3:3+id_len]
|
||||
smarker = '<sidebar id="%s">\n' % id
|
||||
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
|
||||
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
|
||||
smarker += '\n</sidebar>\n'
|
||||
r += smarker
|
||||
sbar_ids = sbar_ids[id_len+4:]
|
||||
|
@ -389,7 +412,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
|
|||
bookname = os.path.splitext(os.path.basename(infile))[0]
|
||||
print " Decoding File"
|
||||
sect = Sectionizer(infile, 'PNRdPPrs')
|
||||
er = EreaderProcessor(sect.loadSection, name, cc)
|
||||
er = EreaderProcessor(sect, name, cc)
|
||||
|
||||
if er.getNumImages() > 0:
|
||||
print " Extracting images"
|
||||
|
|
|
@ -271,6 +271,9 @@ class DocParser(object):
|
|||
|
||||
pclass = self.getClass(pclass)
|
||||
|
||||
# if paragraph uses extratokens (extra glyphs) then make it fixed
|
||||
(pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
|
||||
|
||||
# build up a description of the paragraph in result and return it
|
||||
# first check for the basic - all words paragraph
|
||||
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
|
||||
|
@ -280,6 +283,7 @@ class DocParser(object):
|
|||
last = int(slast)
|
||||
|
||||
makeImage = (regtype == 'vertical') or (regtype == 'table')
|
||||
makeImage = makeImage or (extraglyphs != None)
|
||||
if self.fixedimage:
|
||||
makeImage = makeImage or (regtype == 'fixed')
|
||||
|
||||
|
@ -353,6 +357,8 @@ class DocParser(object):
|
|||
|
||||
word_class = ''
|
||||
|
||||
word_semantic_type = ''
|
||||
|
||||
while (line < end) :
|
||||
|
||||
(name, argres) = self.lineinDoc(line)
|
||||
|
@ -512,6 +518,72 @@ class DocParser(object):
|
|||
return parares
|
||||
|
||||
|
||||
def buildTOCEntry(self, pdesc) :
|
||||
parares = ''
|
||||
sep =''
|
||||
tocentry = ''
|
||||
handle_links = len(self.link_id) > 0
|
||||
|
||||
lstart = 0
|
||||
|
||||
cnt = len(pdesc)
|
||||
for j in xrange( 0, cnt) :
|
||||
|
||||
(wtype, num) = pdesc[j]
|
||||
|
||||
if wtype == 'ocr' :
|
||||
word = self.ocrtext[num]
|
||||
sep = ' '
|
||||
|
||||
if handle_links:
|
||||
link = self.link_id[num]
|
||||
if (link > 0):
|
||||
linktype = self.link_type[link-1]
|
||||
title = self.link_title[link-1]
|
||||
title = title.rstrip('. ')
|
||||
alt_title = parares[lstart:]
|
||||
alt_title = alt_title.strip()
|
||||
# now strip off the actual printed page number
|
||||
alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
|
||||
alt_title = alt_title.rstrip('. ')
|
||||
# skip over any external links - can't have them in a books toc
|
||||
if linktype == 'external' :
|
||||
title = ''
|
||||
alt_title = ''
|
||||
linkpage = ''
|
||||
else :
|
||||
if len(self.link_page) >= link :
|
||||
ptarget = self.link_page[link-1] - 1
|
||||
linkpage = '%04d' % ptarget
|
||||
else :
|
||||
# just link to the current page
|
||||
linkpage = self.id[4:]
|
||||
if len(alt_title) >= len(title):
|
||||
title = alt_title
|
||||
if title != '' and linkpage != '':
|
||||
tocentry += title + '|' + linkpage + '\n'
|
||||
lstart = len(parares)
|
||||
if word == '_link_' : word = ''
|
||||
elif (link < 0) :
|
||||
if word == '_link_' : word = ''
|
||||
|
||||
if word == '_lb_':
|
||||
word = ''
|
||||
sep = ''
|
||||
|
||||
if num in self.dehyphen_rootid :
|
||||
word = word[0:-1]
|
||||
sep = ''
|
||||
|
||||
parares += word + sep
|
||||
|
||||
else :
|
||||
continue
|
||||
|
||||
return tocentry
|
||||
|
||||
|
||||
|
||||
|
||||
# walk the document tree collecting the information needed
|
||||
# to build an html page using the ocrText
|
||||
|
@ -519,6 +591,7 @@ class DocParser(object):
|
|||
def process(self):
|
||||
|
||||
htmlpage = ''
|
||||
tocinfo = ''
|
||||
|
||||
# get the ocr text
|
||||
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
|
||||
|
@ -644,9 +717,9 @@ class DocParser(object):
|
|||
ptype = 'end'
|
||||
first_para_continued = False
|
||||
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||
tocinfo += self.buildTOCEntry(pdesc)
|
||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||
|
||||
|
||||
elif (regtype == 'vertical') or (regtype == 'table') :
|
||||
ptype = 'full'
|
||||
if inGroup:
|
||||
|
@ -704,12 +777,11 @@ class DocParser(object):
|
|||
htmlpage = htmlpage[0:-4]
|
||||
last_para_continued = False
|
||||
|
||||
return htmlpage
|
||||
|
||||
return htmlpage, tocinfo
|
||||
|
||||
|
||||
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
|
||||
# create a document parser
|
||||
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
|
||||
htmlpage = dp.process()
|
||||
return htmlpage
|
||||
htmlpage, tocinfo = dp.process()
|
||||
return htmlpage, tocinfo
|
||||
|
|
|
@ -10,17 +10,94 @@ from struct import unpack
|
|||
|
||||
|
||||
class PParser(object):
|
||||
def __init__(self, gd, flatxml):
|
||||
def __init__(self, gd, flatxml, meta_array):
|
||||
self.gd = gd
|
||||
self.flatdoc = flatxml.split('\n')
|
||||
self.docSize = len(self.flatdoc)
|
||||
self.temp = []
|
||||
foo = self.getData('page.h') or self.getData('book.h')
|
||||
self.ph = foo[0]
|
||||
foo = self.getData('page.w') or self.getData('book.w')
|
||||
self.pw = foo[0]
|
||||
self.gx = self.getData('info.glyph.x')
|
||||
self.gy = self.getData('info.glyph.y')
|
||||
self.gid = self.getData('info.glyph.glyphID')
|
||||
|
||||
self.ph = -1
|
||||
self.pw = -1
|
||||
startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
|
||||
for p in startpos:
|
||||
(name, argres) = self.lineinDoc(p)
|
||||
self.ph = max(self.ph, int(argres))
|
||||
startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
|
||||
for p in startpos:
|
||||
(name, argres) = self.lineinDoc(p)
|
||||
self.pw = max(self.pw, int(argres))
|
||||
|
||||
if self.ph <= 0:
|
||||
self.ph = int(meta_array.get('pageHeight', '11000'))
|
||||
if self.pw <= 0:
|
||||
self.pw = int(meta_array.get('pageWidth', '8500'))
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.x')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.x', p)
|
||||
res.extend(argres)
|
||||
self.gx = res
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.y')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.y', p)
|
||||
res.extend(argres)
|
||||
self.gy = res
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.glyphID')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.glyphID', p)
|
||||
res.extend(argres)
|
||||
self.gid = res
|
||||
|
||||
|
||||
# return tag at line pos in document
|
||||
def lineinDoc(self, pos) :
|
||||
if (pos >= 0) and (pos < self.docSize) :
|
||||
item = self.flatdoc[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
return name, argres
|
||||
|
||||
# find tag in doc if within pos to end inclusive
|
||||
def findinDoc(self, tagpath, pos, end) :
|
||||
result = None
|
||||
if end == -1 :
|
||||
end = self.docSize
|
||||
else:
|
||||
end = min(self.docSize, end)
|
||||
foundat = -1
|
||||
for j in xrange(pos, end):
|
||||
item = self.flatdoc[j]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
if name.endswith(tagpath) :
|
||||
result = argres
|
||||
foundat = j
|
||||
break
|
||||
return foundat, result
|
||||
|
||||
# return list of start positions for the tagpath
|
||||
def posinDoc(self, tagpath):
|
||||
startpos = []
|
||||
pos = 0
|
||||
res = ""
|
||||
while res != None :
|
||||
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
|
||||
if res != None :
|
||||
startpos.append(foundpos)
|
||||
pos = foundpos + 1
|
||||
return startpos
|
||||
|
||||
def getData(self, path):
|
||||
result = None
|
||||
cnt = len(self.flatdoc)
|
||||
|
@ -39,6 +116,23 @@ class PParser(object):
|
|||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
return result
|
||||
|
||||
def getDataatPos(self, path, pos):
|
||||
result = None
|
||||
item = self.flatdoc[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argt) = item.split('=')
|
||||
argres = argt.split('|')
|
||||
else:
|
||||
name = item
|
||||
argres = []
|
||||
if (len(argres) > 0) :
|
||||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
if (name.endswith(path)):
|
||||
result = argres
|
||||
return result
|
||||
|
||||
def getDataTemp(self, path):
|
||||
result = None
|
||||
cnt = len(self.temp)
|
||||
|
@ -58,6 +152,7 @@ class PParser(object):
|
|||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
return result
|
||||
|
||||
def getImages(self):
|
||||
result = []
|
||||
self.temp = self.flatdoc
|
||||
|
@ -69,6 +164,7 @@ class PParser(object):
|
|||
src = self.getDataTemp('img.src')[0]
|
||||
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
|
||||
return result
|
||||
|
||||
def getGlyphs(self):
|
||||
result = []
|
||||
if (self.gid != None) and (len(self.gid) > 0):
|
||||
|
@ -84,25 +180,25 @@ class PParser(object):
|
|||
return result
|
||||
|
||||
|
||||
def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
|
||||
def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
|
||||
ml = ''
|
||||
pp = PParser(gdict, flat_xml)
|
||||
pp = PParser(gdict, flat_xml, meta_array)
|
||||
ml += '<?xml version="1.0" standalone="no"?>\n'
|
||||
if (raw):
|
||||
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
|
||||
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
|
||||
else:
|
||||
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<script><![CDATA[\n'
|
||||
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
|
||||
ml += 'var dpi=%d;\n' % scaledpi
|
||||
if (counter) :
|
||||
ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
|
||||
if (counter < numfiles-1) :
|
||||
ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
|
||||
if (previd) :
|
||||
ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
|
||||
if (nextid) :
|
||||
ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
|
||||
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
|
||||
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
|
||||
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
|
||||
|
@ -115,10 +211,11 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
|
|||
ml += '</head>\n'
|
||||
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
|
||||
ml += '<div style="white-space:nowrap;">\n'
|
||||
if (counter == 0) :
|
||||
if previd == None:
|
||||
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
|
||||
else:
|
||||
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
|
||||
|
||||
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
|
||||
if (pp.gid != None):
|
||||
ml += '<defs>\n'
|
||||
|
@ -134,12 +231,14 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
|
|||
for j in xrange(0,len(pp.gid)):
|
||||
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
|
||||
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
|
||||
ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
|
||||
xpos = "%d" % (pp.pw // 3)
|
||||
ypos = "%d" % (pp.ph // 3)
|
||||
ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
|
||||
if (raw) :
|
||||
ml += '</svg>'
|
||||
else :
|
||||
ml += '</svg></a>\n'
|
||||
if (counter == numfiles - 1) :
|
||||
if nextid == None:
|
||||
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
|
||||
else :
|
||||
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
|
||||
|
|
|
@ -19,6 +19,8 @@ import getopt
|
|||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
class TpzDRMError(Exception):
|
||||
pass
|
||||
|
||||
# local support routines
|
||||
if 'calibre' in sys.modules:
|
||||
|
@ -114,7 +116,8 @@ class Dictionary(object):
|
|||
return self.stable[self.pos]
|
||||
else:
|
||||
print "Error - %d outside of string table limits" % val
|
||||
sys.exit(-1)
|
||||
raise TpzDRMError('outside or string table limits')
|
||||
# sys.exit(-1)
|
||||
def getSize(self):
|
||||
return self.size
|
||||
def getPos(self):
|
||||
|
@ -371,10 +374,34 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
(ph, pw) = getPageDim(flat_xml)
|
||||
if (ph == '-1') or (ph == '0') : ph = '11000'
|
||||
if (pw == '-1') or (pw == '0') : pw = '8500'
|
||||
meta_array['pageHeight'] = ph
|
||||
meta_array['pageWidth'] = pw
|
||||
if 'fontSize' not in meta_array.keys():
|
||||
meta_array['fontSize'] = fontsize
|
||||
|
||||
# print ' ', 'other0000.dat'
|
||||
# process other.dat for css info and for map of page files to svg images
|
||||
# this map is needed because some pages actually are made up of multiple
|
||||
# pageXXXX.xml files
|
||||
xname = os.path.join(bookDir, 'style.css')
|
||||
flat_xml = convert2xml.fromData(dict, otherFile)
|
||||
|
||||
# extract info.original.pid to get original page information
|
||||
pageIDMap = {}
|
||||
pageidnums = stylexml2css.getpageIDMap(flat_xml)
|
||||
if len(pageidnums) == 0:
|
||||
filenames = os.listdir(pageDir)
|
||||
numfiles = len(filenames)
|
||||
for k in range(numfiles):
|
||||
pageidnums.append(k)
|
||||
# create a map from page ids to list of page file nums to process for that page
|
||||
for i in range(len(pageidnums)):
|
||||
id = pageidnums[i]
|
||||
if id in pageIDMap.keys():
|
||||
pageIDMap[id].append(i)
|
||||
else:
|
||||
pageIDMap[id] = [i]
|
||||
|
||||
# now get the css info
|
||||
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
|
||||
file(xname, 'wb').write(cssstr)
|
||||
xname = os.path.join(xmlDir, 'other0000.xml')
|
||||
|
@ -414,6 +441,9 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
glyfile.close()
|
||||
print " "
|
||||
|
||||
# build up tocentries while processing html
|
||||
tocentries = ''
|
||||
|
||||
# start up the html
|
||||
htmlFileName = "book.html"
|
||||
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
|
@ -436,6 +466,77 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
# readability when rendering to the screen.
|
||||
scaledpi = 1440.0
|
||||
|
||||
filenames = os.listdir(pageDir)
|
||||
filenames = sorted(filenames)
|
||||
numfiles = len(filenames)
|
||||
|
||||
xmllst = []
|
||||
|
||||
for filename in filenames:
|
||||
# print ' ', filename
|
||||
print ".",
|
||||
fname = os.path.join(pageDir,filename)
|
||||
flat_xml = convert2xml.fromData(dict, fname)
|
||||
|
||||
# keep flat_xml for later svg processing
|
||||
xmllst.append(flat_xml)
|
||||
|
||||
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
|
||||
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
|
||||
|
||||
# first get the html
|
||||
pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
|
||||
tocentries += tocinfo
|
||||
htmlstr += pagehtml
|
||||
|
||||
# finish up the html string and output it
|
||||
htmlstr += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
||||
|
||||
print " "
|
||||
print 'Extracting Table of Contents from Amazon OCR'
|
||||
|
||||
# first create a table of contents file for the svg images
|
||||
tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
|
||||
tochtml += '<head>\n'
|
||||
tochtml += '<title>' + meta_array['Title'] + '</title>\n'
|
||||
tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
|
||||
tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
|
||||
if 'ASIN' in meta_array:
|
||||
tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
|
||||
if 'GUID' in meta_array:
|
||||
tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
|
||||
tochtml += '</head>\n'
|
||||
tochtml += '<body>\n'
|
||||
|
||||
tochtml += '<h2>Table of Contents</h2>\n'
|
||||
start = pageidnums[0]
|
||||
if (raw):
|
||||
startname = 'page%04d.svg' % start
|
||||
else:
|
||||
startname = 'page%04d.xhtml' % start
|
||||
|
||||
tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
|
||||
# build up a table of contents for the svg xhtml output
|
||||
toclst = tocentries.split('\n')
|
||||
toclst.pop()
|
||||
for entry in toclst:
|
||||
print entry
|
||||
title, pagenum = entry.split('|')
|
||||
id = pageidnums[int(pagenum)]
|
||||
if (raw):
|
||||
fname = 'page%04d.svg' % id
|
||||
else:
|
||||
fname = 'page%04d.xhtml' % id
|
||||
tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
|
||||
tochtml += '</body>\n'
|
||||
tochtml += '</html>\n'
|
||||
file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
|
||||
|
||||
|
||||
# now create index_svg.xhtml that points to all required files
|
||||
svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
|
||||
|
@ -450,50 +551,42 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
svgindex += '</head>\n'
|
||||
svgindex += '<body>\n'
|
||||
|
||||
filenames = os.listdir(pageDir)
|
||||
filenames = sorted(filenames)
|
||||
numfiles = len(filenames)
|
||||
counter = 0
|
||||
|
||||
for filename in filenames:
|
||||
# print ' ', filename
|
||||
print ".",
|
||||
|
||||
fname = os.path.join(pageDir,filename)
|
||||
flat_xml = convert2xml.fromData(dict, fname)
|
||||
|
||||
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
|
||||
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
|
||||
|
||||
# first get the html
|
||||
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
|
||||
|
||||
# now get the svg image of the page
|
||||
svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
|
||||
|
||||
print "Building svg images of each book page"
|
||||
svgindex += '<h2>List of Pages</h2>\n'
|
||||
svgindex += '<div>\n'
|
||||
idlst = sorted(pageIDMap.keys())
|
||||
numids = len(idlst)
|
||||
cnt = len(idlst)
|
||||
previd = None
|
||||
for j in range(cnt):
|
||||
pageid = idlst[j]
|
||||
if j < cnt - 1:
|
||||
nextid = idlst[j+1]
|
||||
else:
|
||||
nextid = None
|
||||
print '.',
|
||||
pagelst = pageIDMap[pageid]
|
||||
flat_svg = ''
|
||||
for page in pagelst:
|
||||
flat_svg += xmllst[page]
|
||||
svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
|
||||
if (raw) :
|
||||
pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
|
||||
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
|
||||
pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
|
||||
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
|
||||
else :
|
||||
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
|
||||
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
|
||||
|
||||
|
||||
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
|
||||
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
|
||||
previd = pageid
|
||||
pfile.write(svgxml)
|
||||
pfile.close()
|
||||
|
||||
counter += 1
|
||||
|
||||
print " "
|
||||
|
||||
# finish up the html string and output it
|
||||
htmlstr += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
||||
|
||||
# finish up the svg index string and output it
|
||||
svgindex += '</div>\n'
|
||||
svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
|
||||
svgindex += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
|
||||
|
||||
print " "
|
||||
|
||||
# build the opf file
|
||||
opfname = os.path.join(bookDir, 'book.opf')
|
||||
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
|
@ -573,7 +666,7 @@ def main(argv):
|
|||
return 1
|
||||
|
||||
raw = 0
|
||||
fixedimage = False
|
||||
fixedimage = True
|
||||
for o, a in opts:
|
||||
if o =="-h":
|
||||
usage()
|
||||
|
|
|
@ -17,7 +17,7 @@ from __future__ import with_statement
|
|||
# and many many others
|
||||
|
||||
|
||||
__version__ = '3.7'
|
||||
__version__ = '3.9'
|
||||
|
||||
class Unbuffered:
|
||||
def __init__(self, stream):
|
||||
|
@ -32,6 +32,7 @@ import sys
|
|||
import os, csv, getopt
|
||||
import string
|
||||
import re
|
||||
import traceback
|
||||
|
||||
class DrmException(Exception):
|
||||
pass
|
||||
|
@ -95,8 +96,14 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
|
|||
print "Processing Book: ", title
|
||||
filenametitle = cleanup_name(title)
|
||||
outfilename = bookname
|
||||
if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
|
||||
if len(outfilename)<=8 or len(filenametitle)<=8:
|
||||
outfilename = outfilename + "_" + filenametitle
|
||||
elif outfilename[:8] != filenametitle[:8]:
|
||||
outfilename = outfilename[:8] + "_" + filenametitle
|
||||
|
||||
# avoid excessively long file names
|
||||
if len(outfilename)>150:
|
||||
outfilename = outfilename[:150]
|
||||
|
||||
# build pid list
|
||||
md1, md2 = mb.getPIDMetaInfo()
|
||||
|
@ -128,8 +135,8 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
|
|||
zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
|
||||
mb.getHTMLZip(zipname)
|
||||
|
||||
print " Creating SVG HTMLZ Archive"
|
||||
zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
|
||||
print " Creating SVG ZIP Archive"
|
||||
zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
|
||||
mb.getSVGZip(zipname)
|
||||
|
||||
print " Creating XML ZIP Archive"
|
||||
|
|
|
@ -81,6 +81,14 @@ class DocParser(object):
|
|||
pos = foundpos + 1
|
||||
return startpos
|
||||
|
||||
# returns a vector of integers for the tagpath
|
||||
def getData(self, tagpath, pos, end):
|
||||
argres=[]
|
||||
(foundat, argt) = self.findinDoc(tagpath, pos, end)
|
||||
if (argt != None) and (len(argt) > 0) :
|
||||
argList = argt.split('|')
|
||||
argres = [ int(strval) for strval in argList]
|
||||
return argres
|
||||
|
||||
def process(self):
|
||||
|
||||
|
@ -237,7 +245,11 @@ def convert2CSS(flatxml, fontsize, ph, pw):
|
|||
|
||||
# create a document parser
|
||||
dp = DocParser(flatxml, fontsize, ph, pw)
|
||||
|
||||
csspage = dp.process()
|
||||
|
||||
return csspage
|
||||
|
||||
|
||||
def getpageIDMap(flatxml):
|
||||
dp = DocParser(flatxml, 0, 0, 0)
|
||||
pageidnumbers = dp.getData('info.original.pid', 0, -1)
|
||||
return pageidnumbers
|
||||
|
|
|
@ -140,6 +140,7 @@ class TopazBook:
|
|||
def __init__(self, filename):
|
||||
self.fo = file(filename, 'rb')
|
||||
self.outdir = tempfile.mkdtemp()
|
||||
# self.outdir = 'rawdat'
|
||||
self.bookPayloadOffset = 0
|
||||
self.bookHeaderRecords = {}
|
||||
self.bookMetadata = {}
|
||||
|
@ -370,7 +371,8 @@ class TopazBook:
|
|||
|
||||
def cleanup(self):
|
||||
if os.path.isdir(self.outdir):
|
||||
shutil.rmtree(self.outdir, True)
|
||||
pass
|
||||
# shutil.rmtree(self.outdir, True)
|
||||
|
||||
def usage(progname):
|
||||
print "Removes DRM protection from Topaz ebooks and extract the contents"
|
||||
|
@ -438,7 +440,7 @@ def main(argv=sys.argv):
|
|||
tb.getHTMLZip(zipname)
|
||||
|
||||
print " Creating SVG ZIP Archive"
|
||||
zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
|
||||
zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
|
||||
tb.getSVGZip(zipname)
|
||||
|
||||
print " Creating XML ZIP Archive"
|
||||
|
@ -450,12 +452,12 @@ def main(argv=sys.argv):
|
|||
|
||||
except TpzDRMError, e:
|
||||
print str(e)
|
||||
tb.cleanup()
|
||||
# tb.cleanup()
|
||||
return 1
|
||||
|
||||
except Exception, e:
|
||||
print str(e)
|
||||
tb.cleanup
|
||||
# tb.cleanup
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
|
|
@ -20,6 +20,8 @@ import getopt
|
|||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
class TpzDRMError(Exception):
|
||||
pass
|
||||
|
||||
# Get a 7 bit encoded number from string. The most
|
||||
# significant byte comes first and has the high bit (8th) set
|
||||
|
@ -138,7 +140,8 @@ class Dictionary(object):
|
|||
return self.stable[self.pos]
|
||||
else:
|
||||
print "Error - %d outside of string table limits" % val
|
||||
sys.exit(-1)
|
||||
raise TpzDRMError('outside of string table limits')
|
||||
# sys.exit(-1)
|
||||
|
||||
def getSize(self):
|
||||
return self.size
|
||||
|
@ -258,6 +261,11 @@ class PageParser(object):
|
|||
'paragraph.class' : (1, 'scalar_text', 0, 0),
|
||||
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
|
||||
'word_semantic' : (1, 'snippets', 1, 1),
|
||||
'word_semantic.type' : (1, 'scalar_text', 0, 0),
|
||||
|
@ -272,11 +280,17 @@ class PageParser(object):
|
|||
|
||||
'_span' : (1, 'snippets', 1, 0),
|
||||
'_span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'-span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'span' : (1, 'snippets', 1, 0),
|
||||
'span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'extratokens' : (1, 'snippets', 1, 0),
|
||||
'extratokens.type' : (1, 'scalar_text', 0, 0),
|
||||
|
|
|
@ -271,6 +271,9 @@ class DocParser(object):
|
|||
|
||||
pclass = self.getClass(pclass)
|
||||
|
||||
# if paragraph uses extratokens (extra glyphs) then make it fixed
|
||||
(pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
|
||||
|
||||
# build up a description of the paragraph in result and return it
|
||||
# first check for the basic - all words paragraph
|
||||
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
|
||||
|
@ -280,6 +283,7 @@ class DocParser(object):
|
|||
last = int(slast)
|
||||
|
||||
makeImage = (regtype == 'vertical') or (regtype == 'table')
|
||||
makeImage = makeImage or (extraglyphs != None)
|
||||
if self.fixedimage:
|
||||
makeImage = makeImage or (regtype == 'fixed')
|
||||
|
||||
|
@ -353,6 +357,8 @@ class DocParser(object):
|
|||
|
||||
word_class = ''
|
||||
|
||||
word_semantic_type = ''
|
||||
|
||||
while (line < end) :
|
||||
|
||||
(name, argres) = self.lineinDoc(line)
|
||||
|
@ -512,6 +518,72 @@ class DocParser(object):
|
|||
return parares
|
||||
|
||||
|
||||
def buildTOCEntry(self, pdesc) :
|
||||
parares = ''
|
||||
sep =''
|
||||
tocentry = ''
|
||||
handle_links = len(self.link_id) > 0
|
||||
|
||||
lstart = 0
|
||||
|
||||
cnt = len(pdesc)
|
||||
for j in xrange( 0, cnt) :
|
||||
|
||||
(wtype, num) = pdesc[j]
|
||||
|
||||
if wtype == 'ocr' :
|
||||
word = self.ocrtext[num]
|
||||
sep = ' '
|
||||
|
||||
if handle_links:
|
||||
link = self.link_id[num]
|
||||
if (link > 0):
|
||||
linktype = self.link_type[link-1]
|
||||
title = self.link_title[link-1]
|
||||
title = title.rstrip('. ')
|
||||
alt_title = parares[lstart:]
|
||||
alt_title = alt_title.strip()
|
||||
# now strip off the actual printed page number
|
||||
alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
|
||||
alt_title = alt_title.rstrip('. ')
|
||||
# skip over any external links - can't have them in a books toc
|
||||
if linktype == 'external' :
|
||||
title = ''
|
||||
alt_title = ''
|
||||
linkpage = ''
|
||||
else :
|
||||
if len(self.link_page) >= link :
|
||||
ptarget = self.link_page[link-1] - 1
|
||||
linkpage = '%04d' % ptarget
|
||||
else :
|
||||
# just link to the current page
|
||||
linkpage = self.id[4:]
|
||||
if len(alt_title) >= len(title):
|
||||
title = alt_title
|
||||
if title != '' and linkpage != '':
|
||||
tocentry += title + '|' + linkpage + '\n'
|
||||
lstart = len(parares)
|
||||
if word == '_link_' : word = ''
|
||||
elif (link < 0) :
|
||||
if word == '_link_' : word = ''
|
||||
|
||||
if word == '_lb_':
|
||||
word = ''
|
||||
sep = ''
|
||||
|
||||
if num in self.dehyphen_rootid :
|
||||
word = word[0:-1]
|
||||
sep = ''
|
||||
|
||||
parares += word + sep
|
||||
|
||||
else :
|
||||
continue
|
||||
|
||||
return tocentry
|
||||
|
||||
|
||||
|
||||
|
||||
# walk the document tree collecting the information needed
|
||||
# to build an html page using the ocrText
|
||||
|
@ -519,6 +591,7 @@ class DocParser(object):
|
|||
def process(self):
|
||||
|
||||
htmlpage = ''
|
||||
tocinfo = ''
|
||||
|
||||
# get the ocr text
|
||||
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
|
||||
|
@ -644,9 +717,9 @@ class DocParser(object):
|
|||
ptype = 'end'
|
||||
first_para_continued = False
|
||||
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||
tocinfo += self.buildTOCEntry(pdesc)
|
||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||
|
||||
|
||||
elif (regtype == 'vertical') or (regtype == 'table') :
|
||||
ptype = 'full'
|
||||
if inGroup:
|
||||
|
@ -704,12 +777,11 @@ class DocParser(object):
|
|||
htmlpage = htmlpage[0:-4]
|
||||
last_para_continued = False
|
||||
|
||||
return htmlpage
|
||||
|
||||
return htmlpage, tocinfo
|
||||
|
||||
|
||||
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
|
||||
# create a document parser
|
||||
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
|
||||
htmlpage = dp.process()
|
||||
return htmlpage
|
||||
htmlpage, tocinfo = dp.process()
|
||||
return htmlpage, tocinfo
|
||||
|
|
|
@ -10,17 +10,94 @@ from struct import unpack
|
|||
|
||||
|
||||
class PParser(object):
|
||||
def __init__(self, gd, flatxml):
|
||||
def __init__(self, gd, flatxml, meta_array):
|
||||
self.gd = gd
|
||||
self.flatdoc = flatxml.split('\n')
|
||||
self.docSize = len(self.flatdoc)
|
||||
self.temp = []
|
||||
foo = self.getData('page.h') or self.getData('book.h')
|
||||
self.ph = foo[0]
|
||||
foo = self.getData('page.w') or self.getData('book.w')
|
||||
self.pw = foo[0]
|
||||
self.gx = self.getData('info.glyph.x')
|
||||
self.gy = self.getData('info.glyph.y')
|
||||
self.gid = self.getData('info.glyph.glyphID')
|
||||
|
||||
self.ph = -1
|
||||
self.pw = -1
|
||||
startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
|
||||
for p in startpos:
|
||||
(name, argres) = self.lineinDoc(p)
|
||||
self.ph = max(self.ph, int(argres))
|
||||
startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
|
||||
for p in startpos:
|
||||
(name, argres) = self.lineinDoc(p)
|
||||
self.pw = max(self.pw, int(argres))
|
||||
|
||||
if self.ph <= 0:
|
||||
self.ph = int(meta_array.get('pageHeight', '11000'))
|
||||
if self.pw <= 0:
|
||||
self.pw = int(meta_array.get('pageWidth', '8500'))
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.x')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.x', p)
|
||||
res.extend(argres)
|
||||
self.gx = res
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.y')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.y', p)
|
||||
res.extend(argres)
|
||||
self.gy = res
|
||||
|
||||
res = []
|
||||
startpos = self.posinDoc('info.glyph.glyphID')
|
||||
for p in startpos:
|
||||
argres = self.getDataatPos('info.glyph.glyphID', p)
|
||||
res.extend(argres)
|
||||
self.gid = res
|
||||
|
||||
|
||||
# return tag at line pos in document
|
||||
def lineinDoc(self, pos) :
|
||||
if (pos >= 0) and (pos < self.docSize) :
|
||||
item = self.flatdoc[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
return name, argres
|
||||
|
||||
# find tag in doc if within pos to end inclusive
|
||||
def findinDoc(self, tagpath, pos, end) :
|
||||
result = None
|
||||
if end == -1 :
|
||||
end = self.docSize
|
||||
else:
|
||||
end = min(self.docSize, end)
|
||||
foundat = -1
|
||||
for j in xrange(pos, end):
|
||||
item = self.flatdoc[j]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
if name.endswith(tagpath) :
|
||||
result = argres
|
||||
foundat = j
|
||||
break
|
||||
return foundat, result
|
||||
|
||||
# return list of start positions for the tagpath
|
||||
def posinDoc(self, tagpath):
|
||||
startpos = []
|
||||
pos = 0
|
||||
res = ""
|
||||
while res != None :
|
||||
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
|
||||
if res != None :
|
||||
startpos.append(foundpos)
|
||||
pos = foundpos + 1
|
||||
return startpos
|
||||
|
||||
def getData(self, path):
|
||||
result = None
|
||||
cnt = len(self.flatdoc)
|
||||
|
@ -39,6 +116,23 @@ class PParser(object):
|
|||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
return result
|
||||
|
||||
def getDataatPos(self, path, pos):
|
||||
result = None
|
||||
item = self.flatdoc[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argt) = item.split('=')
|
||||
argres = argt.split('|')
|
||||
else:
|
||||
name = item
|
||||
argres = []
|
||||
if (len(argres) > 0) :
|
||||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
if (name.endswith(path)):
|
||||
result = argres
|
||||
return result
|
||||
|
||||
def getDataTemp(self, path):
|
||||
result = None
|
||||
cnt = len(self.temp)
|
||||
|
@ -58,6 +152,7 @@ class PParser(object):
|
|||
for j in xrange(0,len(argres)):
|
||||
argres[j] = int(argres[j])
|
||||
return result
|
||||
|
||||
def getImages(self):
|
||||
result = []
|
||||
self.temp = self.flatdoc
|
||||
|
@ -69,6 +164,7 @@ class PParser(object):
|
|||
src = self.getDataTemp('img.src')[0]
|
||||
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
|
||||
return result
|
||||
|
||||
def getGlyphs(self):
|
||||
result = []
|
||||
if (self.gid != None) and (len(self.gid) > 0):
|
||||
|
@ -84,25 +180,25 @@ class PParser(object):
|
|||
return result
|
||||
|
||||
|
||||
def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
|
||||
def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
|
||||
ml = ''
|
||||
pp = PParser(gdict, flat_xml)
|
||||
pp = PParser(gdict, flat_xml, meta_array)
|
||||
ml += '<?xml version="1.0" standalone="no"?>\n'
|
||||
if (raw):
|
||||
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
|
||||
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
|
||||
else:
|
||||
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
|
||||
ml += '<script><![CDATA[\n'
|
||||
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
|
||||
ml += 'var dpi=%d;\n' % scaledpi
|
||||
if (counter) :
|
||||
ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
|
||||
if (counter < numfiles-1) :
|
||||
ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
|
||||
if (previd) :
|
||||
ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
|
||||
if (nextid) :
|
||||
ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
|
||||
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
|
||||
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
|
||||
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
|
||||
|
@ -115,10 +211,11 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
|
|||
ml += '</head>\n'
|
||||
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
|
||||
ml += '<div style="white-space:nowrap;">\n'
|
||||
if (counter == 0) :
|
||||
if previd == None:
|
||||
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
|
||||
else:
|
||||
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
|
||||
|
||||
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
|
||||
if (pp.gid != None):
|
||||
ml += '<defs>\n'
|
||||
|
@ -134,12 +231,14 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
|
|||
for j in xrange(0,len(pp.gid)):
|
||||
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
|
||||
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
|
||||
ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
|
||||
xpos = "%d" % (pp.pw // 3)
|
||||
ypos = "%d" % (pp.ph // 3)
|
||||
ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
|
||||
if (raw) :
|
||||
ml += '</svg>'
|
||||
else :
|
||||
ml += '</svg></a>\n'
|
||||
if (counter == numfiles - 1) :
|
||||
if nextid == None:
|
||||
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
|
||||
else :
|
||||
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
|
||||
|
|
|
@ -19,6 +19,8 @@ import getopt
|
|||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
class TpzDRMError(Exception):
|
||||
pass
|
||||
|
||||
# local support routines
|
||||
if 'calibre' in sys.modules:
|
||||
|
@ -114,7 +116,8 @@ class Dictionary(object):
|
|||
return self.stable[self.pos]
|
||||
else:
|
||||
print "Error - %d outside of string table limits" % val
|
||||
sys.exit(-1)
|
||||
raise TpzDRMError('outside or string table limits')
|
||||
# sys.exit(-1)
|
||||
def getSize(self):
|
||||
return self.size
|
||||
def getPos(self):
|
||||
|
@ -371,10 +374,34 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
(ph, pw) = getPageDim(flat_xml)
|
||||
if (ph == '-1') or (ph == '0') : ph = '11000'
|
||||
if (pw == '-1') or (pw == '0') : pw = '8500'
|
||||
meta_array['pageHeight'] = ph
|
||||
meta_array['pageWidth'] = pw
|
||||
if 'fontSize' not in meta_array.keys():
|
||||
meta_array['fontSize'] = fontsize
|
||||
|
||||
# print ' ', 'other0000.dat'
|
||||
# process other.dat for css info and for map of page files to svg images
|
||||
# this map is needed because some pages actually are made up of multiple
|
||||
# pageXXXX.xml files
|
||||
xname = os.path.join(bookDir, 'style.css')
|
||||
flat_xml = convert2xml.fromData(dict, otherFile)
|
||||
|
||||
# extract info.original.pid to get original page information
|
||||
pageIDMap = {}
|
||||
pageidnums = stylexml2css.getpageIDMap(flat_xml)
|
||||
if len(pageidnums) == 0:
|
||||
filenames = os.listdir(pageDir)
|
||||
numfiles = len(filenames)
|
||||
for k in range(numfiles):
|
||||
pageidnums.append(k)
|
||||
# create a map from page ids to list of page file nums to process for that page
|
||||
for i in range(len(pageidnums)):
|
||||
id = pageidnums[i]
|
||||
if id in pageIDMap.keys():
|
||||
pageIDMap[id].append(i)
|
||||
else:
|
||||
pageIDMap[id] = [i]
|
||||
|
||||
# now get the css info
|
||||
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
|
||||
file(xname, 'wb').write(cssstr)
|
||||
xname = os.path.join(xmlDir, 'other0000.xml')
|
||||
|
@ -414,6 +441,9 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
glyfile.close()
|
||||
print " "
|
||||
|
||||
# build up tocentries while processing html
|
||||
tocentries = ''
|
||||
|
||||
# start up the html
|
||||
htmlFileName = "book.html"
|
||||
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
|
@ -436,6 +466,77 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
# readability when rendering to the screen.
|
||||
scaledpi = 1440.0
|
||||
|
||||
filenames = os.listdir(pageDir)
|
||||
filenames = sorted(filenames)
|
||||
numfiles = len(filenames)
|
||||
|
||||
xmllst = []
|
||||
|
||||
for filename in filenames:
|
||||
# print ' ', filename
|
||||
print ".",
|
||||
fname = os.path.join(pageDir,filename)
|
||||
flat_xml = convert2xml.fromData(dict, fname)
|
||||
|
||||
# keep flat_xml for later svg processing
|
||||
xmllst.append(flat_xml)
|
||||
|
||||
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
|
||||
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
|
||||
|
||||
# first get the html
|
||||
pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
|
||||
tocentries += tocinfo
|
||||
htmlstr += pagehtml
|
||||
|
||||
# finish up the html string and output it
|
||||
htmlstr += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
||||
|
||||
print " "
|
||||
print 'Extracting Table of Contents from Amazon OCR'
|
||||
|
||||
# first create a table of contents file for the svg images
|
||||
tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
|
||||
tochtml += '<head>\n'
|
||||
tochtml += '<title>' + meta_array['Title'] + '</title>\n'
|
||||
tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
|
||||
tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
|
||||
if 'ASIN' in meta_array:
|
||||
tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
|
||||
if 'GUID' in meta_array:
|
||||
tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
|
||||
tochtml += '</head>\n'
|
||||
tochtml += '<body>\n'
|
||||
|
||||
tochtml += '<h2>Table of Contents</h2>\n'
|
||||
start = pageidnums[0]
|
||||
if (raw):
|
||||
startname = 'page%04d.svg' % start
|
||||
else:
|
||||
startname = 'page%04d.xhtml' % start
|
||||
|
||||
tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
|
||||
# build up a table of contents for the svg xhtml output
|
||||
toclst = tocentries.split('\n')
|
||||
toclst.pop()
|
||||
for entry in toclst:
|
||||
print entry
|
||||
title, pagenum = entry.split('|')
|
||||
id = pageidnums[int(pagenum)]
|
||||
if (raw):
|
||||
fname = 'page%04d.svg' % id
|
||||
else:
|
||||
fname = 'page%04d.xhtml' % id
|
||||
tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
|
||||
tochtml += '</body>\n'
|
||||
tochtml += '</html>\n'
|
||||
file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
|
||||
|
||||
|
||||
# now create index_svg.xhtml that points to all required files
|
||||
svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
|
||||
|
@ -450,50 +551,42 @@ def generateBook(bookDir, raw, fixedimage):
|
|||
svgindex += '</head>\n'
|
||||
svgindex += '<body>\n'
|
||||
|
||||
filenames = os.listdir(pageDir)
|
||||
filenames = sorted(filenames)
|
||||
numfiles = len(filenames)
|
||||
counter = 0
|
||||
|
||||
for filename in filenames:
|
||||
# print ' ', filename
|
||||
print ".",
|
||||
|
||||
fname = os.path.join(pageDir,filename)
|
||||
flat_xml = convert2xml.fromData(dict, fname)
|
||||
|
||||
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
|
||||
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
|
||||
|
||||
# first get the html
|
||||
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
|
||||
|
||||
# now get the svg image of the page
|
||||
svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
|
||||
|
||||
print "Building svg images of each book page"
|
||||
svgindex += '<h2>List of Pages</h2>\n'
|
||||
svgindex += '<div>\n'
|
||||
idlst = sorted(pageIDMap.keys())
|
||||
numids = len(idlst)
|
||||
cnt = len(idlst)
|
||||
previd = None
|
||||
for j in range(cnt):
|
||||
pageid = idlst[j]
|
||||
if j < cnt - 1:
|
||||
nextid = idlst[j+1]
|
||||
else:
|
||||
nextid = None
|
||||
print '.',
|
||||
pagelst = pageIDMap[pageid]
|
||||
flat_svg = ''
|
||||
for page in pagelst:
|
||||
flat_svg += xmllst[page]
|
||||
svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
|
||||
if (raw) :
|
||||
pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
|
||||
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
|
||||
pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
|
||||
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
|
||||
else :
|
||||
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
|
||||
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
|
||||
|
||||
|
||||
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
|
||||
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
|
||||
previd = pageid
|
||||
pfile.write(svgxml)
|
||||
pfile.close()
|
||||
|
||||
counter += 1
|
||||
|
||||
print " "
|
||||
|
||||
# finish up the html string and output it
|
||||
htmlstr += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
||||
|
||||
# finish up the svg index string and output it
|
||||
svgindex += '</div>\n'
|
||||
svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
|
||||
svgindex += '</body>\n</html>\n'
|
||||
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
|
||||
|
||||
print " "
|
||||
|
||||
# build the opf file
|
||||
opfname = os.path.join(bookDir, 'book.opf')
|
||||
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
|
@ -573,7 +666,7 @@ def main(argv):
|
|||
return 1
|
||||
|
||||
raw = 0
|
||||
fixedimage = False
|
||||
fixedimage = True
|
||||
for o, a in opts:
|
||||
if o =="-h":
|
||||
usage()
|
||||
|
|
|
@ -17,7 +17,7 @@ from __future__ import with_statement
|
|||
# and many many others
|
||||
|
||||
|
||||
__version__ = '3.7'
|
||||
__version__ = '3.9'
|
||||
|
||||
class Unbuffered:
|
||||
def __init__(self, stream):
|
||||
|
@ -32,6 +32,7 @@ import sys
|
|||
import os, csv, getopt
|
||||
import string
|
||||
import re
|
||||
import traceback
|
||||
|
||||
class DrmException(Exception):
|
||||
pass
|
||||
|
@ -95,8 +96,14 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
|
|||
print "Processing Book: ", title
|
||||
filenametitle = cleanup_name(title)
|
||||
outfilename = bookname
|
||||
if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
|
||||
if len(outfilename)<=8 or len(filenametitle)<=8:
|
||||
outfilename = outfilename + "_" + filenametitle
|
||||
elif outfilename[:8] != filenametitle[:8]:
|
||||
outfilename = outfilename[:8] + "_" + filenametitle
|
||||
|
||||
# avoid excessively long file names
|
||||
if len(outfilename)>150:
|
||||
outfilename = outfilename[:150]
|
||||
|
||||
# build pid list
|
||||
md1, md2 = mb.getPIDMetaInfo()
|
||||
|
@ -128,8 +135,8 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
|
|||
zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
|
||||
mb.getHTMLZip(zipname)
|
||||
|
||||
print " Creating SVG HTMLZ Archive"
|
||||
zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
|
||||
print " Creating SVG ZIP Archive"
|
||||
zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
|
||||
mb.getSVGZip(zipname)
|
||||
|
||||
print " Creating XML ZIP Archive"
|
||||
|
|
|
@ -81,6 +81,14 @@ class DocParser(object):
|
|||
pos = foundpos + 1
|
||||
return startpos
|
||||
|
||||
# Return the '|'-separated integer values stored at tagpath (searched
# between pos and end) as a list; empty list when nothing is found.
def getData(self, tagpath, pos, end):
    (foundat, argt) = self.findinDoc(tagpath, pos, end)
    if argt is None or len(argt) == 0:
        return []
    return [int(strval) for strval in argt.split('|')]
|
||||
|
||||
def process(self):
|
||||
|
||||
|
@ -237,7 +245,11 @@ def convert2CSS(flatxml, fontsize, ph, pw):
|
|||
|
||||
# create a document parser
|
||||
dp = DocParser(flatxml, fontsize, ph, pw)
|
||||
|
||||
csspage = dp.process()
|
||||
|
||||
return csspage
|
||||
|
||||
|
||||
def getpageIDMap(flatxml):
    """Return the list of original page ID numbers found in flatxml."""
    parser = DocParser(flatxml, 0, 0, 0)
    return parser.getData('info.original.pid', 0, -1)
|
||||
|
|
|
@ -140,6 +140,7 @@ class TopazBook:
|
|||
def __init__(self, filename):
|
||||
self.fo = file(filename, 'rb')
|
||||
self.outdir = tempfile.mkdtemp()
|
||||
# self.outdir = 'rawdat'
|
||||
self.bookPayloadOffset = 0
|
||||
self.bookHeaderRecords = {}
|
||||
self.bookMetadata = {}
|
||||
|
@ -370,7 +371,8 @@ class TopazBook:
|
|||
|
||||
def cleanup(self):
|
||||
if os.path.isdir(self.outdir):
|
||||
shutil.rmtree(self.outdir, True)
|
||||
pass
|
||||
# shutil.rmtree(self.outdir, True)
|
||||
|
||||
def usage(progname):
|
||||
print "Removes DRM protection from Topaz ebooks and extract the contents"
|
||||
|
@ -438,7 +440,7 @@ def main(argv=sys.argv):
|
|||
tb.getHTMLZip(zipname)
|
||||
|
||||
print " Creating SVG ZIP Archive"
|
||||
zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
|
||||
zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
|
||||
tb.getSVGZip(zipname)
|
||||
|
||||
print " Creating XML ZIP Archive"
|
||||
|
@ -450,12 +452,12 @@ def main(argv=sys.argv):
|
|||
|
||||
except TpzDRMError, e:
|
||||
print str(e)
|
||||
tb.cleanup()
|
||||
# tb.cleanup()
|
||||
return 1
|
||||
|
||||
except Exception, e:
|
||||
print str(e)
|
||||
tb.cleanup
|
||||
# tb.cleanup
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
|
|
@ -107,42 +107,4 @@ Once Windows users have installed Python 2.X for 32 bits, and the matching OpenS
|
|||
Linux Users
|
||||
-----------
|
||||
|
||||
Unfortunately, the Calibre Plugins do not really work well on Linux because of issues running Calibre under Wine. Native versions of Calibre cannot be used with the K4MobiDeDRM plugin because the plugin will not be able to find the information it needs to remove the DRM.
|
||||
|
||||
Although some of the scripts do work on native Linux, others require the use of a recent version of Wine.
|
||||
|
||||
Here are the instructions for using KindleBooks.pyw on Linux under Wine.
|
||||
|
||||
1. upgrade to very recent versions of Wine; This has been tested with Wine 1.3.18 – 1.3.22. It may work with earlier versions but no promises.
|
||||
|
||||
2. Some versions of winecfg have a bug in setting the volume serial number, so create a .windows-serial file at root of drive_c to set a proper windows volume serial number (8 digit hex value for unsigned integer).
|
||||
cd ~
|
||||
cd .wine
|
||||
cd drive_c
|
||||
echo deadbeef > .windows-serial
|
||||
|
||||
Replace deadbeef with whatever you want but I would stay away from the default setting of ffffffff
|
||||
|
||||
3. Only ***after*** setting the volume serial number properly – download and install under wine K4PC version for Windows. Register it and download from your Archive one of your Kindle ebooks. Versions known to work are K4PC 1.4.1 and earlier. Later version may work but no promises.
|
||||
|
||||
4. Download and install under wine ActiveState Active Python 2.7 for Windows 32bit
|
||||
|
||||
5. Download and unzip tools_v4.5.zip
|
||||
|
||||
6. Then run KindleBook.pyw ***under python running on wine*** using one of the following methods:
|
||||
|
||||
From a Linux shell:
|
||||
|
||||
wine python KindleBooks.pyw
|
||||
|
||||
Or to get a Windows (wine) command prompt
|
||||
|
||||
wine cmd
|
||||
python KindleBooks.pyw
|
||||
|
||||
Or to get a "Windows" file explorer:
|
||||
|
||||
winefile
|
||||
|
||||
and then double-click on any .pyw files to run them in the wine environment
|
||||
|
||||
Please see the ReadMe_Linux_Users.txt
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
ReadMe for Linux Users and the Tools
|
||||
|
||||
|
||||
Linux and Kindle for PC (KindleBooks.pyw)
|
||||
------------------------------------------
|
||||
|
||||
Here are the instructions for using Kindle for PC and KindleBooks.pyw on Linux under Wine. (Thank you Eyeless and Pete)
|
||||
|
||||
1. upgrade to very recent versions of Wine; This has been tested with Wine 1.3.15 – 1.3.2X. It may work with earlier versions but no promises. It does not work with wine 1.2.X versions.
|
||||
|
||||
If you have not already installed Kindle for PC under wine, follow steps 2 and 3 otherwise jump to step 4
|
||||
|
||||
2. Some versions of winecfg have a bug in setting the volume serial number, so create a .windows-serial file at root of drive_c to set a proper windows volume serial number (8 digit hex value for unsigned integer).
|
||||
cd ~
|
||||
cd .wine
|
||||
cd drive_c
|
||||
echo deadbeef > .windows-serial
|
||||
|
||||
Replace "deadbeef" with whatever hex value you want but I would stay away from the default setting of "ffffffff" which does not seem to work. BTW: deadbeef is itself a valid possible hex value if you want to use it
|
||||
|
||||
3. Only ***after*** setting the volume serial number properly – download and install under wine K4PC version for Windows. Register it and download from your Archive one of your Kindle ebooks. Versions known to work are K4PC 1.7.1 and earlier. Later version may work but no promises.
|
||||
|
||||
4. Download and install under wine ActiveState Active Python 2.7 for Windows 32bit
|
||||
|
||||
5. Download and unzip tools_v4.X.zip
|
||||
|
||||
6. Now make sure the executable bit is NOT set for KindleBooks.pyw as Linux will actually keep trying to ignore wine and launch it under Linux python which will cause it to fail.
|
||||
|
||||
cd tools_v4.7/KindleBooks/
|
||||
chmod ugo-x KindleBooks.pyw
|
||||
|
||||
7. Then run KindleBook.pyw ***under python running on wine*** using the Linux shell as follows:
|
||||
|
||||
wine python KindleBooks.pyw
|
||||
|
||||
Select the ebook file directly from your “My Kindle Content” folder, select a new/unused directory for the output. You should not need to enter any PID or Serial Number for Kindle for PC.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Linux and Adobe Digital Editions ePubs
|
||||
--------------------------------------
|
||||
|
||||
Here are the instructions for using the tools with ePub books and Adobe Digital Editions on Linux under Wine. (Thank you mclien!)
|
||||
|
||||
|
||||
1. download the most recent version of wine from winehq.org (1.3.29 in my case)
|
||||
|
||||
For debian users:
|
||||
|
||||
to get a recent version of wine I decided to use aptosid (2011-02, xfce)
|
||||
(because I’m used to debian)
|
||||
install aptosid and upgrade it (see aptosid site for detailed instructions)
|
||||
|
||||
|
||||
2. properly install Wine (see the Wine site for details)
|
||||
|
||||
For debian users:
|
||||
|
||||
cd to this dir and install the packages as root:
|
||||
‘dpkg -i *.deb’
|
||||
you will get some error messages, which can be ignored.
|
||||
again as root use
|
||||
‘apt-get -f install’ to correct these errors
|
||||
|
||||
3. python 2.7 should already be installed on your system but you may need the following additional python package
|
||||
|
||||
'apt-get install python-tk’
|
||||
|
||||
4. all programs need to be installed as a normal user. All these programs are installed the same way:
|
||||
‘wine ‘
|
||||
we need:
|
||||
a) Adobe Digital Edition 1.7.2(from: http://kb2.adobe.com/cps/403/kb403051.html)
|
||||
(there is a “can’t install ADE” site, where the setup.exe hides)
|
||||
b) ActivePython-2.7.2.5-win32-x86.msi (from: http://www.activestate.com/activepython/downloads)
|
||||
c) Win32OpenSSL_Light-0_9_8r.exe (from: http://www.slproweb.com/)
|
||||
d) pycrypto-2.3.win32-py2.7.msi (from: http://www.voidspace.org.uk/python/modules.shtml)
|
||||
|
||||
5. now get and unpack the very latest tools_v4.X (from Apprentice Alf) in the users drive_c of wine
|
||||
(~/.wine/drive_c/)
|
||||
|
||||
6. start ADE with:
|
||||
‘wine digitaleditions.exe’ or from the start menue wine-adobe-digital..
|
||||
|
||||
7. register this instance of ADE with your adobeID and close it
|
||||
change to the tools_v4.X dir:
|
||||
cd ~/.wine/drive_c/tools_v4.X/Adobe_ePub_Tools
|
||||
|
||||
8. create the adeptkey.der with:
|
||||
‘wine python ineptkey_v5.4.pyw’ (only need once!)
|
||||
(key will be here: ~/.wine/drive_c/tools_v4.X/Adobe_ePub_Tools/adeptkey.der)
|
||||
|
||||
9. Use ADE running under Wine to download all of your purchased ePub ebooks
|
||||
|
||||
10. for each book you have downloaded via Adobe Digital Editions
|
||||
There is no need to use Wine for this step!
|
||||
|
||||
'python ineptpub_v5.6.pyw’
|
||||
this will launch a window with 3 lines
|
||||
1. key: (already filled in, otherwise it’s in the path where you did step 8)
|
||||
2. input file: drmbook.epub
|
||||
3. output file: name-you-want_for_free_book.epub
|
||||
|
||||
Also… once you successfully generate your adept.der keyfile using WINE, you can use the regular ineptepub plugin with the standard Linux calibre. Just put the *.der file(s) in your calibre configuration directory.
|
||||
so if you want you can use calibre in Linux:
|
||||
|
||||
11. install the plugins from the tools as described in the readmes for Windows
|
||||
|
||||
12. copy the adeptkey.der into the config dir of calibre (~/.config/calibre in debian). Every book imported to calibre will automatically be freed from DRM.
|
||||
|
||||
|
|
@ -111,7 +111,7 @@ class eRdrDeDRM(FileTypePlugin):
|
|||
|
||||
print " Decoding File"
|
||||
sect = erdr2pml.Sectionizer(infile, 'PNRdPPrs')
|
||||
er = erdr2pml.EreaderProcessor(sect.loadSection, name, cc)
|
||||
er = erdr2pml.EreaderProcessor(sect, name, cc)
|
||||
|
||||
if er.getNumImages() > 0:
|
||||
print " Extracting images"
|
||||
|
|
|
@ -59,8 +59,11 @@
|
|||
# 0.18 - on Windows try PyCrypto first and OpenSSL next
|
||||
# 0.19 - Modify the interface to allow use of import
|
||||
# 0.20 - modify to allow use inside new interface for calibre plugins
|
||||
# 0.21 - Support eReader (drm) version 11.
|
||||
# - Don't reject dictionary format.
|
||||
# - Ignore sidebars for dictionaries (different format?)
|
||||
|
||||
__version__='0.20'
|
||||
__version__='0.21'
|
||||
|
||||
class Unbuffered:
|
||||
def __init__(self, stream):
|
||||
|
@ -140,12 +143,18 @@ logging.basicConfig()
|
|||
|
||||
|
||||
class Sectionizer(object):
|
||||
bkType = "Book"
|
||||
|
||||
def __init__(self, filename, ident):
|
||||
self.contents = file(filename, 'rb').read()
|
||||
self.header = self.contents[0:72]
|
||||
self.num_sections, = struct.unpack('>H', self.contents[76:78])
|
||||
# Dictionary or normal content (TODO: Not hard-coded)
|
||||
if self.header[0x3C:0x3C+8] != ident:
|
||||
raise ValueError('Invalid file format')
|
||||
if self.header[0x3C:0x3C+8] == "PDctPPrs":
|
||||
self.bkType = "Dict"
|
||||
else:
|
||||
raise ValueError('Invalid file format')
|
||||
self.sections = []
|
||||
for i in xrange(self.num_sections):
|
||||
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
|
||||
|
@ -182,15 +191,15 @@ def deXOR(text, sp, table):
|
|||
return r
|
||||
|
||||
class EreaderProcessor(object):
|
||||
def __init__(self, section_reader, username, creditcard):
|
||||
self.section_reader = section_reader
|
||||
data = section_reader(0)
|
||||
def __init__(self, sect, username, creditcard):
|
||||
self.section_reader = sect.loadSection
|
||||
data = self.section_reader(0)
|
||||
version, = struct.unpack('>H', data[0:2])
|
||||
self.version = version
|
||||
logging.info('eReader file format version %s', version)
|
||||
if version != 272 and version != 260 and version != 259:
|
||||
raise ValueError('incorrect eReader version %d (error 1)' % version)
|
||||
data = section_reader(1)
|
||||
data = self.section_reader(1)
|
||||
self.data = data
|
||||
des = Des(fixKey(data[0:8]))
|
||||
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
|
||||
|
@ -219,11 +228,17 @@ class EreaderProcessor(object):
|
|||
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
|
||||
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
|
||||
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
|
||||
# Default values
|
||||
self.num_footnote_pages = 0
|
||||
self.num_sidebar_pages = 0
|
||||
self.first_footnote_page = -1
|
||||
self.first_sidebar_page = -1
|
||||
if self.version == 272:
|
||||
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
|
||||
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
|
||||
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
|
||||
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
|
||||
if (sect.bkType == "Book"):
|
||||
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
|
||||
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
|
||||
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
|
||||
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
|
||||
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
|
||||
|
@ -239,10 +254,8 @@ class EreaderProcessor(object):
|
|||
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
|
||||
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
|
||||
else:
|
||||
self.num_footnote_pages = 0
|
||||
self.num_sidebar_pages = 0
|
||||
self.first_footnote_page = -1
|
||||
self.first_sidebar_page = -1
|
||||
# Nothing needs to be done
|
||||
pass
|
||||
# self.num_bookinfo_pages = 0
|
||||
# self.num_chapter_pages = 0
|
||||
# self.num_link_pages = 0
|
||||
|
@ -267,10 +280,14 @@ class EreaderProcessor(object):
|
|||
encrypted_key_sha = r[44:44+20]
|
||||
encrypted_key = r[64:64+8]
|
||||
elif version == 260:
|
||||
if drm_sub_version != 13:
|
||||
if drm_sub_version != 13 and drm_sub_version != 11:
|
||||
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
|
||||
encrypted_key = r[44:44+8]
|
||||
encrypted_key_sha = r[52:52+20]
|
||||
if drm_sub_version == 13:
|
||||
encrypted_key = r[44:44+8]
|
||||
encrypted_key_sha = r[52:52+20]
|
||||
else:
|
||||
encrypted_key = r[64:64+8]
|
||||
encrypted_key_sha = r[44:44+20]
|
||||
elif version == 272:
|
||||
encrypted_key = r[172:172+8]
|
||||
encrypted_key_sha = r[56:56+20]
|
||||
|
@ -356,6 +373,12 @@ class EreaderProcessor(object):
|
|||
r += fmarker
|
||||
fnote_ids = fnote_ids[id_len+4:]
|
||||
|
||||
# TODO: Handle dictionary index (?) pages - which are also marked as
|
||||
# sidebar_pages (?). For now dictionary sidebars are ignored
|
||||
# For dictionaries - record 0 is null terminated strings, followed by
|
||||
# blocks of around 62000 bytes and a final block. Not sure of the
|
||||
# encoding
|
||||
|
||||
# now handle sidebar pages
|
||||
if self.num_sidebar_pages > 0:
|
||||
r += '\n'
|
||||
|
@ -368,7 +391,7 @@ class EreaderProcessor(object):
|
|||
id_len = ord(sbar_ids[2])
|
||||
id = sbar_ids[3:3+id_len]
|
||||
smarker = '<sidebar id="%s">\n' % id
|
||||
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
|
||||
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
|
||||
smarker += '\n</sidebar>\n'
|
||||
r += smarker
|
||||
sbar_ids = sbar_ids[id_len+4:]
|
||||
|
@ -389,7 +412,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
|
|||
bookname = os.path.splitext(os.path.basename(infile))[0]
|
||||
print " Decoding File"
|
||||
sect = Sectionizer(infile, 'PNRdPPrs')
|
||||
er = EreaderProcessor(sect.loadSection, name, cc)
|
||||
er = EreaderProcessor(sect, name, cc)
|
||||
|
||||
if er.getNumImages() > 0:
|
||||
print " Extracting images"
|
||||
|
|
Loading…
Reference in New Issue