topazscripts 1.6
This commit is contained in:
parent
a1fec0b54d
commit
58e9c973ab
|
@ -1,3 +1,8 @@
|
|||
Changes in version 1.6
|
||||
- support for books whose paragraphs have no styles
|
||||
- support to run cmbtc_dump on Linux and Mac OSX provided you know your PID of your ipod or standalone Kindle
|
||||
(contributed by DiapDealer)
|
||||
|
||||
Changes in version 1.5
|
||||
- completely reworked generation of styles to use actual page heights and widths
|
||||
- added new script getpagedim.py to support the above
|
||||
|
|
|
@ -0,0 +1,504 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
from __future__ import with_statement
|
||||
|
||||
import csv
|
||||
import sys
|
||||
import os
|
||||
import getopt
|
||||
import zlib
|
||||
from struct import pack
|
||||
from struct import unpack
|
||||
|
||||
MAX_PATH = 255
|
||||
|
||||
# Put the first 8 characters of your Kindle PID here
|
||||
# or supply it with the -p option in the command line
|
||||
####################################################
|
||||
kindlePID = "12345678"
|
||||
####################################################
|
||||
|
||||
global bookFile
|
||||
global bookPayloadOffset
|
||||
global bookHeaderRecords
|
||||
global bookMetadata
|
||||
global bookKey
|
||||
global command
|
||||
|
||||
#
|
||||
# Exceptions for all the problems that might happen during the script
|
||||
#
|
||||
|
||||
class CMBDTCError(Exception):
|
||||
pass
|
||||
|
||||
class CMBDTCFatal(Exception):
|
||||
pass
|
||||
|
||||
|
||||
#
|
||||
# Open the book file at path
|
||||
#
|
||||
|
||||
def openBook(path):
|
||||
try:
|
||||
return open(path,'rb')
|
||||
except:
|
||||
raise CMBDTCFatal("Could not open book file: " + path)
|
||||
|
||||
#
|
||||
# Get a 7 bit encoded number from the book file
|
||||
#
|
||||
|
||||
def bookReadEncodedNumber():
|
||||
flag = False
|
||||
data = ord(bookFile.read(1))
|
||||
|
||||
if data == 0xFF:
|
||||
flag = True
|
||||
data = ord(bookFile.read(1))
|
||||
|
||||
if data >= 0x80:
|
||||
datax = (data & 0x7F)
|
||||
while data >= 0x80 :
|
||||
data = ord(bookFile.read(1))
|
||||
datax = (datax <<7) + (data & 0x7F)
|
||||
data = datax
|
||||
|
||||
if flag:
|
||||
data = -data
|
||||
return data
|
||||
|
||||
#
|
||||
# Encode a number in 7 bit format
|
||||
#
|
||||
|
||||
def encodeNumber(number):
|
||||
result = ""
|
||||
negative = False
|
||||
flag = 0
|
||||
print("Using encodeNumber routine")
|
||||
|
||||
if number < 0 :
|
||||
number = -number + 1
|
||||
negative = True
|
||||
|
||||
while True:
|
||||
byte = number & 0x7F
|
||||
number = number >> 7
|
||||
byte += flag
|
||||
result += chr(byte)
|
||||
flag = 0x80
|
||||
if number == 0 : break
|
||||
|
||||
if negative:
|
||||
result += chr(0xFF)
|
||||
|
||||
return result[::-1]
|
||||
|
||||
#
|
||||
# Get a length prefixed string from the file
|
||||
#
|
||||
|
||||
def bookReadString():
|
||||
stringLength = bookReadEncodedNumber()
|
||||
return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0]
|
||||
|
||||
#
|
||||
# Returns a length prefixed string
|
||||
#
|
||||
|
||||
def lengthPrefixString(data):
|
||||
return encodeNumber(len(data))+data
|
||||
|
||||
|
||||
#
|
||||
# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
|
||||
#
|
||||
|
||||
def bookReadHeaderRecordData():
|
||||
nbValues = bookReadEncodedNumber()
|
||||
values = []
|
||||
for i in range (0,nbValues):
|
||||
values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()])
|
||||
return values
|
||||
|
||||
#
|
||||
# Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
|
||||
#
|
||||
|
||||
def parseTopazHeaderRecord():
|
||||
if ord(bookFile.read(1)) != 0x63:
|
||||
raise CMBDTCFatal("Parse Error : Invalid Header")
|
||||
|
||||
tag = bookReadString()
|
||||
record = bookReadHeaderRecordData()
|
||||
return [tag,record]
|
||||
|
||||
#
|
||||
# Parse the header of a Topaz file, get all the header records and the offset for the payload
|
||||
#
|
||||
|
||||
def parseTopazHeader():
|
||||
global bookHeaderRecords
|
||||
global bookPayloadOffset
|
||||
magic = unpack("4s",bookFile.read(4))[0]
|
||||
|
||||
if magic != 'TPZ0':
|
||||
raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")
|
||||
|
||||
nbRecords = bookReadEncodedNumber()
|
||||
bookHeaderRecords = {}
|
||||
|
||||
for i in range (0,nbRecords):
|
||||
result = parseTopazHeaderRecord()
|
||||
print result[0], result[1]
|
||||
bookHeaderRecords[result[0]] = result[1]
|
||||
|
||||
if ord(bookFile.read(1)) != 0x64 :
|
||||
raise CMBDTCFatal("Parse Error : Invalid Header")
|
||||
|
||||
bookPayloadOffset = bookFile.tell()
|
||||
|
||||
#
|
||||
# Get a record in the book payload, given its name and index. If necessary the record is decrypted. The record is not decompressed
|
||||
# Correction, the record is correctly decompressed too
|
||||
#
|
||||
|
||||
def getBookPayloadRecord(name, index):
|
||||
encrypted = False
|
||||
compressed = False
|
||||
|
||||
try:
|
||||
recordOffset = bookHeaderRecords[name][index][0]
|
||||
except:
|
||||
raise CMBDTCFatal("Parse Error : Invalid Record, record not found")
|
||||
|
||||
bookFile.seek(bookPayloadOffset + recordOffset)
|
||||
|
||||
tag = bookReadString()
|
||||
if tag != name :
|
||||
raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")
|
||||
|
||||
recordIndex = bookReadEncodedNumber()
|
||||
|
||||
if recordIndex < 0 :
|
||||
encrypted = True
|
||||
recordIndex = -recordIndex -1
|
||||
|
||||
if recordIndex != index :
|
||||
raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")
|
||||
|
||||
if (bookHeaderRecords[name][index][2] > 0):
|
||||
compressed = True
|
||||
record = bookFile.read(bookHeaderRecords[name][index][2])
|
||||
else:
|
||||
record = bookFile.read(bookHeaderRecords[name][index][1])
|
||||
|
||||
if encrypted:
|
||||
ctx = topazCryptoInit(bookKey)
|
||||
record = topazCryptoDecrypt(record,ctx)
|
||||
|
||||
if compressed:
|
||||
record = zlib.decompress(record)
|
||||
|
||||
return record
|
||||
|
||||
#
|
||||
# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
|
||||
#
|
||||
|
||||
def extractBookPayloadRecord(name, index, filename):
|
||||
compressed = False
|
||||
|
||||
try:
|
||||
compressed = bookHeaderRecords[name][index][2] != 0
|
||||
record = getBookPayloadRecord(name,index)
|
||||
except:
|
||||
print("Could not find record")
|
||||
|
||||
# if compressed:
|
||||
# try:
|
||||
# record = zlib.decompress(record)
|
||||
# except:
|
||||
# raise CMBDTCFatal("Could not decompress record")
|
||||
|
||||
if filename != "":
|
||||
try:
|
||||
file = open(filename,"wb")
|
||||
file.write(record)
|
||||
file.close()
|
||||
except:
|
||||
raise CMBDTCFatal("Could not write to destination file")
|
||||
else:
|
||||
print(record)
|
||||
|
||||
#
|
||||
# return next record [key,value] from the book metadata from the current book position
|
||||
#
|
||||
|
||||
def readMetadataRecord():
|
||||
return [bookReadString(),bookReadString()]
|
||||
|
||||
#
|
||||
# Parse the metadata record from the book payload and return a list of [key,values]
|
||||
#
|
||||
|
||||
def parseMetadata():
|
||||
global bookHeaderRecords
|
||||
global bookPayloadAddress
|
||||
global bookMetadata
|
||||
bookMetadata = {}
|
||||
bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
|
||||
tag = bookReadString()
|
||||
if tag != "metadata" :
|
||||
raise CMBDTCFatal("Parse Error : Record Names Don't Match")
|
||||
|
||||
flags = ord(bookFile.read(1))
|
||||
nbRecords = ord(bookFile.read(1))
|
||||
|
||||
for i in range (0,nbRecords) :
|
||||
record =readMetadataRecord()
|
||||
bookMetadata[record[0]] = record[1]
|
||||
|
||||
#
|
||||
# Context initialisation for the Topaz Crypto
|
||||
#
|
||||
|
||||
def topazCryptoInit(key):
|
||||
ctx1 = 0x0CAFFE19E
|
||||
|
||||
for keyChar in key:
|
||||
keyByte = ord(keyChar)
|
||||
ctx2 = ctx1
|
||||
ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF )
|
||||
return [ctx1,ctx2]
|
||||
|
||||
#
|
||||
# decrypt data with the context prepared by topazCryptoInit()
|
||||
#
|
||||
|
||||
def topazCryptoDecrypt(data, ctx):
|
||||
ctx1 = ctx[0]
|
||||
ctx2 = ctx[1]
|
||||
|
||||
plainText = ""
|
||||
|
||||
for dataChar in data:
|
||||
dataByte = ord(dataChar)
|
||||
m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF
|
||||
ctx2 = ctx1
|
||||
ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF)
|
||||
plainText += chr(m)
|
||||
|
||||
return plainText
|
||||
|
||||
#
|
||||
# Decrypt a payload record with the PID
|
||||
#
|
||||
|
||||
def decryptRecord(data,PID):
|
||||
ctx = topazCryptoInit(PID)
|
||||
return topazCryptoDecrypt(data, ctx)
|
||||
|
||||
#
|
||||
# Try to decrypt a dkey record (contains the book PID)
|
||||
#
|
||||
|
||||
def decryptDkeyRecord(data,PID):
|
||||
record = decryptRecord(data,PID)
|
||||
fields = unpack("3sB8sB8s3s",record)
|
||||
|
||||
if fields[0] != "PID" or fields[5] != "pid" :
|
||||
raise CMBDTCError("Didn't find PID magic numbers in record")
|
||||
elif fields[1] != 8 or fields[3] != 8 :
|
||||
raise CMBDTCError("Record didn't contain correct length fields")
|
||||
elif fields[2] != PID :
|
||||
raise CMBDTCError("Record didn't contain PID")
|
||||
|
||||
return fields[4]
|
||||
|
||||
#
|
||||
# Decrypt all the book's dkey records (contain the book PID)
|
||||
#
|
||||
|
||||
def decryptDkeyRecords(data,PID):
|
||||
nbKeyRecords = ord(data[0])
|
||||
records = []
|
||||
data = data[1:]
|
||||
for i in range (0,nbKeyRecords):
|
||||
length = ord(data[0])
|
||||
try:
|
||||
key = decryptDkeyRecord(data[1:length+1],PID)
|
||||
records.append(key)
|
||||
except CMBDTCError:
|
||||
pass
|
||||
data = data[1+length:]
|
||||
|
||||
return records
|
||||
|
||||
#
|
||||
# Create decrypted book payload
|
||||
#
|
||||
|
||||
def createDecryptedPayload(payload):
|
||||
for headerRecord in bookHeaderRecords:
|
||||
name = headerRecord
|
||||
if name != "dkey" :
|
||||
ext = '.dat'
|
||||
if name == 'img' : ext = '.jpg'
|
||||
for index in range (0,len(bookHeaderRecords[name])) :
|
||||
fnum = "%04d" % index
|
||||
fname = name + fnum + ext
|
||||
destdir = payload
|
||||
if name == 'img':
|
||||
destdir = os.path.join(payload,'img')
|
||||
if name == 'page':
|
||||
destdir = os.path.join(payload,'page')
|
||||
if name == 'glyphs':
|
||||
destdir = os.path.join(payload,'glyphs')
|
||||
outputFile = os.path.join(destdir,fname)
|
||||
file(outputFile, 'wb').write(getBookPayloadRecord(name, index))
|
||||
|
||||
|
||||
# Create decrypted book
|
||||
#
|
||||
|
||||
def createDecryptedBook(outdir):
|
||||
if not os.path.exists(outdir):
|
||||
os.makedirs(outdir)
|
||||
|
||||
destdir = os.path.join(outdir,'img')
|
||||
if not os.path.exists(destdir):
|
||||
os.makedirs(destdir)
|
||||
|
||||
destdir = os.path.join(outdir,'page')
|
||||
if not os.path.exists(destdir):
|
||||
os.makedirs(destdir)
|
||||
|
||||
destdir = os.path.join(outdir,'glyphs')
|
||||
if not os.path.exists(destdir):
|
||||
os.makedirs(destdir)
|
||||
|
||||
createDecryptedPayload(outdir)
|
||||
|
||||
|
||||
#
|
||||
# Set the command to execute by the programm according to cmdLine parameters
|
||||
#
|
||||
|
||||
def setCommand(name) :
|
||||
global command
|
||||
if command != "" :
|
||||
raise CMBDTCFatal("Invalid command line parameters")
|
||||
else :
|
||||
command = name
|
||||
|
||||
#
|
||||
# Program usage
|
||||
#
|
||||
|
||||
def usage():
|
||||
print("\nUsage:")
|
||||
print("\ncmbtc_dump_linux.py [options] bookFileName\n")
|
||||
print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)")
|
||||
print("-d Dumps the unencrypted book as files to outdir")
|
||||
print("-o Output directory to save book files to")
|
||||
print("-v Verbose (can be used several times)")
|
||||
|
||||
|
||||
#
|
||||
# Main
|
||||
#
|
||||
|
||||
def main(argv=sys.argv):
|
||||
global bookMetadata
|
||||
global bookKey
|
||||
global bookFile
|
||||
global command
|
||||
|
||||
progname = os.path.basename(argv[0])
|
||||
|
||||
verbose = 0
|
||||
recordName = ""
|
||||
recordIndex = 0
|
||||
outdir = ""
|
||||
PIDs = []
|
||||
command = ""
|
||||
|
||||
# Preloads your Kindle pid from the top of the program.
|
||||
PIDs.append(kindlePID)
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "vo:p:d")
|
||||
except getopt.GetoptError, err:
|
||||
# print help information and exit:
|
||||
print str(err) # will print something like "option -a not recognized"
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
if len(opts) == 0 and len(args) == 0 :
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
for o, a in opts:
|
||||
if o == "-v":
|
||||
verbose+=1
|
||||
if o =="-o":
|
||||
if a == None :
|
||||
raise CMBDTCFatal("Invalid parameter for -o")
|
||||
outdir = a
|
||||
if o =="-p":
|
||||
PIDs.append(a)
|
||||
if o =="-d":
|
||||
setCommand("doit")
|
||||
|
||||
if command == "" :
|
||||
raise CMBDTCFatal("No action supplied on command line")
|
||||
|
||||
#
|
||||
# Open book and parse metadata
|
||||
#
|
||||
|
||||
if len(args) == 1:
|
||||
|
||||
bookFile = openBook(args[0])
|
||||
parseTopazHeader()
|
||||
parseMetadata()
|
||||
|
||||
#
|
||||
# Decrypt book key
|
||||
#
|
||||
|
||||
dkey = getBookPayloadRecord('dkey', 0)
|
||||
|
||||
bookKeys = []
|
||||
for PID in PIDs :
|
||||
bookKeys+=decryptDkeyRecords(dkey,PID)
|
||||
|
||||
if len(bookKeys) == 0 :
|
||||
if verbose > 0 :
|
||||
print ("Book key could not be found. Maybe this book is not registered with this device.")
|
||||
else :
|
||||
bookKey = bookKeys[0]
|
||||
if verbose > 0:
|
||||
print("Book key: " + bookKey.encode('hex'))
|
||||
|
||||
|
||||
|
||||
if command == "printRecord" :
|
||||
extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
|
||||
if outputFile != "" and verbose>0 :
|
||||
print("Wrote record to file: "+outputFile)
|
||||
elif command == "doit" :
|
||||
if outdir != "" :
|
||||
createDecryptedBook(outdir)
|
||||
if verbose >0 :
|
||||
print ("Decrypted book saved. Don't pirate!")
|
||||
elif verbose > 0:
|
||||
print("Output directory name was not supplied.")
|
||||
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
|
@ -13,7 +13,8 @@ from struct import unpack
|
|||
class DocParser(object):
|
||||
def __init__(self, flatxml, classlst, fileid):
|
||||
self.id = os.path.basename(fileid).replace('.dat','')
|
||||
self.flatdoc = flatxml.split('\n')
|
||||
self.docList = flatxml.split('\n')
|
||||
self.docSize = len(self.docList)
|
||||
self.classList = {}
|
||||
tmpList = classlst.split('\n')
|
||||
for pclass in tmpList:
|
||||
|
@ -29,12 +30,10 @@ class DocParser(object):
|
|||
self.paracont_stemid = []
|
||||
self.parastems_stemid = []
|
||||
|
||||
# find tag if within pos to end inclusive
|
||||
# return tag at line pos in document
|
||||
def lineinDoc(self, pos) :
|
||||
docList = self.flatdoc
|
||||
cnt = len(docList)
|
||||
if (pos >= 0) and (pos < cnt) :
|
||||
item = docList[pos]
|
||||
if (pos >= 0) and (pos < self.docSize) :
|
||||
item = self.docList[pos]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
|
@ -43,20 +42,18 @@ class DocParser(object):
|
|||
return name, argres
|
||||
|
||||
|
||||
# find tag if within pos to end inclusive
|
||||
# find tag in doc if within pos to end inclusive
|
||||
def findinDoc(self, tagpath, pos, end) :
|
||||
result = None
|
||||
docList = self.flatdoc
|
||||
cnt = len(docList)
|
||||
if end == -1 :
|
||||
end = cnt
|
||||
end = self.docSize
|
||||
else:
|
||||
end = min(cnt,end)
|
||||
end = min(self.docSize, end)
|
||||
foundat = -1
|
||||
for j in xrange(pos, end):
|
||||
item = docList[j]
|
||||
item = self.docList[j]
|
||||
if item.find('=') >= 0:
|
||||
(name, argres) = item.split('=')
|
||||
(name, argres) = item.split('=',1)
|
||||
else :
|
||||
name = item
|
||||
argres = ''
|
||||
|
@ -85,7 +82,7 @@ class DocParser(object):
|
|||
|
||||
result = []
|
||||
|
||||
# normal paragraph
|
||||
# paragraph
|
||||
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
|
||||
|
||||
# class names are an issue given topaz may start them with numerals (not allowed),
|
||||
|
@ -94,19 +91,20 @@ class DocParser(object):
|
|||
# from a base class (but then not actually provide all of these _reclustereed
|
||||
# classes in the stylesheet!
|
||||
|
||||
# so we clean this up by lowercasing, prepend 'cl_', and getting any baseclass
|
||||
# so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
|
||||
# that exists in the stylesheet first, and then adding this specific class
|
||||
# after
|
||||
classres = ''
|
||||
pclass = pclass.lower()
|
||||
pclass = 'cl-' + pclass
|
||||
p = pclass.find('_')
|
||||
if p > 0 :
|
||||
baseclass = pclass[0:p]
|
||||
if baseclass in self.classList:
|
||||
classres += baseclass + ' '
|
||||
classres += pclass
|
||||
pclass = classres
|
||||
if pclass != None :
|
||||
classres = ''
|
||||
pclass = pclass.lower()
|
||||
pclass = 'cl-' + pclass
|
||||
p = pclass.find('_')
|
||||
if p > 0 :
|
||||
baseclass = pclass[0:p]
|
||||
if baseclass in self.classList:
|
||||
classres += baseclass + ' '
|
||||
classres += pclass
|
||||
pclass = classres
|
||||
|
||||
# build up a description of the paragraph in result and return it
|
||||
# first check for the basic - all words paragraph
|
||||
|
@ -128,9 +126,7 @@ class DocParser(object):
|
|||
|
||||
# if end is -1 then we must search to end of document
|
||||
if end == -1 :
|
||||
docList = self.flatdoc
|
||||
cnt = len(docList)
|
||||
end = cnt
|
||||
end = self.docSize
|
||||
|
||||
while (line < end) :
|
||||
|
||||
|
@ -171,20 +167,20 @@ class DocParser(object):
|
|||
return pclass, result
|
||||
|
||||
|
||||
def buildParagraph(self, cname, pdesc, type, regtype) :
|
||||
def buildParagraph(self, pclass, pdesc, type, regtype) :
|
||||
parares = ''
|
||||
sep =''
|
||||
|
||||
br_lb = False
|
||||
if (regtype == 'fixed') or (regtype == 'chapterheading'):
|
||||
br_lb = True
|
||||
classres = ''
|
||||
if pclass :
|
||||
classres = ' class="' + pclass + '"'
|
||||
|
||||
handle_links = False
|
||||
if len(self.link_id) > 0:
|
||||
handle_links = True
|
||||
br_lb = (regtype == 'fixed') or (regtype == 'chapterheading')
|
||||
|
||||
handle_links = len(self.link_id) > 0
|
||||
|
||||
if (type == 'full') or (type == 'begin') :
|
||||
parares += '<p class="' + cname + '">'
|
||||
parares += '<p' + classres + '>'
|
||||
|
||||
if (type == 'end'):
|
||||
parares += ' '
|
||||
|
@ -218,10 +214,7 @@ class DocParser(object):
|
|||
if word == '_link_' : word = ''
|
||||
|
||||
if word == '_lb_':
|
||||
if (num-1) in self.dehyphen_rootid :
|
||||
word = ''
|
||||
sep = ''
|
||||
elif handle_links :
|
||||
if ((num-1) in self.dehyphen_rootid ) or handle_links:
|
||||
word = ''
|
||||
sep = ''
|
||||
elif br_lb :
|
||||
|
@ -261,43 +254,51 @@ class DocParser(object):
|
|||
|
||||
htmlpage = ''
|
||||
|
||||
# first collect information from the xml doc that describes this page
|
||||
# get the ocr text
|
||||
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
|
||||
if argres : self.ocrtext = argres.split('|')
|
||||
|
||||
# get information to dehyphenate the text
|
||||
(pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
|
||||
if argres:
|
||||
argList = argres.split('|')
|
||||
self.dehyphen_rootid = [ int(strval) for strval in argList]
|
||||
|
||||
# determine if first paragraph is continued from previous page
|
||||
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
|
||||
if self.parastems_stemid == None : self.parastems_stemid = []
|
||||
|
||||
first_para_continued = (self.parastems_stemid != None)
|
||||
|
||||
# determine if last paragraph is continued onto the next page
|
||||
(pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
|
||||
if self.paracont_stemid == None : self.paracont_stemid = []
|
||||
|
||||
last_para_continued = (self.paracont_stemid != None)
|
||||
|
||||
# collect link ids
|
||||
(pos, argres) = self.findinDoc('info.word.link_id',0,-1)
|
||||
if argres:
|
||||
argList = argres.split('|')
|
||||
self.link_id = [ int(strval) for strval in argList]
|
||||
|
||||
# collect link destination page numbers
|
||||
(pos, argres) = self.findinDoc('info.links.page',0,-1)
|
||||
if argres :
|
||||
argList = argres.split('|')
|
||||
self.link_page = [ int(strval) for strval in argList]
|
||||
|
||||
# collect link titles
|
||||
(pos, argres) = self.findinDoc('info.links.title',0,-1)
|
||||
if argres :
|
||||
self.link_title = argres.split('|')
|
||||
else:
|
||||
self.link_title.append('')
|
||||
|
||||
|
||||
# get page type
|
||||
(pos, pagetype) = self.findinDoc('page.type',0,-1)
|
||||
|
||||
|
||||
# generate a list of each region starting point
|
||||
# each region has one paragraph,, or one image, or one chapterheading
|
||||
|
||||
regionList= self.posinDoc('region')
|
||||
regcnt = len(regionList)
|
||||
regionList.append(-1)
|
||||
|
@ -308,47 +309,48 @@ class DocParser(object):
|
|||
# process each region tag and convert what you can to html
|
||||
|
||||
for j in xrange(regcnt):
|
||||
|
||||
start = regionList[j]
|
||||
end = regionList[j+1]
|
||||
|
||||
(pos, regtype) = self.findinDoc('region.type',start,end)
|
||||
|
||||
# set anchor for link target on this page
|
||||
if not anchorSet and not first_para_continued:
|
||||
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
|
||||
anchorSet = True
|
||||
|
||||
if regtype == 'graphic' :
|
||||
if not anchorSet:
|
||||
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
|
||||
anchorSet = True
|
||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
||||
if simgsrc:
|
||||
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
||||
|
||||
|
||||
elif regtype == 'chapterheading' :
|
||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
||||
if not breakSet:
|
||||
htmlpage += '<div style="page-break-after: always;"> </div>\n'
|
||||
breakSet = True
|
||||
if not anchorSet:
|
||||
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
|
||||
anchorSet = True
|
||||
tag = 'h1'
|
||||
if pclass[3:7] == 'ch1-' : tag = 'h1'
|
||||
if pclass[3:7] == 'ch2-' : tag = 'h2'
|
||||
if pclass[3:7] == 'ch3-' : tag = 'h3'
|
||||
htmlpage += '<' + tag + ' class="' + pclass + '">'
|
||||
if pclass and (len(pclass) >= 7):
|
||||
if pclass[3:7] == 'ch1-' : tag = 'h1'
|
||||
if pclass[3:7] == 'ch2-' : tag = 'h2'
|
||||
if pclass[3:7] == 'ch3-' : tag = 'h3'
|
||||
htmlpage += '<' + tag + ' class="' + pclass + '">'
|
||||
else:
|
||||
htmlpage += '<' + tag + '>'
|
||||
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
|
||||
htmlpage += '</' + tag + '>'
|
||||
|
||||
|
||||
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') :
|
||||
ptype = 'full'
|
||||
# check to see if this is a continution from the previous page
|
||||
if (len(self.parastems_stemid) > 0):
|
||||
if first_para_continued :
|
||||
ptype = 'end'
|
||||
self.parastems_stemid=[]
|
||||
else:
|
||||
if not anchorSet:
|
||||
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
|
||||
anchorSet = True
|
||||
first_para_continued = False
|
||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
||||
if ptype == 'full' :
|
||||
if pclass and (len(pclass) >= 6) and (ptype == 'full'):
|
||||
tag = 'p'
|
||||
if pclass[3:6] == 'h1-' : tag = 'h4'
|
||||
if pclass[3:6] == 'h2-' : tag = 'h5'
|
||||
|
@ -359,28 +361,22 @@ class DocParser(object):
|
|||
else :
|
||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||
|
||||
|
||||
elif (regtype == 'tocentry') :
|
||||
ptype = 'full'
|
||||
# check to see if this is a continution from the previous page
|
||||
if (len(self.parastems_stemid) > 0) and (j == 0):
|
||||
# process the first paragraph as a continuation from the last page
|
||||
if first_para_continued :
|
||||
ptype = 'end'
|
||||
self.parastems_stemid = []
|
||||
else:
|
||||
if not anchorSet:
|
||||
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
|
||||
anchorSet = True
|
||||
first_para_continued = False
|
||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||
|
||||
|
||||
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
|
||||
if not anchorSet:
|
||||
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
|
||||
anchorSet = True
|
||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
||||
if simgsrc:
|
||||
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
||||
|
||||
|
||||
else :
|
||||
print 'Warning: Unknown region type', regtype
|
||||
(pos, temp) = self.findinDoc('paragraph',start,end)
|
||||
|
@ -389,15 +385,11 @@ class DocParser(object):
|
|||
regtype = 'fixed'
|
||||
ptype = 'full'
|
||||
# check to see if this is a continution from the previous page
|
||||
if (len(self.parastems_stemid) > 0):
|
||||
if first_para_continued :
|
||||
ptype = 'end'
|
||||
self.parastems_stemid=[]
|
||||
else:
|
||||
if not anchorSet:
|
||||
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
|
||||
anchorSet = True
|
||||
first_para_continued = False
|
||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
||||
if ptype == 'full' :
|
||||
if pclass and (ptype == 'full') and (len(pclass) >= 6):
|
||||
tag = 'p'
|
||||
if pclass[3:6] == 'h1-' : tag = 'h4'
|
||||
if pclass[3:6] == 'h2-' : tag = 'h5'
|
||||
|
@ -408,24 +400,20 @@ class DocParser(object):
|
|||
else :
|
||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||
else :
|
||||
print 'Treating this like a "image" region'
|
||||
if not anchorSet:
|
||||
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
|
||||
anchorSet = True
|
||||
print 'Treating this like a "graphic" region'
|
||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
||||
if simgsrc:
|
||||
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
||||
|
||||
if len(self.paracont_stemid) > 0 :
|
||||
|
||||
if last_para_continued :
|
||||
if htmlpage[-4:] == '</p>':
|
||||
htmlpage = htmlpage[0:-4]
|
||||
htmlpage = htmlpage[0:-4]
|
||||
last_para_continued = False
|
||||
|
||||
return htmlpage
|
||||
|
||||
|
||||
return self.convert2HTML()
|
||||
|
||||
|
||||
|
||||
def convert2HTML(flatxml, classlst, fileid):
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ Contributors:
|
|||
clarknova - for all of the svg and glyph generation and many other bug fixes and improvements
|
||||
skindle - for figuing out the general case for the mode loops
|
||||
some updates - for conversion to xml, basic html
|
||||
DiapDealer - for extensive testing and feedback
|
||||
DiapDealer - for extensive testing and feedback, and standalone linux/macosx version of cmbtc_dump
|
||||
stewball - for extensive testing and feedback
|
||||
|
||||
and others for posting, feedback and testing
|
||||
|
@ -29,6 +29,17 @@ genxml.py - main program to convert everything to xml
|
|||
genhtml.py - main program to generate "book.html"
|
||||
gensvg.py - (author: clarknova) main program to create an svg grpahic of each page
|
||||
|
||||
|
||||
In addition there is now a new file:
|
||||
|
||||
cmbtc_dump_mac_linux.py
|
||||
|
||||
If you know the pid of your ipod and/or your standalone Kindle and your book
|
||||
was meant for that device, you can use this program to dump the proper sections
|
||||
on Mac OSX and Linux (and even Windows if you do not have Kindle4PC installed).
|
||||
Thank DiapDealer for creating it!
|
||||
|
||||
|
||||
Please note, gensvg.py, genhtml.py, and genxml.py import and use
|
||||
decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py
|
||||
so please keep all of these python scripts together in the same place.
|
||||
|
|
Loading…
Reference in New Issue