topazscripts 1.6

2010-01-21 12:14:31 +00:00 · 2010-01-21 12:14:31 +00:00 · 58e9c973ab
parent a1fec0b54d
commit 58e9c973ab
4 changed files with 598 additions and 90 deletions
--- a/Topaz_Tools/lib/changes.txt
+++ b/Topaz_Tools/lib/changes.txt
@ -1,3 +1,8 @@
 Changes in version 1.6
 	- support for books whose paragraphs have no styles
 	- support to run cmbtc_dump on Linux and Mac OSX, provided you know the PID of your iPod or standalone Kindle
 	 (contributed by DiapDealer)
 Changes in version 1.5
 	- completely reworked generation of styles to use actual page heights and widths
 	- added new script getpagedim.py to support the above
--- a/Topaz_Tools/lib/cmbtc_dump_mac_linux.py
+++ b/Topaz_Tools/lib/cmbtc_dump_mac_linux.py
@ -0,0 +1,504 @@
 #! /usr/bin/python
 from __future__ import with_statement
 import csv
 import sys
 import os
 import getopt
 import zlib
 from struct import pack
 from struct import unpack
 # Maximum path length constant; not referenced by the functions below.
 MAX_PATH = 255
 # Put the first 8 characters of your Kindle PID here
 # or supply it with the -p option in the command line
 ####################################################
 kindlePID = "12345678"
 ####################################################
 # Names of the module-level state shared by the functions below.
 # NOTE(review): a "global" statement at module scope does not create or
 # change anything; these lines only list the shared names.  The values are
 # assigned later by parseTopazHeader(), parseMetadata() and main().
 global bookFile
 global bookPayloadOffset
 global bookHeaderRecords
 global bookMetadata
 global bookKey
 global command
 #
 # Exceptions for all the problems that might happen during the script
 #
 class CMBDTCError(Exception):
    """Recoverable problem, such as a dkey record that does not match a PID."""
 class CMBDTCFatal(Exception):
    """Problem that aborts processing (bad file, bad header, bad command line)."""
 #
 # Open the book file at path
 #
 def openBook(path):
    """Open the book file at *path* for binary reading.

    Returns the open file object.
    Raises CMBDTCFatal when the file cannot be opened.
    """
    try:
        return open(path, 'rb')
    except (IOError, OSError):
        # Only genuine I/O failures are translated into CMBDTCFatal; other
        # errors (for example a non-string path) are no longer swallowed
        # by a bare except and surface as what they are.
        raise CMBDTCFatal("Could not open book file: " + path)
 #
 # Get a 7 bit encoded number from the book file
 #
 def bookReadEncodedNumber():
    """Read one 7-bit encoded number from the global bookFile.

    A leading 0xFF byte marks the number as negative.  Each following byte
    contributes its low 7 bits, most significant group first; a byte with
    the high bit set means that another byte follows.
    """
    read = bookFile.read
    current = ord(read(1))
    negative = (current == 0xFF)
    if negative:
        # The sign marker carries no digits; the value starts with the next byte.
        current = ord(read(1))
    value = current & 0x7F
    while current >= 0x80:
        current = ord(read(1))
        value = (value << 7) | (current & 0x7F)
    if negative:
        return -value
    return value
 #
 # Encode a number in 7 bit format
 #
 def encodeNumber(number):
    """Return *number* as a string in the 7-bit variable-length format.

    The value is emitted most significant group first; every byte except
    the last has its high bit set.  A negative input is stored as
    (-number + 1) preceded by a 0xFF marker byte.
    """
    print("Using encodeNumber routine")
    is_negative = number < 0
    if is_negative:
        number = -number + 1
    # Groups are collected least significant first and reversed at the end.
    pieces = []
    continuation = 0
    while True:
        pieces.append(chr((number & 0x7F) + continuation))
        number >>= 7
        continuation = 0x80
        if number == 0:
            break
    if is_negative:
        pieces.append(chr(0xFF))
    pieces.reverse()
    return "".join(pieces)
 #
 # Get a length prefixed string from the file 
 #
 def bookReadString():
    """Read a length-prefixed string from the global bookFile.

    The length is a 7-bit encoded number; unpack() raises struct.error when
    fewer bytes than announced could be read.
    """
    size = bookReadEncodedNumber()
    raw = bookFile.read(size)
    layout = str(size) + "s"
    return unpack(layout, raw)[0]
 #
 # Returns a length prefixed string
 #
 def lengthPrefixString(data):
    """Return *data* preceded by its length in 7-bit encoded form."""
    prefix = encodeNumber(len(data))
    return prefix + data
 #
 # Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
 #
 def bookReadHeaderRecordData():
    """Read the value triples of one header record from the book file.

    Returns a list with one [first, second, third] entry per announced
    value, each element read in file order as a 7-bit encoded number.
    """
    count = bookReadEncodedNumber()
    return [
        [bookReadEncodedNumber(), bookReadEncodedNumber(), bookReadEncodedNumber()]
        for _ in range(0, count)
    ]
 #
 # Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
 #
 def parseTopazHeaderRecord():
    """Parse one header record at the current book file position.

    Returns [tag, values]; raises CMBDTCFatal when the record does not
    start with the 0x63 marker byte.
    """
    marker = ord(bookFile.read(1))
    if marker == 0x63:
        name = bookReadString()
        entries = bookReadHeaderRecordData()
        return [name, entries]
    raise CMBDTCFatal("Parse Error : Invalid Header")
 #
 # Parse the header of a Topaz file, get all the header records and the offset for the payload
 #
 def parseTopazHeader():
    """Parse the Topaz file header from the global bookFile.

    Fills the global bookHeaderRecords (tag -> list of value triples) and
    sets bookPayloadOffset to the file position just after the header.
    Raises CMBDTCFatal on a wrong magic value or a missing 0x64 end marker.
    """
    global bookHeaderRecords
    global bookPayloadOffset
    signature = unpack("4s", bookFile.read(4))[0]
    if signature != 'TPZ0':
        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")
    record_count = bookReadEncodedNumber()
    bookHeaderRecords = {}
    for _ in range(0, record_count):
        tag, entries = parseTopazHeaderRecord()
        # Single-argument form: prints "<tag> <entries>" exactly like the
        # two-item print statement did.
        print("%s %s" % (tag, entries))
        bookHeaderRecords[tag] = entries
    end_marker = ord(bookFile.read(1))
    if end_marker != 0x64:
        raise CMBDTCFatal("Parse Error : Invalid Header")
    bookPayloadOffset = bookFile.tell()
 #
 # Get a record in the book payload, given its name and index. If necessary the record is decrypted
 # and, when it is stored compressed, decompressed as well
 #
 def getBookPayloadRecord(name, index):
    """Return the payload record *name*/*index*, decrypted and decompressed.

    The header entry for the record is [offset, decompressedLength,
    compressedLength].  A negative index stored in the payload marks the
    record as encrypted with the global bookKey; a compressedLength above
    zero marks it as zlib-compressed.

    Raises CMBDTCFatal when the record is not listed in the header or when
    the name or index stored in the payload does not match the request.
    """
    try:
        entry = bookHeaderRecords[name][index]
        recordOffset = entry[0]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures mean "record not found"; anything else is a
        # real error and must not be disguised.
        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")
    bookFile.seek(bookPayloadOffset + recordOffset)
    tag = bookReadString()
    if tag != name:
        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")
    recordIndex = bookReadEncodedNumber()
    encrypted = recordIndex < 0
    if encrypted:
        # Encrypted records store their index as -(index + 1).
        recordIndex = -recordIndex - 1
    if recordIndex != index:
        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")
    compressed = entry[2] > 0
    if compressed:
        record = bookFile.read(entry[2])
    else:
        record = bookFile.read(entry[1])
    if encrypted:
        ctx = topazCryptoInit(bookKey)
        record = topazCryptoDecrypt(record, ctx)
    if compressed:
        # Decompression happens after decryption: the compressed stream is
        # what was encrypted.
        record = zlib.decompress(record)
    return record
 #
 # Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
 #
 def extractBookPayloadRecord(name, index, filename):
    """Fetch record *name*/*index* and save it to *filename* or print it.

    When the record cannot be obtained, "Could not find record" is printed
    and nothing else happens.  An empty *filename* prints the record to
    standard output.  Raises CMBDTCFatal when the destination file cannot
    be written.
    """
    try:
        record = getBookPayloadRecord(name, index)
    except Exception:
        # Best effort, as before -- but stop here: there is no record to
        # write, and continuing used to fail on the unbound variable (or
        # report a misleading write error after creating an empty file).
        print("Could not find record")
        return
    if filename != "":
        try:
            # The with-block closes the file even when the write fails.
            with open(filename, "wb") as out:
                out.write(record)
        except (IOError, OSError):
            raise CMBDTCFatal("Could not write to destination file")
    else:
        print(record)
 #
 # return next record [key,value] from the book metadata from the current book position
 #  
 def readMetadataRecord():
    """Read the next [key, value] pair of strings from the book file."""
    key = bookReadString()
    value = bookReadString()
    return [key, value]
 #
 # Parse the metadata record from the book payload and return a list of [key,values]
 #
 def parseMetadata():
    """Load the "metadata" payload record into the global bookMetadata dict.

    Raises CMBDTCFatal when the record found at the metadata offset is not
    tagged "metadata".
    """
    global bookMetadata
    bookMetadata = {}
    metadata_offset = bookHeaderRecords["metadata"][0][0]
    bookFile.seek(bookPayloadOffset + metadata_offset)
    if bookReadString() != "metadata":
        raise CMBDTCFatal("Parse Error : Record Names Don't Match")
    # The flags byte has to be consumed to reach the count; its value is unused.
    _flags = ord(bookFile.read(1))
    pair_count = ord(bookFile.read(1))
    for _ in range(0, pair_count):
        key, value = readMetadataRecord()
        bookMetadata[key] = value
 #
 # Context initialisation for the Topaz Crypto
 #
 def topazCryptoInit(key):
    """Derive the two-word Topaz cipher context from *key*.

    Returns [ctx1, ctx2] where ctx2 is the value ctx1 had before the last
    key character was mixed in.  *key* must not be empty: ctx2 is only
    assigned inside the loop.
    """
    state = 0x0CAFFE19E
    for symbol in key:
        code = ord(symbol)
        previous = state
        mixed = ((state >> 2) * (state >> 7)) & 0xFFFFFFFF
        salt = (code * code * 0x0F902007) & 0xFFFFFFFF
        state = mixed ^ salt
    return [state, previous]
 #
 # decrypt data with the context prepared by topazCryptoInit()
 #
 def topazCryptoDecrypt(data, ctx):
    """Decrypt *data* with a context prepared by topazCryptoInit().

    ctx is [ctx1, ctx2]; it is read but not modified.  Returns the plain
    text as a string with one character per input character.
    """
    ctx1 = ctx[0]
    ctx2 = ctx[1]
    # Collect the characters in a list and join once at the end instead of
    # growing a string with += for every byte.
    plain = []
    for dataChar in data:
        dataByte = ord(dataChar)
        m = (dataByte ^ ((ctx1 >> 3) & 0xFF) ^ ((ctx2 << 3) & 0xFF)) & 0xFF
        # The context advances on the decrypted byte, not on the cipher byte.
        ctx2 = ctx1
        ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) & 0xFFFFFFFF) ^ ((m * m * 0x0F902007) & 0xFFFFFFFF)
        plain.append(chr(m))
    return "".join(plain)
 #
 # Decrypt a payload record with the PID
 #
 def decryptRecord(data,PID):
    """Decrypt *data* using a cipher context derived from *PID*."""
    return topazCryptoDecrypt(data, topazCryptoInit(PID))
 #
 # Try to decrypt a dkey record (contains the book PID)
 #
 def decryptDkeyRecord(data,PID):
    """Decrypt one dkey record with *PID* and return the 8-byte book key.

    The decrypted record must be laid out as "PID", a length byte of 8,
    the PID, a length byte of 8, the key, and "pid".
    Raises CMBDTCError when the record has the wrong size or does not
    match that layout or the given PID.
    """
    record = decryptRecord(data,PID)
    # "3sB8sB8s3s" describes exactly 24 bytes.  Checking the size first
    # turns a malformed record into the CMBDTCError the caller skips over,
    # instead of a struct.error that would abort the whole run.
    if len(record) != 24:
        raise CMBDTCError("Record has the wrong size for a dkey entry")
    fields = unpack("3sB8sB8s3s",record)
    if fields[0] != "PID" or fields[5] != "pid" :
        raise CMBDTCError("Didn't find PID magic numbers in record")
    elif fields[1] != 8 or fields[3] != 8 :
        raise CMBDTCError("Record didn't contain correct length fields")
    elif fields[2] != PID :
        raise CMBDTCError("Record didn't contain PID")
    return fields[4]
 #
 # Decrypt all the book's dkey records (contain the book PID)
 #
 def decryptDkeyRecords(data,PID):
    """Return the book keys of all dkey records that decrypt with *PID*.

    *data* starts with a count byte, followed by that many records, each
    preceded by its own length byte.  Records that raise CMBDTCError are
    skipped.
    """
    count = ord(data[0])
    keys = []
    cursor = 1
    for _ in range(0, count):
        size = ord(data[cursor])
        body_start = cursor + 1
        try:
            keys.append(decryptDkeyRecord(data[body_start:body_start + size], PID))
        except CMBDTCError:
            pass
        cursor = body_start + size
    return keys
 #
 # Create decrypted book payload
 #
 def createDecryptedPayload(payload):
    """Write every payload record except "dkey" below the directory *payload*.

    Files are named <record name><4-digit index><ext>, where ext is ".jpg"
    for "img" records and ".dat" otherwise.  "img", "page" and "glyphs"
    records go into sub-directories of the same name, which must already
    exist; all other records are written directly into *payload*.
    """
    subdirs = ('img', 'page', 'glyphs')
    for name in bookHeaderRecords:
        if name == "dkey":
            continue
        if name == 'img':
            ext = '.jpg'
        else:
            ext = '.dat'
        if name in subdirs:
            destdir = os.path.join(payload, name)
        else:
            destdir = payload
        for index in range(0, len(bookHeaderRecords[name])):
            fname = name + ("%04d" % index) + ext
            outputFile = os.path.join(destdir, fname)
            # Fetch the record before creating the file so that a failing
            # record does not leave an empty file behind, and let the
            # with-block close the handle deterministically.
            record = getBookPayloadRecord(name, index)
            with open(outputFile, 'wb') as out:
                out.write(record)
 # Create decrypted book
 #
 def createDecryptedBook(outdir):
    """Create *outdir* with its img/page/glyphs sub-directories and dump the book into it."""
    wanted = [outdir]
    for sub in ('img', 'page', 'glyphs'):
        wanted.append(os.path.join(outdir, sub))
    for directory in wanted:
        if not os.path.exists(directory):
            os.makedirs(directory)
    createDecryptedPayload(outdir)
 #
 # Set the command to be executed by the program according to the cmdLine parameters
 #
 def setCommand(name) :
    """Store *name* in the global command; raise CMBDTCFatal if one is already set."""
    global command
    if command == "":
        command = name
        return
    raise CMBDTCFatal("Invalid command line parameters")
 # 
 # Program usage
 #
 def usage():
    """Print the command-line help text to standard output."""
    print("\nUsage:")
    # The script is cmbtc_dump_mac_linux.py; the help text used to name a
    # file that does not exist.
    print("\ncmbtc_dump_mac_linux.py [options] bookFileName\n")
    print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)")
    print("-d Dumps the unencrypted book as files to outdir")
    print("-o Output directory to save book files to")
    print("-v Verbose (can be used several times)")
 #
 # Main
 #   
 def main(argv=sys.argv):
    """Command-line entry point.

    Parses the options, opens the book, recovers the book key with the
    known PIDs and, for -d, dumps the decrypted payload into the -o
    directory.

    argv -- full argument vector; argv[0] is the program name and the
            remaining items are parsed with getopt.

    Returns 0.  Calls sys.exit(2) for unknown options or when no arguments
    are given; CMBDTCFatal is raised for an invalid command line and
    propagates from the book-parsing helpers.
    """
    global bookMetadata
    global bookKey
    global bookFile
    global command
    verbose = 0
    recordName = ""
    recordIndex = 0
    # Defined so the "printRecord" branch below refers to an existing name;
    # no option visible here selects that command or sets these values.
    outputFile = ""
    outdir = ""
    PIDs = []
    command = ""
    # Preloads your Kindle pid from the top of the program.
    PIDs.append(kindlePID)
    try:
        # Parse the vector handed to main() rather than sys.argv, so that
        # callers passing their own argv are honoured.
        opts, args = getopt.getopt(argv[1:], "vo:p:d")
    except getopt.GetoptError:
        # sys.exc_info() works on every Python version, unlike either
        # spelling of the "except ... err" clause.
        err = sys.exc_info()[1]
        # print help information and exit:
        print(str(err)) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2)
    for o, a in opts:
        if o == "-v":
            verbose += 1
        if o == "-o":
            if a is None :
                raise CMBDTCFatal("Invalid parameter for -o")
            outdir = a
        if o == "-p":
            PIDs.append(a)
        if o == "-d":
            setCommand("doit")
    if command == "" :
        raise CMBDTCFatal("No action supplied on command line")
    #
    # Open book and parse metadata
    #
    if len(args) == 1:
        bookFile = openBook(args[0])
        parseTopazHeader()
        parseMetadata()
    #
    #  Decrypt book key
    #
        dkey = getBookPayloadRecord('dkey', 0)
        bookKeys = []
        for PID in PIDs :
            bookKeys += decryptDkeyRecords(dkey,PID)
        if len(bookKeys) == 0 :
            if verbose > 0 :
                print ("Book key could not be found. Maybe this book is not registered with this device.")
        else :
            # The first key that any PID unlocked is used for the whole book.
            bookKey = bookKeys[0]
            if verbose > 0:
                print("Book key: " + bookKey.encode('hex'))
            if command == "printRecord" :
                extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
                if outputFile != "" and verbose>0 :
                    print("Wrote record to file: "+outputFile)
            elif command == "doit" :
                if outdir != "" :
                    createDecryptedBook(outdir)
                    if verbose >0 :
                        print ("Decrypted book saved. Don't pirate!")
                elif verbose > 0:
                    print("Output directory name was not supplied.")
    return 0
 # Run main() only when this file is executed as a script, and use its
 # return value as the process exit status.
 if __name__ == '__main__':
    sys.exit(main())
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@ -13,7 +13,8 @@ from struct import unpack
 class DocParser(object):
    def __init__(self, flatxml, classlst, fileid):
        self.id = os.path.basename(fileid).replace('.dat','')
-        self.flatdoc = flatxml.split('\n')
+        self.docList = flatxml.split('\n')
        self.docSize = len(self.docList)
        self.classList = {}
        tmpList = classlst.split('\n')
        for pclass in tmpList:
@ -29,12 +30,10 @@ class DocParser(object):
        self.paracont_stemid = []
        self.parastems_stemid = []
-    # find tag if within pos to end inclusive
+    # return tag at line pos in document
    def lineinDoc(self, pos) :
-        docList = self.flatdoc
+        if (pos >= 0) and (pos < self.docSize) :
-        cnt = len(docList)
+            item = self.docList[pos]
        if (pos >= 0) and (pos < cnt) :
            item = docList[pos]
            if item.find('=') >= 0:
                (name, argres) = item.split('=',1)
            else : 
@ -43,20 +42,18 @@ class DocParser(object):
        return name, argres
-    # find tag if within pos to end inclusive
+    # find tag in doc if within pos to end inclusive
    def findinDoc(self, tagpath, pos, end) :
        result = None
        docList = self.flatdoc
        cnt = len(docList)
        if end == -1 :
-            end = cnt
+            end = self.docSize
        else:
-            end = min(cnt,end)
+            end = min(self.docSize, end)
        foundat = -1
        for j in xrange(pos, end):
-            item = docList[j]
+            item = self.docList[j]
            if item.find('=') >= 0:
-                (name, argres) = item.split('=')
+                (name, argres) = item.split('=',1)
            else : 
                name = item
                argres = ''
@ -85,7 +82,7 @@ class DocParser(object):
        result = []
-        # normal paragraph
+        # paragraph
        (pos, pclass) = self.findinDoc('paragraph.class',start,end) 
        # class names are an issue given topaz may start them with numerals (not allowed),
@ -94,19 +91,20 @@ class DocParser(object):
        # from a base class (but then not actually provide all of these _reclustereed 
        # classes in the stylesheet!
-        # so we clean this up by lowercasing, prepend 'cl_', and getting any baseclass
+        # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
        # that exists in the stylesheet first, and then adding this specific class
        # after
-        classres = ''
+        if pclass != None :
-        pclass = pclass.lower()
+            classres = ''
-        pclass = 'cl-' + pclass
+            pclass = pclass.lower()
-        p = pclass.find('_')
+            pclass = 'cl-' + pclass
-        if p > 0 :
+            p = pclass.find('_')
-            baseclass = pclass[0:p]
+            if p > 0 :
-            if baseclass in self.classList:
+                baseclass = pclass[0:p]
-                classres += baseclass + ' '
+                if baseclass in self.classList:
-        classres += pclass
+                    classres += baseclass + ' '
-        pclass = classres
+            classres += pclass
            pclass = classres
        # build up a description of the paragraph in result and return it
        # first check for the  basic - all words paragraph
@ -128,9 +126,7 @@ class DocParser(object):
        # if end is -1 then we must search to end of document
        if end == -1 :
-            docList = self.flatdoc
+            end = self.docSize
            cnt = len(docList)
            end = cnt
        while (line < end) :
@ -171,20 +167,20 @@ class DocParser(object):
        return pclass, result
-    def buildParagraph(self, cname, pdesc, type, regtype) :
+    def buildParagraph(self, pclass, pdesc, type, regtype) :
        parares = ''
        sep =''
-        br_lb = False
+        classres = ''
-        if (regtype == 'fixed') or (regtype == 'chapterheading'):
+        if pclass :
-            br_lb = True
+            classres = ' class="' + pclass + '"'
-        handle_links = False
+        br_lb = (regtype == 'fixed') or (regtype == 'chapterheading')
        if len(self.link_id) > 0:
            handle_links = True
        handle_links = len(self.link_id) > 0
        if (type == 'full') or (type == 'begin') :
-            parares += '<p class="' + cname + '">'
+            parares += '<p' + classres + '>'
        if (type == 'end'):
            parares += ' '
@ -218,10 +214,7 @@ class DocParser(object):
                        if word == '_link_' : word = ''
                if word == '_lb_':
-                    if (num-1) in self.dehyphen_rootid :
+                    if ((num-1) in self.dehyphen_rootid ) or handle_links:
                        word = ''
                        sep = ''
                    elif handle_links :
                        word = ''
                        sep = ''
                    elif br_lb :
@ -261,43 +254,51 @@ class DocParser(object):
        htmlpage = ''
-        # first collect information from the xml doc that describes this page
+        # get the ocr text
        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
        if argres :  self.ocrtext = argres.split('|')
        # get information to dehyphenate the text
        (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
        if argres: 
            argList = argres.split('|')
            self.dehyphen_rootid = [ int(strval) for strval in argList]
        # determine if first paragraph is continued from previous page
        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
-        if self.parastems_stemid == None : self.parastems_stemid = []
+        first_para_continued = (self.parastems_stemid  != None) 
- 
+        
        # determine if last paragraph is continued onto the next page
        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
-        if self.paracont_stemid == None : self.paracont_stemid = []
+        last_para_continued = (self.paracont_stemid != None)
        # collect link ids
        (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
        if argres:
            argList = argres.split('|')
            self.link_id = [ int(strval) for strval in argList]
        # collect link destination page numbers
        (pos, argres) = self.findinDoc('info.links.page',0,-1)
        if argres :
            argList = argres.split('|')
            self.link_page = [ int(strval) for strval in argList]
        # collect link titles
        (pos, argres) = self.findinDoc('info.links.title',0,-1)
        if argres :
            self.link_title = argres.split('|')
        else:
            self.link_title.append('')
        # get page type
        (pos, pagetype) = self.findinDoc('page.type',0,-1)
        # generate a list of each region starting point
        # each region has one paragraph,, or one image, or one chapterheading
        regionList= self.posinDoc('region')
        regcnt = len(regionList)
        regionList.append(-1)
@ -308,47 +309,48 @@ class DocParser(object):
        # process each region tag and convert what you can to html
        for j in xrange(regcnt):
            start = regionList[j]
            end = regionList[j+1]
            (pos, regtype) = self.findinDoc('region.type',start,end)
            # set anchor for link target on this page
            if not anchorSet and not first_para_continued:
                htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                anchorSet = True
            if regtype == 'graphic' :
                if not anchorSet:
                    htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                    anchorSet = True
                (pos, simgsrc) = self.findinDoc('img.src',start,end)
                if simgsrc:
                    htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
            elif regtype == 'chapterheading' :
                (pclass, pdesc) = self.getParaDescription(start,end)
                if not breakSet:
                    htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
                    breakSet = True
                if not anchorSet:
                    htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                    anchorSet = True
                tag = 'h1'
-                if pclass[3:7] == 'ch1-' : tag = 'h1'
+                if pclass and (len(pclass) >= 7):
-                if pclass[3:7] == 'ch2-' : tag = 'h2'
+                    if pclass[3:7] == 'ch1-' : tag = 'h1'
-                if pclass[3:7] == 'ch3-' : tag = 'h3'
+                    if pclass[3:7] == 'ch2-' : tag = 'h2'
-                htmlpage += '<' + tag + ' class="' + pclass + '">'
+                    if pclass[3:7] == 'ch3-' : tag = 'h3'
                    htmlpage += '<' + tag + ' class="' + pclass + '">'
                else:
                    htmlpage += '<' + tag + '>'
                htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                htmlpage += '</' + tag + '>'
            elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') :
                ptype = 'full'
                # check to see if this is a continution from the previous page
-                if (len(self.parastems_stemid) > 0):
+                if first_para_continued :
                    ptype = 'end'
-                    self.parastems_stemid=[]
+                    first_para_continued = False
                else:
                    if not anchorSet:
                        htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                        anchorSet = True
                (pclass, pdesc) = self.getParaDescription(start,end)
-                if ptype == 'full' :
+                if pclass and (len(pclass) >= 6) and (ptype == 'full'):
                    tag = 'p'
                    if pclass[3:6] == 'h1-' : tag = 'h4'
                    if pclass[3:6] == 'h2-' : tag = 'h5'
@ -359,28 +361,22 @@ class DocParser(object):
                else :
                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
            elif (regtype == 'tocentry') :
                ptype = 'full'
-                # check to see if this is a continution from the previous page
+                if first_para_continued :
                if (len(self.parastems_stemid) > 0) and (j == 0):
                    # process the first paragraph as a continuation from the last page
                    ptype = 'end'
-                    self.parastems_stemid = []
+                    first_para_continued = False
                else:
                    if not anchorSet:
                        htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                        anchorSet = True
                (pclass, pdesc) = self.getParaDescription(start,end)
                htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
            elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
                if not anchorSet:
                    htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                    anchorSet = True
                (pos, simgsrc) = self.findinDoc('img.src',start,end)
                if simgsrc:
                    htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
            else :
                print 'Warning: Unknown region type', regtype
                (pos, temp) = self.findinDoc('paragraph',start,end)
@ -389,15 +385,11 @@ class DocParser(object):
                    regtype = 'fixed'
                    ptype = 'full'
                    # check to see if this is a continution from the previous page
-                    if (len(self.parastems_stemid) > 0):
+                    if first_para_continued :
                        ptype = 'end'
-                        self.parastems_stemid=[]
+                        first_para_continued = False
                    else:
                        if not anchorSet:
                            htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                            anchorSet = True
                    (pclass, pdesc) = self.getParaDescription(start,end)
-                    if ptype == 'full' :
+                    if pclass and (ptype == 'full') and (len(pclass) >= 6):
                        tag = 'p'
                        if pclass[3:6] == 'h1-' : tag = 'h4'
                        if pclass[3:6] == 'h2-' : tag = 'h5'
@ -408,24 +400,20 @@ class DocParser(object):
                    else :
                        htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
                else :
-                    print 'Treating this like a "image" region'
+                    print 'Treating this like a "graphic" region'
                    if not anchorSet:
                        htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                        anchorSet = True
                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
                    if simgsrc:
                        htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
-        if len(self.paracont_stemid) > 0 :
+
        if last_para_continued :
            if htmlpage[-4:] == '</p>':
-                htmlpage = htmlpage[0:-4]    
+                htmlpage = htmlpage[0:-4]
            last_para_continued = False
        return htmlpage
        return self.convert2HTML()
 def convert2HTML(flatxml, classlst, fileid):
--- a/Topaz_Tools/lib/readme.txt
+++ b/Topaz_Tools/lib/readme.txt
@ -3,7 +3,7 @@ Contributors:
     clarknova - for all of the svg and glyph generation and many other bug fixes and improvements
     skindle - for figuring out the general case for the mode loops
     some updates -  for conversion to xml, basic html
-     DiapDealer - for extensive testing and feedback
+     DiapDealer - for extensive testing and feedback, and standalone linux/macosx version of cmbtc_dump
     stewball - for extensive testing and feedback
 and others for posting, feedback and testing
@ -29,6 +29,17 @@ genxml.py - main program to convert everything to xml
 genhtml.py - main program to generate "book.html"
 gensvg.py - (author: clarknova) main program to create an svg graphic of each page
 In addition there is now a new file:
 cmbtc_dump_mac_linux.py  
 If you know the PID of your iPod and/or your standalone Kindle and your book
 was meant for that device, you can use this program to dump the proper sections
 on Mac OSX and Linux (and even Windows if you do not have Kindle4PC installed).
 Thank DiapDealer for creating it!
 Please note, gensvg.py, genhtml.py, and genxml.py import and use
 decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py 
 so please keep all of these python scripts together in the same place.