More fixes for Amazon books, fixing identity checks, started on Topaz.
This commit is contained in:
parent dc27c36761
commit 939cdbb0c9
@@ -56,7 +56,7 @@ def readEncodedNumber(file):
             c = file.read(1)
             if (len(c) == 0):
                 return None
-            data = ord(c)
+            data = c[0]
             datax = (datax <<7) + (data & 0x7F)
         data = datax
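
The one-line change above is the core Python 3 byte-handling fix: read(1) returns a bytes object, and indexing it with c[0] yields the int directly. A minimal, runnable sketch of the 7-bit continuation decoding this function performs (the real readEncodedNumber also handles sign/terminator markers not shown here):

    import io

    def read_encoded_number(f):
        # Bytes with the high bit set mean "more bytes follow"; each byte
        # contributes its low 7 bits, most significant first.
        c = f.read(1)
        if len(c) == 0:
            return None
        data = c[0]                  # bytes indexing yields an int in Python 3
        if data >= 0x80:
            datax = data & 0x7F
            while data >= 0x80:
                c = f.read(1)
                if len(c) == 0:
                    return None
                data = c[0]
                datax = (datax << 7) + (data & 0x7F)
            data = datax
        return data

    print(read_encoded_number(io.BytesIO(b'\x81\x05')))   # (1 << 7) + 5 = 133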
@@ -188,232 +188,232 @@ class PageParser(object):
     # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)

     token_tags = {
-        'x' : (1, 'scalar_number', 0, 0),
-        'y' : (1, 'scalar_number', 0, 0),
-        'h' : (1, 'scalar_number', 0, 0),
-        'w' : (1, 'scalar_number', 0, 0),
-        'firstWord' : (1, 'scalar_number', 0, 0),
-        'lastWord' : (1, 'scalar_number', 0, 0),
-        'rootID' : (1, 'scalar_number', 0, 0),
-        'stemID' : (1, 'scalar_number', 0, 0),
-        'type' : (1, 'scalar_text', 0, 0),
+        b'x' : (1, 'scalar_number', 0, 0),
+        b'y' : (1, 'scalar_number', 0, 0),
+        b'h' : (1, 'scalar_number', 0, 0),
+        b'w' : (1, 'scalar_number', 0, 0),
+        b'firstWord' : (1, 'scalar_number', 0, 0),
+        b'lastWord' : (1, 'scalar_number', 0, 0),
+        b'rootID' : (1, 'scalar_number', 0, 0),
+        b'stemID' : (1, 'scalar_number', 0, 0),
+        b'type' : (1, 'scalar_text', 0, 0),

-        'info' : (0, 'number', 1, 0),
+        b'info' : (0, 'number', 1, 0),

-        'info.word' : (0, 'number', 1, 1),
-        'info.word.ocrText' : (1, 'text', 0, 0),
-        'info.word.firstGlyph' : (1, 'raw', 0, 0),
-        'info.word.lastGlyph' : (1, 'raw', 0, 0),
-        'info.word.bl' : (1, 'raw', 0, 0),
-        'info.word.link_id' : (1, 'number', 0, 0),
+        b'info.word' : (0, 'number', 1, 1),
+        b'info.word.ocrText' : (1, 'text', 0, 0),
+        b'info.word.firstGlyph' : (1, 'raw', 0, 0),
+        b'info.word.lastGlyph' : (1, 'raw', 0, 0),
+        b'info.word.bl' : (1, 'raw', 0, 0),
+        b'info.word.link_id' : (1, 'number', 0, 0),

-        'glyph' : (0, 'number', 1, 1),
-        'glyph.x' : (1, 'number', 0, 0),
-        'glyph.y' : (1, 'number', 0, 0),
-        'glyph.glyphID' : (1, 'number', 0, 0),
+        b'glyph' : (0, 'number', 1, 1),
+        b'glyph.x' : (1, 'number', 0, 0),
+        b'glyph.y' : (1, 'number', 0, 0),
+        b'glyph.glyphID' : (1, 'number', 0, 0),

-        'dehyphen' : (0, 'number', 1, 1),
-        'dehyphen.rootID' : (1, 'number', 0, 0),
-        'dehyphen.stemID' : (1, 'number', 0, 0),
-        'dehyphen.stemPage' : (1, 'number', 0, 0),
-        'dehyphen.sh' : (1, 'number', 0, 0),
+        b'dehyphen' : (0, 'number', 1, 1),
+        b'dehyphen.rootID' : (1, 'number', 0, 0),
+        b'dehyphen.stemID' : (1, 'number', 0, 0),
+        b'dehyphen.stemPage' : (1, 'number', 0, 0),
+        b'dehyphen.sh' : (1, 'number', 0, 0),

-        'links' : (0, 'number', 1, 1),
-        'links.page' : (1, 'number', 0, 0),
-        'links.rel' : (1, 'number', 0, 0),
-        'links.row' : (1, 'number', 0, 0),
-        'links.title' : (1, 'text', 0, 0),
-        'links.href' : (1, 'text', 0, 0),
-        'links.type' : (1, 'text', 0, 0),
-        'links.id' : (1, 'number', 0, 0),
+        b'links' : (0, 'number', 1, 1),
+        b'links.page' : (1, 'number', 0, 0),
+        b'links.rel' : (1, 'number', 0, 0),
+        b'links.row' : (1, 'number', 0, 0),
+        b'links.title' : (1, 'text', 0, 0),
+        b'links.href' : (1, 'text', 0, 0),
+        b'links.type' : (1, 'text', 0, 0),
+        b'links.id' : (1, 'number', 0, 0),

-        'paraCont' : (0, 'number', 1, 1),
-        'paraCont.rootID' : (1, 'number', 0, 0),
-        'paraCont.stemID' : (1, 'number', 0, 0),
-        'paraCont.stemPage' : (1, 'number', 0, 0),
+        b'paraCont' : (0, 'number', 1, 1),
+        b'paraCont.rootID' : (1, 'number', 0, 0),
+        b'paraCont.stemID' : (1, 'number', 0, 0),
+        b'paraCont.stemPage' : (1, 'number', 0, 0),

-        'paraStems' : (0, 'number', 1, 1),
-        'paraStems.stemID' : (1, 'number', 0, 0),
+        b'paraStems' : (0, 'number', 1, 1),
+        b'paraStems.stemID' : (1, 'number', 0, 0),

-        'wordStems' : (0, 'number', 1, 1),
-        'wordStems.stemID' : (1, 'number', 0, 0),
+        b'wordStems' : (0, 'number', 1, 1),
+        b'wordStems.stemID' : (1, 'number', 0, 0),

-        'empty' : (1, 'snippets', 1, 0),
+        b'empty' : (1, 'snippets', 1, 0),

-        'page' : (1, 'snippets', 1, 0),
-        'page.class' : (1, 'scalar_text', 0, 0),
-        'page.pageid' : (1, 'scalar_text', 0, 0),
-        'page.pagelabel' : (1, 'scalar_text', 0, 0),
-        'page.type' : (1, 'scalar_text', 0, 0),
-        'page.h' : (1, 'scalar_number', 0, 0),
-        'page.w' : (1, 'scalar_number', 0, 0),
-        'page.startID' : (1, 'scalar_number', 0, 0),
+        b'page' : (1, 'snippets', 1, 0),
+        b'page.class' : (1, 'scalar_text', 0, 0),
+        b'page.pageid' : (1, 'scalar_text', 0, 0),
+        b'page.pagelabel' : (1, 'scalar_text', 0, 0),
+        b'page.type' : (1, 'scalar_text', 0, 0),
+        b'page.h' : (1, 'scalar_number', 0, 0),
+        b'page.w' : (1, 'scalar_number', 0, 0),
+        b'page.startID' : (1, 'scalar_number', 0, 0),

-        'group' : (1, 'snippets', 1, 0),
-        'group.class' : (1, 'scalar_text', 0, 0),
-        'group.type' : (1, 'scalar_text', 0, 0),
-        'group._tag' : (1, 'scalar_text', 0, 0),
-        'group.orientation': (1, 'scalar_text', 0, 0),
+        b'group' : (1, 'snippets', 1, 0),
+        b'group.class' : (1, 'scalar_text', 0, 0),
+        b'group.type' : (1, 'scalar_text', 0, 0),
+        b'group._tag' : (1, 'scalar_text', 0, 0),
+        b'group.orientation': (1, 'scalar_text', 0, 0),

-        'region' : (1, 'snippets', 1, 0),
-        'region.class' : (1, 'scalar_text', 0, 0),
-        'region.type' : (1, 'scalar_text', 0, 0),
-        'region.x' : (1, 'scalar_number', 0, 0),
-        'region.y' : (1, 'scalar_number', 0, 0),
-        'region.h' : (1, 'scalar_number', 0, 0),
-        'region.w' : (1, 'scalar_number', 0, 0),
-        'region.orientation' : (1, 'scalar_text', 0, 0),
+        b'region' : (1, 'snippets', 1, 0),
+        b'region.class' : (1, 'scalar_text', 0, 0),
+        b'region.type' : (1, 'scalar_text', 0, 0),
+        b'region.x' : (1, 'scalar_number', 0, 0),
+        b'region.y' : (1, 'scalar_number', 0, 0),
+        b'region.h' : (1, 'scalar_number', 0, 0),
+        b'region.w' : (1, 'scalar_number', 0, 0),
+        b'region.orientation' : (1, 'scalar_text', 0, 0),

-        'empty_text_region' : (1, 'snippets', 1, 0),
+        b'empty_text_region' : (1, 'snippets', 1, 0),

-        'img' : (1, 'snippets', 1, 0),
-        'img.x' : (1, 'scalar_number', 0, 0),
-        'img.y' : (1, 'scalar_number', 0, 0),
-        'img.h' : (1, 'scalar_number', 0, 0),
-        'img.w' : (1, 'scalar_number', 0, 0),
-        'img.src' : (1, 'scalar_number', 0, 0),
-        'img.color_src' : (1, 'scalar_number', 0, 0),
-        'img.gridSize' : (1, 'scalar_number', 0, 0),
-        'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
-        'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
-        'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
-        'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
-        'img.image_type' : (1, 'scalar_number', 0, 0),
+        b'img' : (1, 'snippets', 1, 0),
+        b'img.x' : (1, 'scalar_number', 0, 0),
+        b'img.y' : (1, 'scalar_number', 0, 0),
+        b'img.h' : (1, 'scalar_number', 0, 0),
+        b'img.w' : (1, 'scalar_number', 0, 0),
+        b'img.src' : (1, 'scalar_number', 0, 0),
+        b'img.color_src' : (1, 'scalar_number', 0, 0),
+        b'img.gridSize' : (1, 'scalar_number', 0, 0),
+        b'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+        b'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
+        b'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+        b'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
+        b'img.image_type' : (1, 'scalar_number', 0, 0),

-        'paragraph' : (1, 'snippets', 1, 0),
-        'paragraph.class' : (1, 'scalar_text', 0, 0),
-        'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
-        'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
-        'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
-        'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
-        'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
-        'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
-        'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
-        'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
+        b'paragraph' : (1, 'snippets', 1, 0),
+        b'paragraph.class' : (1, 'scalar_text', 0, 0),
+        b'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
+        b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+        b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),


-        'word_semantic' : (1, 'snippets', 1, 1),
-        'word_semantic.type' : (1, 'scalar_text', 0, 0),
-        'word_semantic.class' : (1, 'scalar_text', 0, 0),
-        'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
-        'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
-        'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
-        'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
-        'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
-        'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
+        b'word_semantic' : (1, 'snippets', 1, 1),
+        b'word_semantic.type' : (1, 'scalar_text', 0, 0),
+        b'word_semantic.class' : (1, 'scalar_text', 0, 0),
+        b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
+        b'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
+        b'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+        b'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
+        b'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+        b'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),

-        'word' : (1, 'snippets', 1, 0),
-        'word.type' : (1, 'scalar_text', 0, 0),
-        'word.class' : (1, 'scalar_text', 0, 0),
-        'word.firstGlyph' : (1, 'scalar_number', 0, 0),
-        'word.lastGlyph' : (1, 'scalar_number', 0, 0),
+        b'word' : (1, 'snippets', 1, 0),
+        b'word.type' : (1, 'scalar_text', 0, 0),
+        b'word.class' : (1, 'scalar_text', 0, 0),
+        b'word.firstGlyph' : (1, 'scalar_number', 0, 0),
+        b'word.lastGlyph' : (1, 'scalar_number', 0, 0),

-        '_span' : (1, 'snippets', 1, 0),
-        '_span.class' : (1, 'scalar_text', 0, 0),
-        '_span.firstWord' : (1, 'scalar_number', 0, 0),
-        '_span.lastWord' : (1, 'scalar_number', 0, 0),
-        '_span.gridSize' : (1, 'scalar_number', 0, 0),
-        '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
-        '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
-        '_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
-        '_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
+        b'_span' : (1, 'snippets', 1, 0),
+        b'_span.class' : (1, 'scalar_text', 0, 0),
+        b'_span.firstWord' : (1, 'scalar_number', 0, 0),
+        b'_span.lastWord' : (1, 'scalar_number', 0, 0),
+        b'_span.gridSize' : (1, 'scalar_number', 0, 0),
+        b'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+        b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
+        b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+        b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),

-        'span' : (1, 'snippets', 1, 0),
-        'span.firstWord' : (1, 'scalar_number', 0, 0),
-        'span.lastWord' : (1, 'scalar_number', 0, 0),
-        'span.gridSize' : (1, 'scalar_number', 0, 0),
-        'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
-        'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
-        'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
-        'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
+        b'span' : (1, 'snippets', 1, 0),
+        b'span.firstWord' : (1, 'scalar_number', 0, 0),
+        b'span.lastWord' : (1, 'scalar_number', 0, 0),
+        b'span.gridSize' : (1, 'scalar_number', 0, 0),
+        b'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+        b'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
+        b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+        b'span.gridEndCenter' : (1, 'scalar_number', 0, 0),

-        'extratokens' : (1, 'snippets', 1, 0),
-        'extratokens.class' : (1, 'scalar_text', 0, 0),
-        'extratokens.type' : (1, 'scalar_text', 0, 0),
-        'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
-        'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
-        'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
-        'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
-        'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
-        'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
-        'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
+        b'extratokens' : (1, 'snippets', 1, 0),
+        b'extratokens.class' : (1, 'scalar_text', 0, 0),
+        b'extratokens.type' : (1, 'scalar_text', 0, 0),
+        b'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
+        b'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),

-        'glyph.h' : (1, 'number', 0, 0),
-        'glyph.w' : (1, 'number', 0, 0),
-        'glyph.use' : (1, 'number', 0, 0),
-        'glyph.vtx' : (1, 'number', 0, 1),
-        'glyph.len' : (1, 'number', 0, 1),
-        'glyph.dpi' : (1, 'number', 0, 0),
-        'vtx' : (0, 'number', 1, 1),
-        'vtx.x' : (1, 'number', 0, 0),
-        'vtx.y' : (1, 'number', 0, 0),
-        'len' : (0, 'number', 1, 1),
-        'len.n' : (1, 'number', 0, 0),
+        b'glyph.h' : (1, 'number', 0, 0),
+        b'glyph.w' : (1, 'number', 0, 0),
+        b'glyph.use' : (1, 'number', 0, 0),
+        b'glyph.vtx' : (1, 'number', 0, 1),
+        b'glyph.len' : (1, 'number', 0, 1),
+        b'glyph.dpi' : (1, 'number', 0, 0),
+        b'vtx' : (0, 'number', 1, 1),
+        b'vtx.x' : (1, 'number', 0, 0),
+        b'vtx.y' : (1, 'number', 0, 0),
+        b'len' : (0, 'number', 1, 1),
+        b'len.n' : (1, 'number', 0, 0),

-        'book' : (1, 'snippets', 1, 0),
-        'version' : (1, 'snippets', 1, 0),
-        'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
-        'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
-        'version.Schema_id' : (1, 'scalar_text', 0, 0),
-        'version.Schema_version' : (1, 'scalar_text', 0, 0),
-        'version.Topaz_version' : (1, 'scalar_text', 0, 0),
-        'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
-        'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
-        'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
-        'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
-        'version.chapterheaders' : (1, 'scalar_text', 0, 0),
-        'version.creation_date' : (1, 'scalar_text', 0, 0),
-        'version.header_footer' : (1, 'scalar_text', 0, 0),
-        'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
-        'version.letter_insertion' : (1, 'scalar_text', 0, 0),
-        'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
-        'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
-        'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
-        'version.findlists' : (1, 'scalar_text', 0, 0),
-        'version.page_num' : (1, 'scalar_text', 0, 0),
-        'version.page_type' : (1, 'scalar_text', 0, 0),
-        'version.bad_text' : (1, 'scalar_text', 0, 0),
-        'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
-        'version.margins' : (1, 'scalar_text', 0, 0),
-        'version.staggered_lines' : (1, 'scalar_text', 0, 0),
-        'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
-        'version.toc' : (1, 'scalar_text', 0, 0),
+        b'book' : (1, 'snippets', 1, 0),
+        b'version' : (1, 'snippets', 1, 0),
+        b'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
+        b'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
+        b'version.Schema_id' : (1, 'scalar_text', 0, 0),
+        b'version.Schema_version' : (1, 'scalar_text', 0, 0),
+        b'version.Topaz_version' : (1, 'scalar_text', 0, 0),
+        b'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
+        b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
+        b'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
+        b'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
+        b'version.chapterheaders' : (1, 'scalar_text', 0, 0),
+        b'version.creation_date' : (1, 'scalar_text', 0, 0),
+        b'version.header_footer' : (1, 'scalar_text', 0, 0),
+        b'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
+        b'version.letter_insertion' : (1, 'scalar_text', 0, 0),
+        b'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
+        b'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
+        b'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
+        b'version.findlists' : (1, 'scalar_text', 0, 0),
+        b'version.page_num' : (1, 'scalar_text', 0, 0),
+        b'version.page_type' : (1, 'scalar_text', 0, 0),
+        b'version.bad_text' : (1, 'scalar_text', 0, 0),
+        b'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
+        b'version.margins' : (1, 'scalar_text', 0, 0),
+        b'version.staggered_lines' : (1, 'scalar_text', 0, 0),
+        b'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
+        b'version.toc' : (1, 'scalar_text', 0, 0),

-        'stylesheet' : (1, 'snippets', 1, 0),
-        'style' : (1, 'snippets', 1, 0),
-        'style._tag' : (1, 'scalar_text', 0, 0),
-        'style.type' : (1, 'scalar_text', 0, 0),
-        'style._after_type' : (1, 'scalar_text', 0, 0),
-        'style._parent_type' : (1, 'scalar_text', 0, 0),
-        'style._after_parent_type' : (1, 'scalar_text', 0, 0),
-        'style.class' : (1, 'scalar_text', 0, 0),
-        'style._after_class' : (1, 'scalar_text', 0, 0),
-        'rule' : (1, 'snippets', 1, 0),
-        'rule.attr' : (1, 'scalar_text', 0, 0),
-        'rule.value' : (1, 'scalar_text', 0, 0),
+        b'stylesheet' : (1, 'snippets', 1, 0),
+        b'style' : (1, 'snippets', 1, 0),
+        b'style._tag' : (1, 'scalar_text', 0, 0),
+        b'style.type' : (1, 'scalar_text', 0, 0),
+        b'style._after_type' : (1, 'scalar_text', 0, 0),
+        b'style._parent_type' : (1, 'scalar_text', 0, 0),
+        b'style._after_parent_type' : (1, 'scalar_text', 0, 0),
+        b'style.class' : (1, 'scalar_text', 0, 0),
+        b'style._after_class' : (1, 'scalar_text', 0, 0),
+        b'rule' : (1, 'snippets', 1, 0),
+        b'rule.attr' : (1, 'scalar_text', 0, 0),
+        b'rule.value' : (1, 'scalar_text', 0, 0),

-        'original' : (0, 'number', 1, 1),
-        'original.pnum' : (1, 'number', 0, 0),
-        'original.pid' : (1, 'text', 0, 0),
-        'pages' : (0, 'number', 1, 1),
-        'pages.ref' : (1, 'number', 0, 0),
-        'pages.id' : (1, 'number', 0, 0),
-        'startID' : (0, 'number', 1, 1),
-        'startID.page' : (1, 'number', 0, 0),
-        'startID.id' : (1, 'number', 0, 0),
+        b'original' : (0, 'number', 1, 1),
+        b'original.pnum' : (1, 'number', 0, 0),
+        b'original.pid' : (1, 'text', 0, 0),
+        b'pages' : (0, 'number', 1, 1),
+        b'pages.ref' : (1, 'number', 0, 0),
+        b'pages.id' : (1, 'number', 0, 0),
+        b'startID' : (0, 'number', 1, 1),
+        b'startID.page' : (1, 'number', 0, 0),
+        b'startID.id' : (1, 'number', 0, 0),

-        'median_d' : (1, 'number', 0, 0),
-        'median_h' : (1, 'number', 0, 0),
-        'median_firsty' : (1, 'number', 0, 0),
-        'median_lasty' : (1, 'number', 0, 0),
+        b'median_d' : (1, 'number', 0, 0),
+        b'median_h' : (1, 'number', 0, 0),
+        b'median_firsty' : (1, 'number', 0, 0),
+        b'median_lasty' : (1, 'number', 0, 0),

-        'num_footers_maybe' : (1, 'number', 0, 0),
-        'num_footers_yes' : (1, 'number', 0, 0),
-        'num_headers_maybe' : (1, 'number', 0, 0),
-        'num_headers_yes' : (1, 'number', 0, 0),
+        b'num_footers_maybe' : (1, 'number', 0, 0),
+        b'num_footers_yes' : (1, 'number', 0, 0),
+        b'num_headers_maybe' : (1, 'number', 0, 0),
+        b'num_headers_yes' : (1, 'number', 0, 0),

-        'tracking' : (1, 'number', 0, 0),
-        'src' : (1, 'text', 0, 0),
+        b'tracking' : (1, 'number', 0, 0),
+        b'src' : (1, 'text', 0, 0),

     }
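
Each entry above reads (number of arguments, argument type, subtags present, special case when escaped); the commit's only change to the table is that it is now keyed by bytes, matching the bytes tokens the Python 3 parser produces. A hypothetical illustration of a lookup (excerpted table, not the plugin's actual dispatch code):

    token_tags = { b'x' : (1, 'scalar_number', 0, 0) }    # excerpt
    nargs, argtype, subtags, special = token_tags[b'x']
    print(nargs, argtype)    # 1 scalar_number -> parse one encoded number argument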
@@ -430,7 +430,7 @@ class PageParser(object):
         cnt = len(self.tagpath)
         if i < cnt : result = self.tagpath[i]
         for j in range(i+1, cnt) :
-            result += '.' + self.tagpath[j]
+            result += b'.' + self.tagpath[j]
         return result

@@ -505,7 +505,7 @@ class PageParser(object):

         if (subtags == 1):
             ntags = readEncodedNumber(self.fo)
-            if self.debug : print('subtags: ' + token + ' has ' + str(ntags))
+            if self.debug : print('subtags: ', token , ' has ' , str(ntags))
             for j in range(ntags):
                 val = readEncodedNumber(self.fo)
                 subtagres.append(self.procToken(self.dict.lookup(val)))
@@ -613,7 +613,7 @@ class PageParser(object):
         subtagList = tag[1]
         argtype = tag[2]
         argList = tag[3]
-        nname = prefix + '.' + name
+        nname = prefix + b'.' + name
         nsubtaglist = []
         for j in subtagList:
             nsubtaglist.append(self.updateName(j,prefix))
@@ -662,34 +662,34 @@ class PageParser(object):
         subtagList = node[1]
         argtype = node[2]
         argList = node[3]
-        fullpathname = name.split('.')
+        fullpathname = name.split(b'.')
         nodename = fullpathname.pop()
         ilvl = len(fullpathname)
-        indent = ' ' * (3 * ilvl)
+        indent = b' ' * (3 * ilvl)
         rlst = []
-        rlst.append(indent + '<' + nodename + '>')
+        rlst.append(indent + b'<' + nodename + b'>')
         if len(argList) > 0:
             alst = []
             for j in argList:
-                if (argtype == 'text') or (argtype == 'scalar_text') :
-                    alst.append(j + '|')
+                if (argtype == b'text') or (argtype == b'scalar_text') :
+                    alst.append(j + b'|')
                 else :
-                    alst.append(str(j) + ',')
-            argres = "".join(alst)
+                    alst.append(str(j).encode('utf-8') + b',')
+            argres = b"".join(alst)
             argres = argres[0:-1]
-            if argtype == 'snippets' :
-                rlst.append('snippets:' + argres)
+            if argtype == b'snippets' :
+                rlst.append(b'snippets:' + argres)
             else :
                 rlst.append(argres)
         if len(subtagList) > 0 :
-            rlst.append('\n')
+            rlst.append(b'\n')
             for j in subtagList:
                 if len(j) > 0 :
                     rlst.append(self.formatTag(j))
-            rlst.append(indent + '</' + nodename + '>\n')
+            rlst.append(indent + b'</' + nodename + b'>\n')
         else:
-            rlst.append('</' + nodename + '>\n')
-        return "".join(rlst)
+            rlst.append(b'</' + nodename + b'>\n')
+        return b"".join(rlst)


     # flatten tag

@@ -704,20 +704,20 @@ class PageParser(object):
             alst = []
             for j in argList:
                 if (argtype == 'text') or (argtype == 'scalar_text') :
-                    alst.append(j + '|')
+                    alst.append(j + b'|')
                 else :
-                    alst.append(str(j) + '|')
-            argres = "".join(alst)
+                    alst.append(str(j).encode('utf-8') + b'|')
+            argres = b"".join(alst)
             argres = argres[0:-1]
-            if argtype == 'snippets' :
-                rlst.append('.snippets=' + argres)
+            if argtype == b'snippets' :
+                rlst.append(b'.snippets=' + argres)
             else :
-                rlst.append('=' + argres)
-        rlst.append('\n')
+                rlst.append(b'=' + argres)
+        rlst.append(b'\n')
         for j in subtagList:
             if len(j) > 0 :
                 rlst.append(self.flattenTag(j))
-        return "".join(rlst)
+        return b"".join(rlst)


     # reduce create xml output

@@ -729,7 +729,7 @@ class PageParser(object):
                 rlst.append(self.flattenTag(j))
             else:
                 rlst.append(self.formatTag(j))
-        result = "".join(rlst)
+        result = b"".join(rlst)
         if self.debug : print(result)
         return result
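
Throughout formatTag and flattenTag the output is now assembled purely from bytes, because Python 3 refuses to concatenate bytes with str. Non-bytes arguments are therefore rendered via str(j).encode('utf-8') first. A quick illustration of the pattern:

    j = 42
    piece = str(j).encode('utf-8') + b','               # b'42,'
    print(b''.join([b'<x>', piece[0:-1], b'</x>']))     # b'<x>42</x>'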
@@ -747,16 +747,16 @@ class PageParser(object):

         # peek at the first bytes to see what type of file it is
         magic = self.fo.read(9)
-        if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
-            first_token = 'info'
-        elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
+        if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'):
+            first_token = b'info'
+        elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'):
             skip = self.fo.read(2)
-            first_token = 'info'
-        elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
-            first_token = 'info'
-        elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
+            first_token = b'info'
+        elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'):
+            first_token = b'info'
+        elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'):
             skip = self.fo.read(3)
-            first_token = 'info'
+            first_token = b'info'
         else :
             # other0.dat file
             first_token = None
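
Note the magic[0:1] slices: in Python 3, magic[0] would be an int (e.g. 112), so comparing it to b'p' is always False, while a one-byte slice keeps the comparison bytes-to-bytes. A quick check:

    magic = b'p0marker_'
    print(magic[0])                    # 112, an int
    print(magic[0:1] == b'p')          # True
    print(magic[2:9] == b'marker_')    # True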
@@ -778,7 +778,7 @@ class PageParser(object):
                     break

             if (v == 0x72):
-                self.doLoop72('number')
+                self.doLoop72(b'number')
             elif (v > 0) and (v < self.dict.getSize()) :
                 tag = self.procToken(self.dict.lookup(v))
                 if len(tag) > 0 :
@@ -789,7 +789,7 @@ class PageParser(object):
             if (v == 0):
                 if (self.peek(1) == 0x5f):
                     skip = self.fo.read(1)
-                    first_token = 'info'
+                    first_token = b'info'

         # now do snippet injection
         if len(self.snippetList) > 0 :
@@ -809,14 +809,14 @@ class PageParser(object):

 def fromData(dict, fname):
     flat_xml = True
-    debug = False
+    debug = True
     pp = PageParser(fname, dict, debug, flat_xml)
     xmlpage = pp.process()
     return xmlpage

 def getXML(dict, fname):
     flat_xml = False
-    debug = False
+    debug = True
     pp = PageParser(fname, dict, debug, flat_xml)
     xmlpage = pp.process()
     return xmlpage
@@ -845,7 +845,7 @@ def main(argv):
     sys.stderr=SafeUnbuffered(sys.stderr)
     dictFile = ""
     pageFile = ""
-    debug = False
+    debug = True
     flat_xml = False
     printOutput = False
     if len(argv) == 0:

@@ -7,6 +7,7 @@ import csv
 import os
 import math
 import getopt
+import functools
 from struct import pack
 from struct import unpack

@@ -15,14 +16,14 @@ class DocParser(object):
     def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
         self.id = os.path.basename(fileid).replace('.dat','')
         self.svgcount = 0
-        self.docList = flatxml.split('\n')
+        self.docList = flatxml.split(b'\n')
         self.docSize = len(self.docList)
         self.classList = {}
         self.bookDir = bookDir
         self.gdict = gdict
         tmpList = classlst.split('\n')
         for pclass in tmpList:
-            if pclass != '':
+            if pclass != b'':
                 # remove the leading period from the css name
                 cname = pclass[1:]
                 self.classList[cname] = True
@@ -57,9 +58,9 @@ class DocParser(object):
         imgfile = os.path.join(imgDir,imgname)

         # get glyph information
-        gxList = self.getData('info.glyph.x',0,-1)
-        gyList = self.getData('info.glyph.y',0,-1)
-        gidList = self.getData('info.glyph.glyphID',0,-1)
+        gxList = self.getData(b'info.glyph.x',0,-1)
+        gyList = self.getData(b'info.glyph.y',0,-1)
+        gidList = self.getData(b'info.glyph.glyphID',0,-1)

         gids = []
         maxws = []
@@ -122,11 +123,11 @@ class DocParser(object):
     def lineinDoc(self, pos) :
         if (pos >= 0) and (pos < self.docSize) :
             item = self.docList[pos]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
             else :
                 name = item
-                argres = ''
+                argres = b''
         return name, argres

@@ -140,11 +141,13 @@ class DocParser(object):
         foundat = -1
         for j in range(pos, end):
             item = self.docList[j]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
             else :
                 name = item
                 argres = ''
+            if (isinstance(tagpath,str)):
+                tagpath = tagpath.encode('utf-8')
             if name.endswith(tagpath) :
                 result = argres
                 foundat = j
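
The new isinstance guard lets existing callers keep passing str tag paths even though the document lines are now bytes; a str path is simply encoded before the bytes-to-bytes endswith test. In short:

    tagpath = 'page.type'
    if isinstance(tagpath, str):
        tagpath = tagpath.encode('utf-8')
    print(b'page.type'.endswith(tagpath))   # True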
@@ -170,7 +173,7 @@ class DocParser(object):
         argres=[]
         (foundat, argt) = self.findinDoc(tagpath, pos, end)
         if (argt != None) and (len(argt) > 0) :
-            argList = argt.split('|')
+            argList = argt.split(b'|')
             argres = [ int(strval) for strval in argList]
         return argres

@@ -191,21 +194,21 @@ class DocParser(object):

         # also some class names have spaces in them so need to convert to dashes
         if nclass != None :
-            nclass = nclass.replace(' ','-')
-            classres = ''
+            nclass = nclass.replace(b' ',b'-')
+            classres = b''
             nclass = nclass.lower()
-            nclass = 'cl-' + nclass
-            baseclass = ''
+            nclass = b'cl-' + nclass
+            baseclass = b''
             # graphic is the base class for captions
-            if nclass.find('cl-cap-') >=0 :
-                classres = 'graphic' + ' '
+            if nclass.find(b'cl-cap-') >=0 :
+                classres = b'graphic' + b' '
             else :
                 # strip to find baseclass
-                p = nclass.find('_')
+                p = nclass.find(b'_')
                 if p > 0 :
                     baseclass = nclass[0:p]
                     if baseclass in self.classList:
-                        classres += baseclass + ' '
+                        classres += baseclass + b' '
                     classres += nclass
                     nclass = classres
         return nclass
@@ -225,11 +228,11 @@ class DocParser(object):
             return -1

         result = []
-        (pos, pagetype) = self.findinDoc('page.type',0,-1)
+        (pos, pagetype) = self.findinDoc(b'page.type',0,-1)

-        groupList = self.posinDoc('page.group')
-        groupregionList = self.posinDoc('page.group.region')
-        pageregionList = self.posinDoc('page.region')
+        groupList = self.posinDoc(b'page.group')
+        groupregionList = self.posinDoc(b'page.group.region')
+        pageregionList = self.posinDoc(b'page.region')
         # integrate into one list
         for j in groupList:
             result.append(('grpbeg',j))
@@ -237,7 +240,7 @@ class DocParser(object):
             result.append(('gregion',j))
         for j in pageregionList:
             result.append(('pregion',j))
-        result.sort(compare)
+        result.sort(key=functools.cmp_to_key(compare))

         # insert group end and page end indicators
         inGroup = False
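
This pairs with the new import functools above: Python 3 removed the cmp-function form of sort(), so the old three-way comparison is wrapped with functools.cmp_to_key. A self-contained illustration (this compare is a stand-in, not necessarily the plugin's):

    import functools

    def compare(x, y):                 # old-style cmp: negative, zero, or positive
        return (x[1] > y[1]) - (x[1] < y[1])

    result = [('pregion', 9), ('grpbeg', 2), ('gregion', 5)]
    result.sort(key=functools.cmp_to_key(compare))
    print(result)   # [('grpbeg', 2), ('gregion', 5), ('pregion', 9)]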
@@ -267,33 +270,33 @@ class DocParser(object):
         result = []

         # paragraph
-        (pos, pclass) = self.findinDoc('paragraph.class',start,end)
+        (pos, pclass) = self.findinDoc(b'paragraph.class',start,end)

         pclass = self.getClass(pclass)

         # if paragraph uses extratokens (extra glyphs) then make it fixed
-        (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+        (pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end)

         # build up a description of the paragraph in result and return it
         # first check for the basic - all words paragraph
-        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
-        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+        (pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end)
+        (pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end)
         if (sfirst != None) and (slast != None) :
             first = int(sfirst)
             last = int(slast)

-            makeImage = (regtype == 'vertical') or (regtype == 'table')
+            makeImage = (regtype == b'vertical') or (regtype == b'table')
             makeImage = makeImage or (extraglyphs != None)
             if self.fixedimage:
-                makeImage = makeImage or (regtype == 'fixed')
+                makeImage = makeImage or (regtype == b'fixed')

             if (pclass != None):
-                makeImage = makeImage or (pclass.find('.inverted') >= 0)
+                makeImage = makeImage or (pclass.find(b'.inverted') >= 0)
                 if self.fixedimage :
-                    makeImage = makeImage or (pclass.find('cl-f-') >= 0)
+                    makeImage = makeImage or (pclass.find(b'cl-f-') >= 0)

             # before creating an image make sure glyph info exists
-            gidList = self.getData('info.glyph.glyphID',0,-1)
+            gidList = self.getData(b'info.glyph.glyphID',0,-1)

             makeImage = makeImage & (len(gidList) > 0)

@@ -307,8 +310,8 @@ class DocParser(object):
             # translate first and last word into first and last glyphs
             # and generate inline image and include it
             glyphList = []
-            firstglyphList = self.getData('word.firstGlyph',0,-1)
-            gidList = self.getData('info.glyph.glyphID',0,-1)
+            firstglyphList = self.getData(b'word.firstGlyph',0,-1)
+            gidList = self.getData(b'info.glyph.glyphID',0,-1)
             firstGlyph = firstglyphList[first]
             if last < len(firstglyphList):
                 lastGlyph = firstglyphList[last]
@@ -326,8 +329,8 @@ class DocParser(object):
             for glyphnum in range(firstGlyph, lastGlyph):
                 glyphList.append(glyphnum)
             # include any extratokens if they exist
-            (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
-            (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
+            (pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end)
+            (pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end)
             if (sfg != None) and (slg != None):
                 for glyphnum in range(int(sfg), int(slg)):
                     glyphList.append(glyphnum)
@@ -368,39 +371,39 @@ class DocParser(object):

             (name, argres) = self.lineinDoc(line)

-            if name.endswith('span.firstWord') :
+            if name.endswith(b'span.firstWord') :
                 sp_first = int(argres)

-            elif name.endswith('span.lastWord') :
+            elif name.endswith(b'span.lastWord') :
                 sp_last = int(argres)

-            elif name.endswith('word.firstGlyph') :
+            elif name.endswith(b'word.firstGlyph') :
                 gl_first = int(argres)

-            elif name.endswith('word.lastGlyph') :
+            elif name.endswith(b'word.lastGlyph') :
                 gl_last = int(argres)

-            elif name.endswith('word_semantic.firstWord'):
+            elif name.endswith(b'word_semantic.firstWord'):
                 ws_first = int(argres)

-            elif name.endswith('word_semantic.lastWord'):
+            elif name.endswith(b'word_semantic.lastWord'):
                 ws_last = int(argres)

-            elif name.endswith('word.class'):
+            elif name.endswith(b'word.class'):
                 # we only handle spaceafter word class
                 try:
-                    (cname, space) = argres.split('-',1)
-                    if space == '' : space = '0'
-                    if (cname == 'spaceafter') and (int(space) > 0) :
+                    (cname, space) = argres.split(b'-',1)
+                    if space == b'' : space = b'0'
+                    if (cname == b'spaceafter') and (int(space) > 0) :
                         word_class = 'sa'
                 except:
                     pass

-            elif name.endswith('word.img.src'):
+            elif name.endswith(b'word.img.src'):
                 result.append(('img' + word_class, int(argres)))
                 word_class = ''

-            elif name.endswith('region.img.src'):
+            elif name.endswith(b'region.img.src'):
                 result.append(('img' + word_class, int(argres)))

         if (sp_first != -1) and (sp_last != -1):
@@ -437,7 +440,7 @@ class DocParser(object):

        classres = ''
        if pclass :
-           classres = ' class="' + pclass + '"'
+           classres = ' class="' + pclass.decode('utf-8') + '"'

        br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')

@@ -470,8 +473,8 @@ class DocParser(object):
                     if (link > 0):
                         linktype = self.link_type[link-1]
                         title = self.link_title[link-1]
-                        if (title == "") or (parares.rfind(title) < 0):
-                            title=parares[lstart:]
+                        if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0):
+                            title=parares[lstart:].encode('utf-8')
                         if linktype == 'external' :
                             linkhref = self.link_href[link-1]
                             linkhtml = '<a href="%s">' % linkhref
@@ -482,33 +485,34 @@ class DocParser(object):
                         else :
                             # just link to the current page
                             linkhtml = '<a href="#' + self.id + '">'
-                        linkhtml += title + '</a>'
-                        pos = parares.rfind(title)
+                        linkhtml += title.decode('utf-8')
+                        linkhtml += '</a>'
+                        pos = parares.rfind(title.decode('utf-8'))
                         if pos >= 0:
                             parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                         else :
                             parares += linkhtml
                         lstart = len(parares)
-                        if word == '_link_' : word = ''
+                        if word == b'_link_' : word = b''
                     elif (link < 0) :
-                        if word == '_link_' : word = ''
+                        if word == b'_link_' : word = b''

-                if word == '_lb_':
+                if word == b'_lb_':
                     if ((num-1) in self.dehyphen_rootid ) or handle_links:
-                        word = ''
+                        word = b''
                         sep = ''
                     elif br_lb :
-                        word = '<br />\n'
+                        word = b'<br />\n'
                         sep = ''
                     else :
-                        word = '\n'
+                        word = b'\n'
                         sep = ''

                 if num in self.dehyphen_rootid :
                     word = word[0:-1]
                     sep = ''

-                parares += word + sep
+                parares += word.decode('utf-8') + sep

             elif wtype == 'img' :
                 sep = ''
@@ -522,7 +526,9 @@ class DocParser(object):

             elif wtype == 'svg' :
                 sep = ''
-                parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
+                parares += '<img src="img/'
+                parares += self.id
+                parares += '_%04d.svg" alt="" />' % num
                 parares += sep

         if len(sep) > 0 : parares = parares[0:-1]
@@ -545,7 +551,7 @@ class DocParser(object):
             (wtype, num) = pdesc[j]

             if wtype == 'ocr' :
-                word = self.ocrtext[num]
+                word = self.ocrtext[num].decode('utf-8')
                 sep = ' '

                 if handle_links:
@@ -553,7 +559,7 @@ class DocParser(object):
                 if (link > 0):
                     linktype = self.link_type[link-1]
                     title = self.link_title[link-1]
-                    title = title.rstrip('. ')
+                    title = title.rstrip(b'. ')
                     alt_title = parares[lstart:]
                     alt_title = alt_title.strip()
                     # now strip off the actual printed page number
@@ -607,38 +613,38 @@ class DocParser(object):
         hlst = []

         # get the ocr text
-        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
-        if argres : self.ocrtext = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1)
+        if argres : self.ocrtext = argres.split(b'|')

         # get information to dehyphenate the text
-        self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
+        self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1)

         # determine if first paragraph is continued from previous page
-        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
+        (pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1)
         first_para_continued = (self.parastems_stemid != None)

         # determine if last paragraph is continued onto the next page
-        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
+        (pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1)
         last_para_continued = (self.paracont_stemid != None)

         # collect link ids
-        self.link_id = self.getData('info.word.link_id',0,-1)
+        self.link_id = self.getData(b'info.word.link_id',0,-1)

         # collect link destination page numbers
-        self.link_page = self.getData('info.links.page',0,-1)
+        self.link_page = self.getData(b'info.links.page',0,-1)

         # collect link types (container versus external)
-        (pos, argres) = self.findinDoc('info.links.type',0,-1)
-        if argres : self.link_type = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.links.type',0,-1)
+        if argres : self.link_type = argres.split(b'|')

         # collect link destinations
-        (pos, argres) = self.findinDoc('info.links.href',0,-1)
-        if argres : self.link_href = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.links.href',0,-1)
+        if argres : self.link_href = argres.split(b'|')

         # collect link titles
-        (pos, argres) = self.findinDoc('info.links.title',0,-1)
+        (pos, argres) = self.findinDoc(b'info.links.title',0,-1)
         if argres :
-            self.link_title = argres.split('|')
+            self.link_title = argres.split(b'|')
         else:
             self.link_title.append('')

|
@ -662,51 +668,51 @@ class DocParser(object):
|
|||
# set anchor for link target on this page
|
||||
if not anchorSet and not first_para_continued:
|
||||
hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="')
|
||||
hlst.append(self.id + '" title="pagetype_' + pagetype + '"></div>\n')
|
||||
hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '"></div>\n')
|
||||
anchorSet = True
|
||||
|
||||
# handle groups of graphics with text captions
|
||||
if (etype == 'grpbeg'):
|
||||
(pos, grptype) = self.findinDoc('group.type', start, end)
|
||||
if (etype == b'grpbeg'):
|
||||
(pos, grptype) = self.findinDoc(b'group.type', start, end)
|
||||
if grptype != None:
|
||||
if grptype == 'graphic':
|
||||
gcstr = ' class="' + grptype + '"'
|
||||
if grptype == b'graphic':
|
||||
gcstr = ' class="' + grptype.decode('utf-8') + '"'
|
||||
hlst.append('<div' + gcstr + '>')
|
||||
inGroup = True
|
||||
|
||||
elif (etype == 'grpend'):
|
||||
elif (etype == b'grpend'):
|
||||
if inGroup:
|
||||
hlst.append('</div>\n')
|
||||
inGroup = False
|
||||
|
||||
else:
|
||||
(pos, regtype) = self.findinDoc('region.type',start,end)
|
||||
(pos, regtype) = self.findinDoc(b'region.type',start,end)
|
||||
|
||||
if regtype == 'graphic' :
|
||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
||||
if regtype == b'graphic' :
|
||||
(pos, simgsrc) = self.findinDoc(b'img.src',start,end)
|
||||
if simgsrc:
|
||||
if inGroup:
|
||||
hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc))
|
||||
else:
|
||||
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
|
||||
|
||||
elif regtype == 'chapterheading' :
|
||||
elif regtype == b'chapterheading' :
|
||||
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||
if not breakSet:
|
||||
hlst.append('<div style="page-break-after: always;"> </div>\n')
|
||||
breakSet = True
|
||||
tag = 'h1'
|
||||
if pclass and (len(pclass) >= 7):
|
||||
if pclass[3:7] == 'ch1-' : tag = 'h1'
|
||||
if pclass[3:7] == 'ch2-' : tag = 'h2'
|
||||
if pclass[3:7] == 'ch3-' : tag = 'h3'
|
||||
hlst.append('<' + tag + ' class="' + pclass + '">')
|
||||
if pclass[3:7] == b'ch1-' : tag = 'h1'
|
||||
if pclass[3:7] == b'ch2-' : tag = 'h2'
|
||||
if pclass[3:7] == b'ch3-' : tag = 'h3'
|
||||
hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
|
||||
else:
|
||||
hlst.append('<' + tag + '>')
|
||||
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
|
||||
hlst.append('</' + tag + '>')
|
||||
|
||||
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
|
||||
elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'):
|
||||
ptype = 'full'
|
||||
# check to see if this is a continution from the previous page
|
||||
if first_para_continued :
|
||||
|
@@ -715,16 +721,16 @@ class DocParser(object):
                     (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                     if pclass and (len(pclass) >= 6) and (ptype == 'full'):
                         tag = 'p'
-                        if pclass[3:6] == 'h1-' : tag = 'h4'
-                        if pclass[3:6] == 'h2-' : tag = 'h5'
-                        if pclass[3:6] == 'h3-' : tag = 'h6'
-                        hlst.append('<' + tag + ' class="' + pclass + '">')
+                        if pclass[3:6] == b'h1-' : tag = 'h4'
+                        if pclass[3:6] == b'h2-' : tag = 'h5'
+                        if pclass[3:6] == b'h3-' : tag = 'h6'
+                        hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                         hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                         hlst.append('</' + tag + '>')
                     else :
                         hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))

-                elif (regtype == 'tocentry') :
+                elif (regtype == b'tocentry') :
                     ptype = 'full'
                     if first_para_continued :
                         ptype = 'end'
@@ -733,7 +739,7 @@ class DocParser(object):
                     tocinfo += self.buildTOCEntry(pdesc)
                     hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))

-                elif (regtype == 'vertical') or (regtype == 'table') :
+                elif (regtype == b'vertical') or (regtype == b'table') :
                     ptype = 'full'
                     if inGroup:
                         ptype = 'middle'
@@ -744,19 +750,19 @@ class DocParser(object):
                     hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))


-                elif (regtype == 'synth_fcvr.center'):
-                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                elif (regtype == b'synth_fcvr.center'):
+                    (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                     if simgsrc:
                         hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

                 else :
                     print(' Making region type', regtype, end=' ')
-                    (pos, temp) = self.findinDoc('paragraph',start,end)
-                    (pos2, temp) = self.findinDoc('span',start,end)
+                    (pos, temp) = self.findinDoc(b'paragraph',start,end)
+                    (pos2, temp) = self.findinDoc(b'span',start,end)
                     if pos != -1 or pos2 != -1:
                         print(' a "text" region')
                         orig_regtype = regtype
-                        regtype = 'fixed'
+                        regtype = b'fixed'
                         ptype = 'full'
                         # check to see if this is a continution from the previous page
                         if first_para_continued :
@@ -764,23 +770,23 @@ class DocParser(object):
                             first_para_continued = False
                         (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                         if not pclass:
-                            if orig_regtype.endswith('.right') : pclass = 'cl-right'
-                            elif orig_regtype.endswith('.center') : pclass = 'cl-center'
-                            elif orig_regtype.endswith('.left') : pclass = 'cl-left'
-                            elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
+                            if orig_regtype.endswith(b'.right') : pclass = 'cl-right'
+                            elif orig_regtype.endswith(b'.center') : pclass = 'cl-center'
+                            elif orig_regtype.endswith(b'.left') : pclass = 'cl-left'
+                            elif orig_regtype.endswith(b'.justify') : pclass = 'cl-justify'
                         if pclass and (ptype == 'full') and (len(pclass) >= 6):
                             tag = 'p'
-                            if pclass[3:6] == 'h1-' : tag = 'h4'
-                            if pclass[3:6] == 'h2-' : tag = 'h5'
-                            if pclass[3:6] == 'h3-' : tag = 'h6'
-                            hlst.append('<' + tag + ' class="' + pclass + '">')
+                            if pclass[3:6] == b'h1-' : tag = 'h4'
+                            if pclass[3:6] == b'h2-' : tag = 'h5'
+                            if pclass[3:6] == b'h3-' : tag = 'h6'
+                            hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                             hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                             hlst.append('</' + tag + '>')
                         else :
                             hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
                     else :
                         print(' a "graphic" region')
-                        (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                        (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                         if simgsrc:
                             hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

@@ -12,7 +12,7 @@ from struct import unpack
 class PParser(object):
     def __init__(self, gd, flatxml, meta_array):
         self.gd = gd
-        self.flatdoc = flatxml.split('\n')
+        self.flatdoc = flatxml.split(b'\n')
         self.docSize = len(self.flatdoc)
         self.temp = []

@@ -58,11 +58,11 @@ class PParser(object):
     def lineinDoc(self, pos) :
         if (pos >= 0) and (pos < self.docSize) :
             item = self.flatdoc[pos]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
             else :
                 name = item
-                argres = ''
+                argres = b''
         return name, argres

     # find tag in doc if within pos to end inclusive
@@ -75,11 +75,13 @@ class PParser(object):
         foundat = -1
         for j in range(pos, end):
             item = self.flatdoc[j]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
             else :
                 name = item
-                argres = ''
+                argres = b''
+            if (isinstance(tagpath,str)):
+                tagpath = tagpath.encode('utf-8')
             if name.endswith(tagpath) :
                 result = argres
                 foundat = j
@@ -103,9 +105,9 @@ class PParser(object):
         cnt = len(self.flatdoc)
         for j in range(cnt):
             item = self.flatdoc[j]
-            if item.find('=') >= 0:
-                (name, argt) = item.split('=')
-                argres = argt.split('|')
+            if item.find(b'=') >= 0:
+                (name, argt) = item.split(b'=')
+                argres = argt.split(b'|')
             else:
                 name = item
                 argres = []
@@ -120,15 +122,17 @@ class PParser(object):
     def getDataatPos(self, path, pos):
         result = None
         item = self.flatdoc[pos]
-        if item.find('=') >= 0:
-            (name, argt) = item.split('=')
-            argres = argt.split('|')
+        if item.find(b'=') >= 0:
+            (name, argt) = item.split(b'=')
+            argres = argt.split(b'|')
         else:
             name = item
             argres = []
         if (len(argres) > 0) :
             for j in range(0,len(argres)):
                 argres[j] = int(argres[j])
+        if (isinstance(path,str)):
+            path = path.encode('utf-8')
         if (name.endswith(path)):
             result = argres
         return result
@@ -138,12 +142,14 @@ class PParser(object):
         cnt = len(self.temp)
         for j in range(cnt):
             item = self.temp[j]
-            if item.find('=') >= 0:
-                (name, argt) = item.split('=')
-                argres = argt.split('|')
+            if item.find(b'=') >= 0:
+                (name, argt) = item.split(b'=')
+                argres = argt.split(b'|')
             else:
                 name = item
                 argres = []
+            if (isinstance(path,str)):
+                path = path.encode('utf-8')
             if (name.endswith(path)):
                 result = argres
                 self.temp.pop(j)

@@ -44,10 +44,10 @@ if inCalibre :
     from calibre_plugins.dedrm import flatxml2svg
     from calibre_plugins.dedrm import stylexml2css
 else :
-    from . import convert2xml
-    from . import flatxml2html
-    from . import flatxml2svg
-    from . import stylexml2css
+    import convert2xml
+    import flatxml2html
+    import flatxml2svg
+    import stylexml2css

 # global switch
 buildXML = False
@@ -117,10 +117,10 @@ class Dictionary(object):
             self.stable.append(self.escapestr(readString(self.fo)))
         self.pos = 0
     def escapestr(self, str):
-        str = str.replace('&','&amp;')
-        str = str.replace('<','&lt;')
-        str = str.replace('>','&gt;')
-        str = str.replace('=','&#61;')
+        str = str.replace(b'&',b'&amp;')
+        str = str.replace(b'<',b'&lt;')
+        str = str.replace(b'>',b'&gt;')
+        str = str.replace(b'=',b'&#61;')
         return str
     def lookup(self,val):
         if ((val >= 0) and (val < self.size)) :
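
With bytes input, each replacement pair must also be bytes; mixing str and bytes arguments to bytes.replace raises TypeError. For example:

    print(b'a<b&c'.replace(b'&', b'&amp;').replace(b'<', b'&lt;'))
    # b'a&lt;b&amp;c'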
@@ -138,7 +138,7 @@ class Dictionary(object):

 class PageDimParser(object):
     def __init__(self, flatxml):
-        self.flatdoc = flatxml.split('\n')
+        self.flatdoc = flatxml.split(b'\n')
     # find tag if within pos to end inclusive
     def findinDoc(self, tagpath, pos, end) :
         result = None
@@ -151,8 +151,8 @@ class PageDimParser(object):
         foundat = -1
         for j in range(pos, end):
             item = docList[j]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=')
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=')
             else :
                 name = item
                 argres = ''
@@ -162,8 +162,8 @@ class PageDimParser(object):
             break
         return foundat, result
     def process(self):
-        (pos, sph) = self.findinDoc('page.h',0,-1)
-        (pos, spw) = self.findinDoc('page.w',0,-1)
+        (pos, sph) = self.findinDoc(b'page.h',0,-1)
+        (pos, spw) = self.findinDoc(b'page.w',0,-1)
         if (sph == None): sph = '-1'
         if (spw == None): spw = '-1'
         return sph, spw
@@ -176,21 +176,21 @@ def getPageDim(flatxml):

 class GParser(object):
     def __init__(self, flatxml):
-        self.flatdoc = flatxml.split('\n')
+        self.flatdoc = flatxml.split(b'\n')
         self.dpi = 1440
-        self.gh = self.getData('info.glyph.h')
-        self.gw = self.getData('info.glyph.w')
-        self.guse = self.getData('info.glyph.use')
+        self.gh = self.getData(b'info.glyph.h')
+        self.gw = self.getData(b'info.glyph.w')
+        self.guse = self.getData(b'info.glyph.use')
         if self.guse :
             self.count = len(self.guse)
         else :
             self.count = 0
-        self.gvtx = self.getData('info.glyph.vtx')
-        self.glen = self.getData('info.glyph.len')
-        self.gdpi = self.getData('info.glyph.dpi')
-        self.vx = self.getData('info.vtx.x')
-        self.vy = self.getData('info.vtx.y')
-        self.vlen = self.getData('info.len.n')
+        self.gvtx = self.getData(b'info.glyph.vtx')
+        self.glen = self.getData(b'info.glyph.len')
+        self.gdpi = self.getData(b'info.glyph.dpi')
+        self.vx = self.getData(b'info.vtx.x')
+        self.vy = self.getData(b'info.vtx.y')
+        self.vlen = self.getData(b'info.len.n')
         if self.vlen :
             self.glen.append(len(self.vlen))
         elif self.glen:
@@ -204,9 +204,9 @@ class GParser(object):
         cnt = len(self.flatdoc)
         for j in range(cnt):
             item = self.flatdoc[j]
-            if item.find('=') >= 0:
-                (name, argt) = item.split('=')
-                argres = argt.split('|')
+            if item.find(b'=') >= 0:
+                (name, argt) = item.split(b'=')
+                argres = argt.split(b'|')
             else:
                 name = item
                 argres = []
@@ -431,7 +431,7 @@ def generateBook(bookDir, raw, fixedimage):

     # now get the css info
     cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
-    open(xname, 'wb').write(cssstr)
+    open(xname, 'w').write(cssstr)
     if buildXML:
         xname = os.path.join(xmlDir, 'other0000.xml')
         open(xname, 'wb').write(convert2xml.getXML(dict, otherFile))
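
The 'wb' to 'w' flips here (and in the toc, index_svg, and opf writes below) track the payload type: in Python 3 a file opened in binary mode accepts only bytes, and cssstr is now a str, so writing it to a 'wb' handle would raise TypeError. In short:

    with open('out.css', 'w') as f:    # text mode for str payloads
        f.write('p { margin: 0; }')
    with open('out.dat', 'wb') as f:   # binary mode for bytes payloads
        f.write(b'\x00\x01')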
@@ -525,7 +525,7 @@ def generateBook(bookDir, raw, fixedimage):
     hlst.append('</body>\n</html>\n')
     htmlstr = "".join(hlst)
     hlst = None
-    open(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+    open(os.path.join(bookDir, htmlFileName), 'w').write(htmlstr)

     print(" ")
     print('Extracting Table of Contents from Amazon OCR')
@@ -571,7 +571,7 @@ def generateBook(bookDir, raw, fixedimage):
     tlst.append('</body>\n')
     tlst.append('</html>\n')
     tochtml = "".join(tlst)
-    open(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
+    open(os.path.join(svgDir, 'toc.xhtml'), 'w').write(tochtml)


     # now create index_svg.xhtml that points to all required files
@@ -608,7 +608,7 @@ def generateBook(bookDir, raw, fixedimage):
         flst = []
         for page in pagelst:
             flst.append(xmllst[page])
-        flat_svg = "".join(flst)
+        flat_svg = b"".join(flst)
         flst=None
         svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
         if (raw) :
@@ -626,7 +626,7 @@ def generateBook(bookDir, raw, fixedimage):
     slst.append('</body>\n</html>\n')
     svgindex = "".join(slst)
     slst = None
-    open(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
+    open(os.path.join(bookDir, 'index_svg.xhtml'), 'w').write(svgindex)

     print(" ")

@@ -637,16 +637,16 @@ def generateBook(bookDir, raw, fixedimage):
olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n')
# adding metadata
olst.append(' <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n')
if 'GUID' in meta_array:
olst.append(' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array['GUID'] + '</dc:identifier>\n')
if 'ASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="ASIN">' + meta_array['ASIN'] + '</dc:identifier>\n')
if 'oASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="oASIN">' + meta_array['oASIN'] + '</dc:identifier>\n')
olst.append(' <dc:title>' + meta_array['Title'] + '</dc:title>\n')
olst.append(' <dc:creator opf:role="aut">' + meta_array['Authors'] + '</dc:creator>\n')
if b'GUID' in meta_array:
olst.append(' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array[b'GUID'].decode('utf-8') + '</dc:identifier>\n')
if b'ASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="ASIN">' + meta_array[b'ASIN'].decode('utf-8') + '</dc:identifier>\n')
if b'oASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="oASIN">' + meta_array[b'oASIN'].decode('utf-8') + '</dc:identifier>\n')
olst.append(' <dc:title>' + meta_array[b'Title'].decode('utf-8') + '</dc:title>\n')
olst.append(' <dc:creator opf:role="aut">' + meta_array[b'Authors'].decode('utf-8') + '</dc:creator>\n')
olst.append(' <dc:language>en</dc:language>\n')
olst.append(' <dc:date>' + meta_array['UpdateTime'] + '</dc:date>\n')
olst.append(' <dc:date>' + meta_array[b'UpdateTime'].decode('utf-8') + '</dc:date>\n')
if isCover:
olst.append(' <meta name="cover" content="bookcover"/>\n')
olst.append(' </metadata>\n')
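meta_array is now keyed by bytes with bytes values, so each value gets a .decode('utf-8') before being spliced into the OPF text. The pattern in isolation, with a made-up metadata dict:

# bytes keys and values, as the Python 3 port stores them
meta = {b'Title': b'Some Book', b'Authors': b'A. Writer'}
olst = []
if b'Title' in meta:
    # decode bytes -> str so it can be concatenated with the XML text
    olst.append('    <dc:title>' + meta[b'Title'].decode('utf-8') + '</dc:title>\n')
print(''.join(olst))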
@@ -675,7 +675,7 @@ def generateBook(bookDir, raw, fixedimage):
olst.append('</package>\n')
opfstr = "".join(olst)
olst = None
open(opfname, 'wb').write(opfstr)
open(opfname, 'w').write(opfstr)

print('Processing Complete')

@@ -49,14 +49,15 @@ def SHA1(message):


# Encode the bytes in data with the characters in map
# data and map should be byte arrays
def encode(data, map):
result = ''
result = b''
for char in data:
value = ord(char)
value = char
Q = (value ^ 0x80) // len(map)
R = value % len(map)
result += map[Q]
result += map[R]
result += bytes([map[Q]])
result += bytes([map[R]])
return result
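The rewrite relies on the Python 3 rule that iterating or indexing bytes yields ints: ord() goes away, and a single byte value has to be wrapped back up with bytes([...]) before concatenation. A runnable sketch of the fixed function (the charmap here is just an illustrative alphabet):

def encode(data, charmap):
    # data and charmap are bytes; iterating data yields ints in Python 3
    result = b''
    for value in data:
        Q = (value ^ 0x80) // len(charmap)
        R = value % len(charmap)
        # charmap[Q] is an int, so rebuild a one-byte bytes object
        result += bytes([charmap[Q]])
        result += bytes([charmap[R]])
    return result

print(encode(b'\x01\x02', b'ABCDEFGHIJKLMNOP'))   # b'IBIC'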

# Hash the bytes in data and then encode the digest with the characters in map
@@ -117,7 +118,7 @@ def generatePidEncryptionTable() :
def generatePidSeed(table,dsn) :
value = 0
for counter in range (0,4) :
index = (ord(dsn[counter]) ^ value) &0xFF
index = (dsn[counter] ^ value) & 0xFF
value = (value >> 8) ^ table[index]
return value
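Same Python 3 rule here: dsn is bytes, so dsn[counter] is already an int and the ord() call had to go. A self-contained sketch with a stand-in table (the real one comes from generatePidEncryptionTable, which is not shown in this hunk):

def generatePidSeed(table, dsn):
    value = 0
    for counter in range(0, 4):
        # dsn[counter] is an int under Python 3, no ord() needed
        index = (dsn[counter] ^ value) & 0xFF
        value = (value >> 8) ^ table[index]
    return value

# stand-in 256-entry table, for illustration only
table = [(i * 0x9E3779B1) & 0xFFFFFFFF for i in range(256)]
print(hex(generatePidSeed(table, b'1234ABCD')))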
@@ -129,7 +130,7 @@ def generateDevicePID(table,dsn,nbRoll):
pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
index = 0
for counter in range (0,nbRoll):
pid[index] = pid[index] ^ ord(dsn[counter])
pid[index] = pid[index] ^ dsn[counter]
index = (index+1) %8
for counter in range (0,8):
index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
@@ -205,7 +206,7 @@ def getK4Pids(rec209, token, kindleDatabase):

try:
# Get the kindle account token, if present
kindleAccountToken = bytearray.fromhex((kindleDatabase[1])[b'kindle.account.tokens']).decode()
kindleAccountToken = bytearray.fromhex((kindleDatabase[1])['kindle.account.tokens'])

except KeyError:
kindleAccountToken=""
@@ -213,30 +214,30 @@ def getK4Pids(rec209, token, kindleDatabase):

try:
# Get the DSN token, if present
DSN = bytearray.fromhex((kindleDatabase[1])['DSN']).decode()
DSN = bytearray.fromhex((kindleDatabase[1])['DSN'])
print("Got DSN key from database {0}".format(kindleDatabase[0]))
except KeyError:
# See if we have the info to generate the DSN
try:
# Get the Mazama Random number
MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])[b'MazamaRandomNumber']).decode()
MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])['MazamaRandomNumber'])
#print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0])

try:
# Get the SerialNumber token, if present
IDString = bytearray.fromhex((kindleDatabase[1])[b'SerialNumber']).decode()
IDString = bytearray.fromhex((kindleDatabase[1])['SerialNumber'])
print("Got SerialNumber from database {0}".format(kindleDatabase[0]))
except KeyError:
# Get the IDString we added
IDString = bytearray.fromhex((kindleDatabase[1])[b'IDString']).decode()
IDString = bytearray.fromhex((kindleDatabase[1])['IDString'])

try:
# Get the UsernameHash token, if present
encodedUsername = bytearray.fromhex((kindleDatabase[1])[b'UsernameHash']).decode()
encodedUsername = bytearray.fromhex((kindleDatabase[1])['UsernameHash'])
print("Got UsernameHash from database {0}".format(kindleDatabase[0]))
except KeyError:
# Get the UserName we added
UserName = bytearray.fromhex((kindleDatabase[1])[b'UserName']).decode()
UserName = bytearray.fromhex((kindleDatabase[1])['UserName'])
# encode it
encodedUsername = encodeHash(UserName,charMap1)
#print "encodedUsername",encodedUsername.encode('hex')
@@ -266,19 +267,19 @@ def getK4Pids(rec209, token, kindleDatabase):
# Compute book PIDs

# book pid
pidHash = SHA1(DSN.encode()+kindleAccountToken.encode()+rec209+token)
pidHash = SHA1(DSN+kindleAccountToken+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pids.append(bookPID)

# variant 1
pidHash = SHA1(kindleAccountToken.encode()+rec209+token)
pidHash = SHA1(kindleAccountToken+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pids.append(bookPID)

# variant 2
pidHash = SHA1(DSN.encode()+rec209+token)
pidHash = SHA1(DSN+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pids.append(bookPID)
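Because DSN and kindleAccountToken now stay bytes, they concatenate directly with rec209 and token and the intermediate .encode() calls disappear. A sketch of the hashing step using hashlib (assuming the script's SHA1 helper is a thin wrapper around it):

import hashlib

def SHA1(message):
    # assumed equivalent of the module's helper: raw SHA-1 digest of bytes
    return hashlib.sha1(message).digest()

DSN = b'DSN1234'
kindleAccountToken = b'token'
rec209, token = b'\x00\x01', b'\x02\x03'   # placeholder record data
pidHash = SHA1(DSN + kindleAccountToken + rec209 + token)
print(pidHash.hex())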

@@ -7,7 +7,7 @@

from __future__ import print_function
__license__ = 'GPL v3'
__version__ = "1.00"
__version__ = "1.0"

# This is a python script. You need a Python interpreter to run it.
# For example, ActiveState Python, which exists for windows.
@@ -73,7 +73,7 @@ __version__ = "1.00"
# 0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility
# 0.41 - Fixed potential unicode problem in command line calls
# 0.42 - Added GPL v3 licence. updated/removed some print statements
# 1.00 - Python 3 compatibility for calibre 5.0
# 1.0 - Python 3 compatibility for calibre 5.0

import sys
import os
@@ -330,7 +330,7 @@ class MobiBook:
}
title = ''
codec = 'windows-1252'
if self.magic == 'BOOKMOBI':
if self.magic == b'BOOKMOBI':
if 503 in self.meta_array:
title = self.meta_array[503]
else:

@@ -15,36 +15,36 @@ debug = False

class DocParser(object):
def __init__(self, flatxml, fontsize, ph, pw):
self.flatdoc = flatxml.split('\n')
self.flatdoc = flatxml.split(b'\n')
self.fontsize = int(fontsize)
self.ph = int(ph) * 1.0
self.pw = int(pw) * 1.0

stags = {
'paragraph' : 'p',
'graphic' : '.graphic'
b'paragraph' : 'p',
b'graphic' : '.graphic'
}

attr_val_map = {
'hang' : 'text-indent: ',
'indent' : 'text-indent: ',
'line-space' : 'line-height: ',
'margin-bottom' : 'margin-bottom: ',
'margin-left' : 'margin-left: ',
'margin-right' : 'margin-right: ',
'margin-top' : 'margin-top: ',
'space-after' : 'padding-bottom: ',
b'hang' : 'text-indent: ',
b'indent' : 'text-indent: ',
b'line-space' : 'line-height: ',
b'margin-bottom' : 'margin-bottom: ',
b'margin-left' : 'margin-left: ',
b'margin-right' : 'margin-right: ',
b'margin-top' : 'margin-top: ',
b'space-after' : 'padding-bottom: ',
}

attr_str_map = {
'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
'align-left' : 'text-align: left;',
'align-right' : 'text-align: right;',
'align-justify' : 'text-align: justify;',
'display-inline' : 'display: inline;',
'pos-left' : 'text-align: left;',
'pos-right' : 'text-align: right;',
'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
b'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
b'align-left' : 'text-align: left;',
b'align-right' : 'text-align: right;',
b'align-justify' : 'text-align: justify;',
b'display-inline' : 'display: inline;',
b'pos-left' : 'text-align: left;',
b'pos-right' : 'text-align: right;',
b'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
}
@@ -60,11 +60,13 @@ class DocParser(object):
foundat = -1
for j in range(pos, end):
item = docList[j]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
if item.find(b'=') >= 0:
(name, argres) = item.split(b'=',1)
else :
name = item
argres = ''
argres = b''
if (isinstance(tagpath,str)):
tagpath = tagpath.encode('utf-8')
if name.endswith(tagpath) :
result = argres
foundat = j
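The new isinstance check lets callers keep passing plain-string tag paths while the document lines are bytes: the path is encoded once, then compared with endswith. The idea in isolation (find_tag and its inputs are hypothetical):

def find_tag(lines, tagpath):
    # accept str for convenience, but compare as bytes
    if isinstance(tagpath, str):
        tagpath = tagpath.encode('utf-8')
    for j, item in enumerate(lines):
        name = item.split(b'=', 1)[0]
        if name.endswith(tagpath):
            return j
    return -1

print(find_tag([b'page.h=1440', b'page.w=1080'], 'page.w'))   # 1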
@@ -76,7 +78,7 @@ class DocParser(object):
def posinDoc(self, tagpath):
startpos = []
pos = 0
res = ""
res = b""
while res != None :
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
if res != None :
@@ -87,11 +89,11 @@ class DocParser(object):
# returns a vector of integers for the tagpath
def getData(self, tagpath, pos, end, clean=False):
if clean:
digits_only = re.compile(r'''([0-9]+)''')
digits_only = re.compile(rb'''([0-9]+)''')
argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) :
argList = argt.split('|')
argList = argt.split(b'|')
for strval in argList:
if clean:
m = re.search(digits_only, strval)
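Since the values being scanned are now bytes, the pattern itself must be a bytes pattern (rb'...'): re refuses to mix a str pattern with bytes input. A quick demonstration:

import re

digits_only = re.compile(rb'([0-9]+)')    # bytes pattern for bytes input
m = re.search(digits_only, b'14.25|9')
print(m.group(1))                         # b'14'
# re.search(r'([0-9]+)', b'14.25') raises TypeError under Python 3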
@@ -109,7 +111,7 @@ class DocParser(object):
csspage += '.cl-justify { text-align: justify; }\n'

# generate a list of each <style> starting point in the stylesheet
styleList= self.posinDoc('book.stylesheet.style')
styleList= self.posinDoc(b'book.stylesheet.style')
stylecnt = len(styleList)
styleList.append(-1)
@@ -121,30 +123,30 @@ class DocParser(object):
start = styleList[j]
end = styleList[j+1]

(pos, tag) = self.findinDoc('style._tag',start,end)
(pos, tag) = self.findinDoc(b'style._tag',start,end)
if tag == None :
(pos, tag) = self.findinDoc('style.type',start,end)
(pos, tag) = self.findinDoc(b'style.type',start,end)

# Is this something we know how to convert to css
if tag in self.stags :

# get the style class
(pos, sclass) = self.findinDoc('style.class',start,end)
(pos, sclass) = self.findinDoc(b'style.class',start,end)
if sclass != None:
sclass = sclass.replace(' ','-')
sclass = '.cl-' + sclass.lower()
sclass = sclass.replace(b' ',b'-')
sclass = b'.cl-' + sclass.lower()
else :
sclass = ''
sclass = b''

if debug: print('sclass', sclass)

# check for any "after class" specifiers
(pos, aftclass) = self.findinDoc('style._after_class',start,end)
(pos, aftclass) = self.findinDoc(b'style._after_class',start,end)
if aftclass != None:
aftclass = aftclass.replace(' ','-')
aftclass = '.cl-' + aftclass.lower()
aftclass = aftclass.replace(b' ',b'-')
aftclass = b'.cl-' + aftclass.lower()
else :
aftclass = ''
aftclass = b''

if debug: print('aftclass', aftclass)
@@ -152,34 +154,37 @@ class DocParser(object):

while True :

(pos1, attr) = self.findinDoc('style.rule.attr', start, end)
(pos2, val) = self.findinDoc('style.rule.value', start, end)
(pos1, attr) = self.findinDoc(b'style.rule.attr', start, end)
(pos2, val) = self.findinDoc(b'style.rule.value', start, end)

if debug: print('attr', attr)
if debug: print('val', val)

if attr == None : break

if (attr == 'display') or (attr == 'pos') or (attr == 'align'):
if (attr == b'display') or (attr == b'pos') or (attr == b'align'):
# handle text based attributes
attr = attr + '-' + val
attr = attr + b'-' + val
if attr in self.attr_str_map :
cssargs[attr] = (self.attr_str_map[attr], '')
cssargs[attr] = (self.attr_str_map[attr], b'')
else :
# handle value based attributes
if attr in self.attr_val_map :
name = self.attr_val_map[attr]
if attr in ('margin-bottom', 'margin-top', 'space-after') :
if attr in (b'margin-bottom', b'margin-top', b'space-after') :
scale = self.ph
elif attr in ('margin-right', 'indent', 'margin-left', 'hang') :
elif attr in (b'margin-right', b'indent', b'margin-left', b'hang') :
scale = self.pw
elif attr == 'line-space':
elif attr == b'line-space':
scale = self.fontsize * 2.0
else:
print("Scale not defined!")
scale = 1.0

if val == "":
val = 0

if not ((attr == 'hang') and (int(val) == 0)):
if not ((attr == b'hang') and (int(val) == 0)):
try:
f = float(val)
except:
@@ -198,32 +203,32 @@ class DocParser(object):
if debug: print('keeping style')
# make sure line-space does not go below 100% or above 300% since
# it can be wacky in some styles
if 'line-space' in cssargs:
seg = cssargs['line-space'][0]
val = cssargs['line-space'][1]
if b'line-space' in cssargs:
seg = cssargs[b'line-space'][0]
val = cssargs[b'line-space'][1]
if val < 1.0: val = 1.0
if val > 3.0: val = 3.0
del cssargs['line-space']
cssargs['line-space'] = (self.attr_val_map['line-space'], val)
del cssargs[b'line-space']
cssargs[b'line-space'] = (self.attr_val_map[b'line-space'], val)


# handle modifications for css style hanging indents
if 'hang' in cssargs:
hseg = cssargs['hang'][0]
hval = cssargs['hang'][1]
del cssargs['hang']
cssargs['hang'] = (self.attr_val_map['hang'], -hval)
if b'hang' in cssargs:
hseg = cssargs[b'hang'][0]
hval = cssargs[b'hang'][1]
del cssargs[b'hang']
cssargs[b'hang'] = (self.attr_val_map[b'hang'], -hval)
mval = 0
mseg = 'margin-left: '
mval = hval
if 'margin-left' in cssargs:
mseg = cssargs['margin-left'][0]
mval = cssargs['margin-left'][1]
if b'margin-left' in cssargs:
mseg = cssargs[b'margin-left'][0]
mval = cssargs[b'margin-left'][1]
if mval < 0: mval = 0
mval = hval + mval
cssargs['margin-left'] = (mseg, mval)
if 'indent' in cssargs:
del cssargs['indent']
cssargs[b'margin-left'] = (mseg, mval)
if b'indent' in cssargs:
del cssargs[b'indent']

cssline = sclass + ' { '
for key in iter(cssargs):
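The hanging-indent fix-up above composes two CSS properties: the hang value becomes a negative text-indent, and margin-left is bumped by the same amount so the first line still starts at the original left edge. Worked in miniature with plain floats (ignoring the bytes keys):

# hang = 2.0 with an existing margin-left of 1.0
hval, mval = 2.0, 1.0
text_indent = -hval          # first line pulled left
margin_left = hval + mval    # block pushed right to compensate
print('text-indent: {0}; margin-left: {1};'.format(text_indent, margin_left))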
@@ -173,7 +173,7 @@ def decryptRecord(data,PID):
def decryptDkeyRecord(data,PID):
record = decryptRecord(data,PID)
fields = unpack('3sB8sB8s3s',record)
if fields[0] != 'PID' or fields[5] != 'pid' :
if fields[0] != b'PID' or fields[5] != b'pid' :
raise DrmException("Didn't find PID magic numbers in record")
elif fields[1] != 8 or fields[3] != 8 :
raise DrmException("Record didn't contain correct length fields")
@@ -183,11 +183,11 @@ def decryptDkeyRecord(data,PID):

# Decrypt all dkey records (contain the book PID)
def decryptDkeyRecords(data,PID):
nbKeyRecords = ord(data[0])
nbKeyRecords = data[0]
records = []
data = data[1:]
for i in range (0,nbKeyRecords):
length = ord(data[0])
length = data[0]
try:
key = decryptDkeyRecord(data[1:length+1],PID)
records.append(key)
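Indexing bytes already yields ints, which is all this parser needs: a count byte followed by length-prefixed records. A toy walk over the same layout (decryption omitted; the real loop advances through data in lines not shown here):

def walk_records(data):
    nb = data[0]              # an int in Python 3, no ord() required
    data = data[1:]
    records = []
    for _ in range(nb):
        length = data[0]
        records.append(data[1:length + 1])   # record body after its length byte
        data = data[length + 1:]             # hop to the next record
    return records

print(walk_records(b'\x02\x03abc\x02xy'))    # [b'abc', b'xy']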
@@ -209,7 +209,7 @@ class TopazBook:
self.bookMetadata = {}
self.bookKey = None
magic = unpack('4s',self.fo.read(4))[0]
if magic != 'TPZ0':
if magic != b'TPZ0':
raise DrmException("Parse Error : Invalid Header, not a Topaz file")
self.parseTopazHeaders()
self.parseMetadata()
@@ -244,9 +244,9 @@ class TopazBook:

def parseMetadata(self):
# Parse the metadata record from the book payload and return a list of [key,values]
self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords['metadata'][0][0])
self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords[b'metadata'][0][0])
tag = bookReadString(self.fo)
if tag != 'metadata' :
if tag != b'metadata' :
raise DrmException("Parse Error : Record Names Don't Match")
flags = ord(self.fo.read(1))
nbRecords = ord(self.fo.read(1))
@@ -260,18 +260,18 @@ class TopazBook:
return self.bookMetadata

def getPIDMetaInfo(self):
keysRecord = self.bookMetadata.get('keys','')
keysRecordRecord = ''
if keysRecord != '':
keylst = keysRecord.split(',')
keysRecord = self.bookMetadata.get(b'keys',b'')
keysRecordRecord = b''
if keysRecord != b'':
keylst = keysRecord.split(b',')
for keyval in keylst:
keysRecordRecord += self.bookMetadata.get(keyval,'')
keysRecordRecord += self.bookMetadata.get(keyval,b'')
return keysRecord, keysRecordRecord

def getBookTitle(self):
title = ''
if 'Title' in self.bookMetadata:
title = self.bookMetadata['Title']
title = b''
if b'Title' in self.bookMetadata:
title = self.bookMetadata[b'Title']
return title.decode('utf-8')

def setBookKey(self, key):
@@ -323,7 +323,7 @@ class TopazBook:
raw = 0
fixedimage=True
try:
keydata = self.getBookPayloadRecord('dkey', 0)
keydata = self.getBookPayloadRecord(b'dkey', 0)
except DrmException as e:
print("no dkey record found, book may not be encrypted")
print("attempting to extract files without a book key")
@@ -354,7 +354,7 @@ class TopazBook:
pass
else:
bookKey = bookKeys[0]
print("Book Key Found! ({0})".format(bookKey.encode('hex')))
print("Book Key Found! ({0})".format(bookKey.hex()))
break

if not bookKey:
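Python 3 bytes have no .encode('hex'); the built-in bytes.hex() method replaces it for this debug print. For example:

bookKey = b'\xde\xad\xbe\xef'
print("Book Key Found! ({0})".format(bookKey.hex()))   # deadbeef
# bookKey.encode('hex') was Python 2 only; bytes have no .encode() at all now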
@@ -396,26 +396,26 @@ class TopazBook:
outdir = self.outdir
for headerRecord in self.bookHeaderRecords:
name = headerRecord
if name != 'dkey':
if name != b'dkey':
ext = ".dat"
if name == 'img': ext = ".jpg"
if name == 'color' : ext = ".jpg"
print("Processing Section: {0}\n. . .".format(name), end=' ')
if name == b'img': ext = ".jpg"
if name == b'color' : ext = ".jpg"
print("Processing Section: {0}\n. . .".format(name.decode('utf-8')), end=' ')
for index in range (0,len(self.bookHeaderRecords[name])) :
fname = "{0}{1:04d}{2}".format(name,index,ext)
fname = "{0}{1:04d}{2}".format(name.decode('utf-8'),index,ext)
destdir = outdir
if name == 'img':
if name == b'img':
destdir = os.path.join(outdir,"img")
if name == 'color':
if name == b'color':
destdir = os.path.join(outdir,"color_img")
if name == 'page':
if name == b'page':
destdir = os.path.join(outdir,"page")
if name == 'glyphs':
if name == b'glyphs':
destdir = os.path.join(outdir,"glyphs")
outputFile = os.path.join(destdir,fname)
print(".", end=' ')
record = self.getBookPayloadRecord(name,index)
if record != '':
if record != b'':
open(outputFile, 'wb').write(record)
print(" ")