More fixes for Amazon books, fixing identity checks, started on Topaz.
This commit is contained in:
parent
dc27c36761
commit
939cdbb0c9
|
@ -56,7 +56,7 @@ def readEncodedNumber(file):
|
||||||
c = file.read(1)
|
c = file.read(1)
|
||||||
if (len(c) == 0):
|
if (len(c) == 0):
|
||||||
return None
|
return None
|
||||||
data = ord(c)
|
data = c[0]
|
||||||
datax = (datax <<7) + (data & 0x7F)
|
datax = (datax <<7) + (data & 0x7F)
|
||||||
data = datax
|
data = datax
|
||||||
|
|
||||||
|
@ -188,232 +188,232 @@ class PageParser(object):
|
||||||
# Table of every known Topaz token.
#
# tag : (number of arguments, argument type, subtags present,
#        special case: subtags present when escaped)
#
# Keys are bytes because token names are read from binary files.
# NOTE(review): the argument-type names in the tuples are still plain `str`
# ('number', 'text', 'scalar_text', 'snippets', ...), so any code comparing an
# argument type must compare against `str`, not bytes - confirm call sites.
#
# The original literal listed b'paragraph.lastWord' twice with identical
# values; the redundant duplicate key has been dropped.
token_tags = {
    b'x': (1, 'scalar_number', 0, 0),
    b'y': (1, 'scalar_number', 0, 0),
    b'h': (1, 'scalar_number', 0, 0),
    b'w': (1, 'scalar_number', 0, 0),
    b'firstWord': (1, 'scalar_number', 0, 0),
    b'lastWord': (1, 'scalar_number', 0, 0),
    b'rootID': (1, 'scalar_number', 0, 0),
    b'stemID': (1, 'scalar_number', 0, 0),
    b'type': (1, 'scalar_text', 0, 0),

    b'info': (0, 'number', 1, 0),

    b'info.word': (0, 'number', 1, 1),
    b'info.word.ocrText': (1, 'text', 0, 0),
    b'info.word.firstGlyph': (1, 'raw', 0, 0),
    b'info.word.lastGlyph': (1, 'raw', 0, 0),
    b'info.word.bl': (1, 'raw', 0, 0),
    b'info.word.link_id': (1, 'number', 0, 0),

    b'glyph': (0, 'number', 1, 1),
    b'glyph.x': (1, 'number', 0, 0),
    b'glyph.y': (1, 'number', 0, 0),
    b'glyph.glyphID': (1, 'number', 0, 0),

    b'dehyphen': (0, 'number', 1, 1),
    b'dehyphen.rootID': (1, 'number', 0, 0),
    b'dehyphen.stemID': (1, 'number', 0, 0),
    b'dehyphen.stemPage': (1, 'number', 0, 0),
    b'dehyphen.sh': (1, 'number', 0, 0),

    b'links': (0, 'number', 1, 1),
    b'links.page': (1, 'number', 0, 0),
    b'links.rel': (1, 'number', 0, 0),
    b'links.row': (1, 'number', 0, 0),
    b'links.title': (1, 'text', 0, 0),
    b'links.href': (1, 'text', 0, 0),
    b'links.type': (1, 'text', 0, 0),
    b'links.id': (1, 'number', 0, 0),

    b'paraCont': (0, 'number', 1, 1),
    b'paraCont.rootID': (1, 'number', 0, 0),
    b'paraCont.stemID': (1, 'number', 0, 0),
    b'paraCont.stemPage': (1, 'number', 0, 0),

    b'paraStems': (0, 'number', 1, 1),
    b'paraStems.stemID': (1, 'number', 0, 0),

    b'wordStems': (0, 'number', 1, 1),
    b'wordStems.stemID': (1, 'number', 0, 0),

    b'empty': (1, 'snippets', 1, 0),

    b'page': (1, 'snippets', 1, 0),
    b'page.class': (1, 'scalar_text', 0, 0),
    b'page.pageid': (1, 'scalar_text', 0, 0),
    b'page.pagelabel': (1, 'scalar_text', 0, 0),
    b'page.type': (1, 'scalar_text', 0, 0),
    b'page.h': (1, 'scalar_number', 0, 0),
    b'page.w': (1, 'scalar_number', 0, 0),
    b'page.startID': (1, 'scalar_number', 0, 0),

    b'group': (1, 'snippets', 1, 0),
    b'group.class': (1, 'scalar_text', 0, 0),
    b'group.type': (1, 'scalar_text', 0, 0),
    b'group._tag': (1, 'scalar_text', 0, 0),
    b'group.orientation': (1, 'scalar_text', 0, 0),

    b'region': (1, 'snippets', 1, 0),
    b'region.class': (1, 'scalar_text', 0, 0),
    b'region.type': (1, 'scalar_text', 0, 0),
    b'region.x': (1, 'scalar_number', 0, 0),
    b'region.y': (1, 'scalar_number', 0, 0),
    b'region.h': (1, 'scalar_number', 0, 0),
    b'region.w': (1, 'scalar_number', 0, 0),
    b'region.orientation': (1, 'scalar_text', 0, 0),

    b'empty_text_region': (1, 'snippets', 1, 0),

    b'img': (1, 'snippets', 1, 0),
    b'img.x': (1, 'scalar_number', 0, 0),
    b'img.y': (1, 'scalar_number', 0, 0),
    b'img.h': (1, 'scalar_number', 0, 0),
    b'img.w': (1, 'scalar_number', 0, 0),
    b'img.src': (1, 'scalar_number', 0, 0),
    b'img.color_src': (1, 'scalar_number', 0, 0),
    b'img.gridSize': (1, 'scalar_number', 0, 0),
    b'img.gridBottomCenter': (1, 'scalar_number', 0, 0),
    b'img.gridTopCenter': (1, 'scalar_number', 0, 0),
    b'img.gridBeginCenter': (1, 'scalar_number', 0, 0),
    b'img.gridEndCenter': (1, 'scalar_number', 0, 0),
    b'img.image_type': (1, 'scalar_number', 0, 0),

    b'paragraph': (1, 'snippets', 1, 0),
    b'paragraph.class': (1, 'scalar_text', 0, 0),
    b'paragraph.firstWord': (1, 'scalar_number', 0, 0),
    b'paragraph.lastWord': (1, 'scalar_number', 0, 0),
    b'paragraph.gridSize': (1, 'scalar_number', 0, 0),
    b'paragraph.gridBottomCenter': (1, 'scalar_number', 0, 0),
    b'paragraph.gridTopCenter': (1, 'scalar_number', 0, 0),
    b'paragraph.gridBeginCenter': (1, 'scalar_number', 0, 0),
    b'paragraph.gridEndCenter': (1, 'scalar_number', 0, 0),

    b'word_semantic': (1, 'snippets', 1, 1),
    b'word_semantic.type': (1, 'scalar_text', 0, 0),
    b'word_semantic.class': (1, 'scalar_text', 0, 0),
    b'word_semantic.firstWord': (1, 'scalar_number', 0, 0),
    b'word_semantic.lastWord': (1, 'scalar_number', 0, 0),
    b'word_semantic.gridBottomCenter': (1, 'scalar_number', 0, 0),
    b'word_semantic.gridTopCenter': (1, 'scalar_number', 0, 0),
    b'word_semantic.gridBeginCenter': (1, 'scalar_number', 0, 0),
    b'word_semantic.gridEndCenter': (1, 'scalar_number', 0, 0),

    b'word': (1, 'snippets', 1, 0),
    b'word.type': (1, 'scalar_text', 0, 0),
    b'word.class': (1, 'scalar_text', 0, 0),
    b'word.firstGlyph': (1, 'scalar_number', 0, 0),
    b'word.lastGlyph': (1, 'scalar_number', 0, 0),

    b'_span': (1, 'snippets', 1, 0),
    b'_span.class': (1, 'scalar_text', 0, 0),
    b'_span.firstWord': (1, 'scalar_number', 0, 0),
    b'_span.lastWord': (1, 'scalar_number', 0, 0),
    b'_span.gridSize': (1, 'scalar_number', 0, 0),
    b'_span.gridBottomCenter': (1, 'scalar_number', 0, 0),
    b'_span.gridTopCenter': (1, 'scalar_number', 0, 0),
    b'_span.gridBeginCenter': (1, 'scalar_number', 0, 0),
    b'_span.gridEndCenter': (1, 'scalar_number', 0, 0),

    b'span': (1, 'snippets', 1, 0),
    b'span.firstWord': (1, 'scalar_number', 0, 0),
    b'span.lastWord': (1, 'scalar_number', 0, 0),
    b'span.gridSize': (1, 'scalar_number', 0, 0),
    b'span.gridBottomCenter': (1, 'scalar_number', 0, 0),
    b'span.gridTopCenter': (1, 'scalar_number', 0, 0),
    b'span.gridBeginCenter': (1, 'scalar_number', 0, 0),
    b'span.gridEndCenter': (1, 'scalar_number', 0, 0),

    b'extratokens': (1, 'snippets', 1, 0),
    b'extratokens.class': (1, 'scalar_text', 0, 0),
    b'extratokens.type': (1, 'scalar_text', 0, 0),
    b'extratokens.firstGlyph': (1, 'scalar_number', 0, 0),
    b'extratokens.lastGlyph': (1, 'scalar_number', 0, 0),
    b'extratokens.gridSize': (1, 'scalar_number', 0, 0),
    b'extratokens.gridBottomCenter': (1, 'scalar_number', 0, 0),
    b'extratokens.gridTopCenter': (1, 'scalar_number', 0, 0),
    b'extratokens.gridBeginCenter': (1, 'scalar_number', 0, 0),
    b'extratokens.gridEndCenter': (1, 'scalar_number', 0, 0),

    b'glyph.h': (1, 'number', 0, 0),
    b'glyph.w': (1, 'number', 0, 0),
    b'glyph.use': (1, 'number', 0, 0),
    b'glyph.vtx': (1, 'number', 0, 1),
    b'glyph.len': (1, 'number', 0, 1),
    b'glyph.dpi': (1, 'number', 0, 0),
    b'vtx': (0, 'number', 1, 1),
    b'vtx.x': (1, 'number', 0, 0),
    b'vtx.y': (1, 'number', 0, 0),
    b'len': (0, 'number', 1, 1),
    b'len.n': (1, 'number', 0, 0),

    b'book': (1, 'snippets', 1, 0),
    b'version': (1, 'snippets', 1, 0),
    b'version.FlowEdit_1_id': (1, 'scalar_text', 0, 0),
    b'version.FlowEdit_1_version': (1, 'scalar_text', 0, 0),
    b'version.Schema_id': (1, 'scalar_text', 0, 0),
    b'version.Schema_version': (1, 'scalar_text', 0, 0),
    b'version.Topaz_version': (1, 'scalar_text', 0, 0),
    b'version.WordDetailEdit_1_id': (1, 'scalar_text', 0, 0),
    b'version.WordDetailEdit_1_version': (1, 'scalar_text', 0, 0),
    b'version.ZoneEdit_1_id': (1, 'scalar_text', 0, 0),
    b'version.ZoneEdit_1_version': (1, 'scalar_text', 0, 0),
    b'version.chapterheaders': (1, 'scalar_text', 0, 0),
    b'version.creation_date': (1, 'scalar_text', 0, 0),
    b'version.header_footer': (1, 'scalar_text', 0, 0),
    b'version.init_from_ocr': (1, 'scalar_text', 0, 0),
    b'version.letter_insertion': (1, 'scalar_text', 0, 0),
    b'version.xmlinj_convert': (1, 'scalar_text', 0, 0),
    b'version.xmlinj_reflow': (1, 'scalar_text', 0, 0),
    b'version.xmlinj_transform': (1, 'scalar_text', 0, 0),
    b'version.findlists': (1, 'scalar_text', 0, 0),
    b'version.page_num': (1, 'scalar_text', 0, 0),
    b'version.page_type': (1, 'scalar_text', 0, 0),
    b'version.bad_text': (1, 'scalar_text', 0, 0),
    b'version.glyph_mismatch': (1, 'scalar_text', 0, 0),
    b'version.margins': (1, 'scalar_text', 0, 0),
    b'version.staggered_lines': (1, 'scalar_text', 0, 0),
    b'version.paragraph_continuation': (1, 'scalar_text', 0, 0),
    b'version.toc': (1, 'scalar_text', 0, 0),

    b'stylesheet': (1, 'snippets', 1, 0),
    b'style': (1, 'snippets', 1, 0),
    b'style._tag': (1, 'scalar_text', 0, 0),
    b'style.type': (1, 'scalar_text', 0, 0),
    b'style._after_type': (1, 'scalar_text', 0, 0),
    b'style._parent_type': (1, 'scalar_text', 0, 0),
    b'style._after_parent_type': (1, 'scalar_text', 0, 0),
    b'style.class': (1, 'scalar_text', 0, 0),
    b'style._after_class': (1, 'scalar_text', 0, 0),
    b'rule': (1, 'snippets', 1, 0),
    b'rule.attr': (1, 'scalar_text', 0, 0),
    b'rule.value': (1, 'scalar_text', 0, 0),

    b'original': (0, 'number', 1, 1),
    b'original.pnum': (1, 'number', 0, 0),
    b'original.pid': (1, 'text', 0, 0),
    b'pages': (0, 'number', 1, 1),
    b'pages.ref': (1, 'number', 0, 0),
    b'pages.id': (1, 'number', 0, 0),
    b'startID': (0, 'number', 1, 1),
    b'startID.page': (1, 'number', 0, 0),
    b'startID.id': (1, 'number', 0, 0),

    b'median_d': (1, 'number', 0, 0),
    b'median_h': (1, 'number', 0, 0),
    b'median_firsty': (1, 'number', 0, 0),
    b'median_lasty': (1, 'number', 0, 0),

    b'num_footers_maybe': (1, 'number', 0, 0),
    b'num_footers_yes': (1, 'number', 0, 0),
    b'num_headers_maybe': (1, 'number', 0, 0),
    b'num_headers_yes': (1, 'number', 0, 0),

    b'tracking': (1, 'number', 0, 0),
    b'src': (1, 'text', 0, 0),
}
|
||||||
|
|
||||||
|
@ -430,7 +430,7 @@ class PageParser(object):
|
||||||
cnt = len(self.tagpath)
|
cnt = len(self.tagpath)
|
||||||
if i < cnt : result = self.tagpath[i]
|
if i < cnt : result = self.tagpath[i]
|
||||||
for j in range(i+1, cnt) :
|
for j in range(i+1, cnt) :
|
||||||
result += '.' + self.tagpath[j]
|
result += b'.' + self.tagpath[j]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@ -505,7 +505,7 @@ class PageParser(object):
|
||||||
|
|
||||||
if (subtags == 1):
|
if (subtags == 1):
|
||||||
ntags = readEncodedNumber(self.fo)
|
ntags = readEncodedNumber(self.fo)
|
||||||
if self.debug : print('subtags: ' + token + ' has ' + str(ntags))
|
if self.debug : print('subtags: ', token , ' has ' , str(ntags))
|
||||||
for j in range(ntags):
|
for j in range(ntags):
|
||||||
val = readEncodedNumber(self.fo)
|
val = readEncodedNumber(self.fo)
|
||||||
subtagres.append(self.procToken(self.dict.lookup(val)))
|
subtagres.append(self.procToken(self.dict.lookup(val)))
|
||||||
|
@ -613,7 +613,7 @@ class PageParser(object):
|
||||||
subtagList = tag[1]
|
subtagList = tag[1]
|
||||||
argtype = tag[2]
|
argtype = tag[2]
|
||||||
argList = tag[3]
|
argList = tag[3]
|
||||||
nname = prefix + '.' + name
|
nname = prefix + b'.' + name
|
||||||
nsubtaglist = []
|
nsubtaglist = []
|
||||||
for j in subtagList:
|
for j in subtagList:
|
||||||
nsubtaglist.append(self.updateName(j,prefix))
|
nsubtaglist.append(self.updateName(j,prefix))
|
||||||
|
@ -662,34 +662,34 @@ class PageParser(object):
|
||||||
subtagList = node[1]
|
subtagList = node[1]
|
||||||
argtype = node[2]
|
argtype = node[2]
|
||||||
argList = node[3]
|
argList = node[3]
|
||||||
fullpathname = name.split('.')
|
fullpathname = name.split(b'.')
|
||||||
nodename = fullpathname.pop()
|
nodename = fullpathname.pop()
|
||||||
ilvl = len(fullpathname)
|
ilvl = len(fullpathname)
|
||||||
indent = ' ' * (3 * ilvl)
|
indent = b' ' * (3 * ilvl)
|
||||||
rlst = []
|
rlst = []
|
||||||
rlst.append(indent + '<' + nodename + '>')
|
rlst.append(indent + b'<' + nodename + b'>')
|
||||||
if len(argList) > 0:
|
if len(argList) > 0:
|
||||||
alst = []
|
alst = []
|
||||||
for j in argList:
|
for j in argList:
|
||||||
if (argtype == 'text') or (argtype == 'scalar_text') :
|
if (argtype == b'text') or (argtype == b'scalar_text') :
|
||||||
alst.append(j + '|')
|
alst.append(j + b'|')
|
||||||
else :
|
else :
|
||||||
alst.append(str(j) + ',')
|
alst.append(str(j).encode('utf-8') + b',')
|
||||||
argres = "".join(alst)
|
argres = b"".join(alst)
|
||||||
argres = argres[0:-1]
|
argres = argres[0:-1]
|
||||||
if argtype == 'snippets' :
|
if argtype == b'snippets' :
|
||||||
rlst.append('snippets:' + argres)
|
rlst.append(b'snippets:' + argres)
|
||||||
else :
|
else :
|
||||||
rlst.append(argres)
|
rlst.append(argres)
|
||||||
if len(subtagList) > 0 :
|
if len(subtagList) > 0 :
|
||||||
rlst.append('\n')
|
rlst.append(b'\n')
|
||||||
for j in subtagList:
|
for j in subtagList:
|
||||||
if len(j) > 0 :
|
if len(j) > 0 :
|
||||||
rlst.append(self.formatTag(j))
|
rlst.append(self.formatTag(j))
|
||||||
rlst.append(indent + '</' + nodename + '>\n')
|
rlst.append(indent + b'</' + nodename + b'>\n')
|
||||||
else:
|
else:
|
||||||
rlst.append('</' + nodename + '>\n')
|
rlst.append(b'</' + nodename + b'>\n')
|
||||||
return "".join(rlst)
|
return b"".join(rlst)
|
||||||
|
|
||||||
|
|
||||||
# flatten tag
|
# flatten tag
|
||||||
|
@ -704,20 +704,20 @@ class PageParser(object):
|
||||||
alst = []
|
alst = []
|
||||||
for j in argList:
|
for j in argList:
|
||||||
if (argtype == 'text') or (argtype == 'scalar_text') :
|
if (argtype == 'text') or (argtype == 'scalar_text') :
|
||||||
alst.append(j + '|')
|
alst.append(j + b'|')
|
||||||
else :
|
else :
|
||||||
alst.append(str(j) + '|')
|
alst.append(str(j).encode('utf-8') + b'|')
|
||||||
argres = "".join(alst)
|
argres = b"".join(alst)
|
||||||
argres = argres[0:-1]
|
argres = argres[0:-1]
|
||||||
if argtype == 'snippets' :
|
if argtype == b'snippets' :
|
||||||
rlst.append('.snippets=' + argres)
|
rlst.append(b'.snippets=' + argres)
|
||||||
else :
|
else :
|
||||||
rlst.append('=' + argres)
|
rlst.append(b'=' + argres)
|
||||||
rlst.append('\n')
|
rlst.append(b'\n')
|
||||||
for j in subtagList:
|
for j in subtagList:
|
||||||
if len(j) > 0 :
|
if len(j) > 0 :
|
||||||
rlst.append(self.flattenTag(j))
|
rlst.append(self.flattenTag(j))
|
||||||
return "".join(rlst)
|
return b"".join(rlst)
|
||||||
|
|
||||||
|
|
||||||
# reduce create xml output
|
# reduce create xml output
|
||||||
|
@ -729,7 +729,7 @@ class PageParser(object):
|
||||||
rlst.append(self.flattenTag(j))
|
rlst.append(self.flattenTag(j))
|
||||||
else:
|
else:
|
||||||
rlst.append(self.formatTag(j))
|
rlst.append(self.formatTag(j))
|
||||||
result = "".join(rlst)
|
result = b"".join(rlst)
|
||||||
if self.debug : print(result)
|
if self.debug : print(result)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@ -747,16 +747,16 @@ class PageParser(object):
|
||||||
|
|
||||||
# peek at the first bytes to see what type of file it is
|
# peek at the first bytes to see what type of file it is
|
||||||
magic = self.fo.read(9)
|
magic = self.fo.read(9)
|
||||||
if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
|
if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'):
|
||||||
first_token = 'info'
|
first_token = b'info'
|
||||||
elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
|
elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'):
|
||||||
skip = self.fo.read(2)
|
skip = self.fo.read(2)
|
||||||
first_token = 'info'
|
first_token = b'info'
|
||||||
elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
|
elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'):
|
||||||
first_token = 'info'
|
first_token = b'info'
|
||||||
elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
|
elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'):
|
||||||
skip = self.fo.read(3)
|
skip = self.fo.read(3)
|
||||||
first_token = 'info'
|
first_token = b'info'
|
||||||
else :
|
else :
|
||||||
# other0.dat file
|
# other0.dat file
|
||||||
first_token = None
|
first_token = None
|
||||||
|
@ -778,7 +778,7 @@ class PageParser(object):
|
||||||
break
|
break
|
||||||
|
|
||||||
if (v == 0x72):
|
if (v == 0x72):
|
||||||
self.doLoop72('number')
|
self.doLoop72(b'number')
|
||||||
elif (v > 0) and (v < self.dict.getSize()) :
|
elif (v > 0) and (v < self.dict.getSize()) :
|
||||||
tag = self.procToken(self.dict.lookup(v))
|
tag = self.procToken(self.dict.lookup(v))
|
||||||
if len(tag) > 0 :
|
if len(tag) > 0 :
|
||||||
|
@ -789,7 +789,7 @@ class PageParser(object):
|
||||||
if (v == 0):
|
if (v == 0):
|
||||||
if (self.peek(1) == 0x5f):
|
if (self.peek(1) == 0x5f):
|
||||||
skip = self.fo.read(1)
|
skip = self.fo.read(1)
|
||||||
first_token = 'info'
|
first_token = b'info'
|
||||||
|
|
||||||
# now do snippet injection
|
# now do snippet injection
|
||||||
if len(self.snippetList) > 0 :
|
if len(self.snippetList) > 0 :
|
||||||
|
@ -809,14 +809,14 @@ class PageParser(object):
|
||||||
|
|
||||||
def fromData(dict, fname, debug=False):
    """Parse the page file *fname* and return the flattened form of it.

    Builds a ``PageParser`` with ``flat_xml=True`` and returns whatever its
    ``process()`` method produces.

    Args:
        dict: token dictionary handed straight to ``PageParser``. The name
            shadows the builtin but is kept because it is part of the
            existing call signature.
        fname: path of the page file to parse.
        debug: when true, ``PageParser`` prints tracing output while it
            works. Defaults to ``False``; the previous body hard-coded
            ``True``, which left debug printing enabled for every caller.

    Returns:
        The result of ``PageParser.process()``.
    """
    flat_xml = True
    parser = PageParser(fname, dict, debug, flat_xml)
    return parser.process()
||||||
|
|
||||||
def getXML(dict, fname, debug=False):
    """Parse the page file *fname* and return the non-flattened XML form.

    Builds a ``PageParser`` with ``flat_xml=False`` and returns whatever its
    ``process()`` method produces.

    Args:
        dict: token dictionary handed straight to ``PageParser``. The name
            shadows the builtin but is kept because it is part of the
            existing call signature.
        fname: path of the page file to parse.
        debug: when true, ``PageParser`` prints tracing output while it
            works. Defaults to ``False``; the previous body hard-coded
            ``True``, which left debug printing enabled for every caller.

    Returns:
        The result of ``PageParser.process()``.
    """
    flat_xml = False
    parser = PageParser(fname, dict, debug, flat_xml)
    return parser.process()
|
||||||
|
@ -845,7 +845,7 @@ def main(argv):
|
||||||
sys.stderr=SafeUnbuffered(sys.stderr)
|
sys.stderr=SafeUnbuffered(sys.stderr)
|
||||||
dictFile = ""
|
dictFile = ""
|
||||||
pageFile = ""
|
pageFile = ""
|
||||||
debug = False
|
debug = True
|
||||||
flat_xml = False
|
flat_xml = False
|
||||||
printOutput = False
|
printOutput = False
|
||||||
if len(argv) == 0:
|
if len(argv) == 0:
|
||||||
|
|
|
@ -7,6 +7,7 @@ import csv
|
||||||
import os
|
import os
|
||||||
import math
|
import math
|
||||||
import getopt
|
import getopt
|
||||||
|
import functools
|
||||||
from struct import pack
|
from struct import pack
|
||||||
from struct import unpack
|
from struct import unpack
|
||||||
|
|
||||||
|
@ -15,14 +16,14 @@ class DocParser(object):
|
||||||
def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
|
def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
|
||||||
self.id = os.path.basename(fileid).replace('.dat','')
|
self.id = os.path.basename(fileid).replace('.dat','')
|
||||||
self.svgcount = 0
|
self.svgcount = 0
|
||||||
self.docList = flatxml.split('\n')
|
self.docList = flatxml.split(b'\n')
|
||||||
self.docSize = len(self.docList)
|
self.docSize = len(self.docList)
|
||||||
self.classList = {}
|
self.classList = {}
|
||||||
self.bookDir = bookDir
|
self.bookDir = bookDir
|
||||||
self.gdict = gdict
|
self.gdict = gdict
|
||||||
tmpList = classlst.split('\n')
|
tmpList = classlst.split('\n')
|
||||||
for pclass in tmpList:
|
for pclass in tmpList:
|
||||||
if pclass != '':
|
if pclass != b'':
|
||||||
# remove the leading period from the css name
|
# remove the leading period from the css name
|
||||||
cname = pclass[1:]
|
cname = pclass[1:]
|
||||||
self.classList[cname] = True
|
self.classList[cname] = True
|
||||||
|
@ -57,9 +58,9 @@ class DocParser(object):
|
||||||
imgfile = os.path.join(imgDir,imgname)
|
imgfile = os.path.join(imgDir,imgname)
|
||||||
|
|
||||||
# get glyph information
|
# get glyph information
|
||||||
gxList = self.getData('info.glyph.x',0,-1)
|
gxList = self.getData(b'info.glyph.x',0,-1)
|
||||||
gyList = self.getData('info.glyph.y',0,-1)
|
gyList = self.getData(b'info.glyph.y',0,-1)
|
||||||
gidList = self.getData('info.glyph.glyphID',0,-1)
|
gidList = self.getData(b'info.glyph.glyphID',0,-1)
|
||||||
|
|
||||||
gids = []
|
gids = []
|
||||||
maxws = []
|
maxws = []
|
||||||
|
@ -122,11 +123,11 @@ class DocParser(object):
|
||||||
def lineinDoc(self, pos):
    """Return the ``(name, argres)`` pair stored at line *pos* of the document.

    Each entry of ``self.docList`` is a bytes line of the form
    ``name=value`` or just ``name``. The line is split on the first ``=``
    only, so the value part may itself contain ``=``.

    Args:
        pos: index into ``self.docList``.

    Returns:
        ``(name, argres)`` as bytes; ``argres`` is ``b''`` when the line has
        no ``=``. When *pos* is outside ``0 <= pos < self.docSize`` the pair
        ``(b'', b'')`` is returned.
    """
    # Previously an out-of-range position left `name`/`argres` unassigned
    # before the return; give callers a well-defined empty pair instead.
    if pos < 0 or pos >= self.docSize:
        return b'', b''
    # partition() splits on the first '=' and yields an empty tail when the
    # separator is absent, matching the old find()/split('=', 1) logic.
    name, _sep, argres = self.docList[pos].partition(b'=')
    return name, argres
|
||||||
|
|
||||||
|
|
||||||
|
@ -140,11 +141,13 @@ class DocParser(object):
|
||||||
foundat = -1
|
foundat = -1
|
||||||
for j in range(pos, end):
|
for j in range(pos, end):
|
||||||
item = self.docList[j]
|
item = self.docList[j]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argres) = item.split('=',1)
|
(name, argres) = item.split(b'=',1)
|
||||||
else :
|
else :
|
||||||
name = item
|
name = item
|
||||||
argres = ''
|
argres = ''
|
||||||
|
if (isinstance(tagpath,str)):
|
||||||
|
tagpath = tagpath.encode('utf-8')
|
||||||
if name.endswith(tagpath) :
|
if name.endswith(tagpath) :
|
||||||
result = argres
|
result = argres
|
||||||
foundat = j
|
foundat = j
|
||||||
|
@ -170,7 +173,7 @@ class DocParser(object):
|
||||||
argres=[]
|
argres=[]
|
||||||
(foundat, argt) = self.findinDoc(tagpath, pos, end)
|
(foundat, argt) = self.findinDoc(tagpath, pos, end)
|
||||||
if (argt != None) and (len(argt) > 0) :
|
if (argt != None) and (len(argt) > 0) :
|
||||||
argList = argt.split('|')
|
argList = argt.split(b'|')
|
||||||
argres = [ int(strval) for strval in argList]
|
argres = [ int(strval) for strval in argList]
|
||||||
return argres
|
return argres
|
||||||
|
|
||||||
|
@ -191,21 +194,21 @@ class DocParser(object):
|
||||||
|
|
||||||
# also some class names have spaces in them so need to convert to dashes
|
# also some class names have spaces in them so need to convert to dashes
|
||||||
if nclass != None :
|
if nclass != None :
|
||||||
nclass = nclass.replace(' ','-')
|
nclass = nclass.replace(b' ',b'-')
|
||||||
classres = ''
|
classres = b''
|
||||||
nclass = nclass.lower()
|
nclass = nclass.lower()
|
||||||
nclass = 'cl-' + nclass
|
nclass = b'cl-' + nclass
|
||||||
baseclass = ''
|
baseclass = b''
|
||||||
# graphic is the base class for captions
|
# graphic is the base class for captions
|
||||||
if nclass.find('cl-cap-') >=0 :
|
if nclass.find(b'cl-cap-') >=0 :
|
||||||
classres = 'graphic' + ' '
|
classres = b'graphic' + b' '
|
||||||
else :
|
else :
|
||||||
# strip to find baseclass
|
# strip to find baseclass
|
||||||
p = nclass.find('_')
|
p = nclass.find(b'_')
|
||||||
if p > 0 :
|
if p > 0 :
|
||||||
baseclass = nclass[0:p]
|
baseclass = nclass[0:p]
|
||||||
if baseclass in self.classList:
|
if baseclass in self.classList:
|
||||||
classres += baseclass + ' '
|
classres += baseclass + b' '
|
||||||
classres += nclass
|
classres += nclass
|
||||||
nclass = classres
|
nclass = classres
|
||||||
return nclass
|
return nclass
|
||||||
|
@ -225,11 +228,11 @@ class DocParser(object):
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
result = []
|
result = []
|
||||||
(pos, pagetype) = self.findinDoc('page.type',0,-1)
|
(pos, pagetype) = self.findinDoc(b'page.type',0,-1)
|
||||||
|
|
||||||
groupList = self.posinDoc('page.group')
|
groupList = self.posinDoc(b'page.group')
|
||||||
groupregionList = self.posinDoc('page.group.region')
|
groupregionList = self.posinDoc(b'page.group.region')
|
||||||
pageregionList = self.posinDoc('page.region')
|
pageregionList = self.posinDoc(b'page.region')
|
||||||
# integrate into one list
|
# integrate into one list
|
||||||
for j in groupList:
|
for j in groupList:
|
||||||
result.append(('grpbeg',j))
|
result.append(('grpbeg',j))
|
||||||
|
@ -237,7 +240,7 @@ class DocParser(object):
|
||||||
result.append(('gregion',j))
|
result.append(('gregion',j))
|
||||||
for j in pageregionList:
|
for j in pageregionList:
|
||||||
result.append(('pregion',j))
|
result.append(('pregion',j))
|
||||||
result.sort(compare)
|
result.sort(key=functools.cmp_to_key(compare))
|
||||||
|
|
||||||
# insert group end and page end indicators
|
# insert group end and page end indicators
|
||||||
inGroup = False
|
inGroup = False
|
||||||
|
@ -267,33 +270,33 @@ class DocParser(object):
|
||||||
result = []
|
result = []
|
||||||
|
|
||||||
# paragraph
|
# paragraph
|
||||||
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
|
(pos, pclass) = self.findinDoc(b'paragraph.class',start,end)
|
||||||
|
|
||||||
pclass = self.getClass(pclass)
|
pclass = self.getClass(pclass)
|
||||||
|
|
||||||
# if paragraph uses extratokens (extra glyphs) then make it fixed
|
# if paragraph uses extratokens (extra glyphs) then make it fixed
|
||||||
(pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
|
(pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end)
|
||||||
|
|
||||||
# build up a description of the paragraph in result and return it
|
# build up a description of the paragraph in result and return it
|
||||||
# first check for the basic - all words paragraph
|
# first check for the basic - all words paragraph
|
||||||
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
|
(pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end)
|
||||||
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
|
(pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end)
|
||||||
if (sfirst != None) and (slast != None) :
|
if (sfirst != None) and (slast != None) :
|
||||||
first = int(sfirst)
|
first = int(sfirst)
|
||||||
last = int(slast)
|
last = int(slast)
|
||||||
|
|
||||||
makeImage = (regtype == 'vertical') or (regtype == 'table')
|
makeImage = (regtype == b'vertical') or (regtype == b'table')
|
||||||
makeImage = makeImage or (extraglyphs != None)
|
makeImage = makeImage or (extraglyphs != None)
|
||||||
if self.fixedimage:
|
if self.fixedimage:
|
||||||
makeImage = makeImage or (regtype == 'fixed')
|
makeImage = makeImage or (regtype == b'fixed')
|
||||||
|
|
||||||
if (pclass != None):
|
if (pclass != None):
|
||||||
makeImage = makeImage or (pclass.find('.inverted') >= 0)
|
makeImage = makeImage or (pclass.find(b'.inverted') >= 0)
|
||||||
if self.fixedimage :
|
if self.fixedimage :
|
||||||
makeImage = makeImage or (pclass.find('cl-f-') >= 0)
|
makeImage = makeImage or (pclass.find(b'cl-f-') >= 0)
|
||||||
|
|
||||||
# before creating an image make sure glyph info exists
|
# before creating an image make sure glyph info exists
|
||||||
gidList = self.getData('info.glyph.glyphID',0,-1)
|
gidList = self.getData(b'info.glyph.glyphID',0,-1)
|
||||||
|
|
||||||
makeImage = makeImage & (len(gidList) > 0)
|
makeImage = makeImage & (len(gidList) > 0)
|
||||||
|
|
||||||
|
@ -307,8 +310,8 @@ class DocParser(object):
|
||||||
# translate first and last word into first and last glyphs
|
# translate first and last word into first and last glyphs
|
||||||
# and generate inline image and include it
|
# and generate inline image and include it
|
||||||
glyphList = []
|
glyphList = []
|
||||||
firstglyphList = self.getData('word.firstGlyph',0,-1)
|
firstglyphList = self.getData(b'word.firstGlyph',0,-1)
|
||||||
gidList = self.getData('info.glyph.glyphID',0,-1)
|
gidList = self.getData(b'info.glyph.glyphID',0,-1)
|
||||||
firstGlyph = firstglyphList[first]
|
firstGlyph = firstglyphList[first]
|
||||||
if last < len(firstglyphList):
|
if last < len(firstglyphList):
|
||||||
lastGlyph = firstglyphList[last]
|
lastGlyph = firstglyphList[last]
|
||||||
|
@ -326,8 +329,8 @@ class DocParser(object):
|
||||||
for glyphnum in range(firstGlyph, lastGlyph):
|
for glyphnum in range(firstGlyph, lastGlyph):
|
||||||
glyphList.append(glyphnum)
|
glyphList.append(glyphnum)
|
||||||
# include any extratokens if they exist
|
# include any extratokens if they exist
|
||||||
(pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
|
(pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end)
|
||||||
(pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
|
(pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end)
|
||||||
if (sfg != None) and (slg != None):
|
if (sfg != None) and (slg != None):
|
||||||
for glyphnum in range(int(sfg), int(slg)):
|
for glyphnum in range(int(sfg), int(slg)):
|
||||||
glyphList.append(glyphnum)
|
glyphList.append(glyphnum)
|
||||||
|
@ -368,39 +371,39 @@ class DocParser(object):
|
||||||
|
|
||||||
(name, argres) = self.lineinDoc(line)
|
(name, argres) = self.lineinDoc(line)
|
||||||
|
|
||||||
if name.endswith('span.firstWord') :
|
if name.endswith(b'span.firstWord') :
|
||||||
sp_first = int(argres)
|
sp_first = int(argres)
|
||||||
|
|
||||||
elif name.endswith('span.lastWord') :
|
elif name.endswith(b'span.lastWord') :
|
||||||
sp_last = int(argres)
|
sp_last = int(argres)
|
||||||
|
|
||||||
elif name.endswith('word.firstGlyph') :
|
elif name.endswith(b'word.firstGlyph') :
|
||||||
gl_first = int(argres)
|
gl_first = int(argres)
|
||||||
|
|
||||||
elif name.endswith('word.lastGlyph') :
|
elif name.endswith(b'word.lastGlyph') :
|
||||||
gl_last = int(argres)
|
gl_last = int(argres)
|
||||||
|
|
||||||
elif name.endswith('word_semantic.firstWord'):
|
elif name.endswith(b'word_semantic.firstWord'):
|
||||||
ws_first = int(argres)
|
ws_first = int(argres)
|
||||||
|
|
||||||
elif name.endswith('word_semantic.lastWord'):
|
elif name.endswith(b'word_semantic.lastWord'):
|
||||||
ws_last = int(argres)
|
ws_last = int(argres)
|
||||||
|
|
||||||
elif name.endswith('word.class'):
|
elif name.endswith(b'word.class'):
|
||||||
# we only handle spaceafter word class
|
# we only handle spaceafter word class
|
||||||
try:
|
try:
|
||||||
(cname, space) = argres.split('-',1)
|
(cname, space) = argres.split(b'-',1)
|
||||||
if space == '' : space = '0'
|
if space == b'' : space = b'0'
|
||||||
if (cname == 'spaceafter') and (int(space) > 0) :
|
if (cname == b'spaceafter') and (int(space) > 0) :
|
||||||
word_class = 'sa'
|
word_class = 'sa'
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
elif name.endswith('word.img.src'):
|
elif name.endswith(b'word.img.src'):
|
||||||
result.append(('img' + word_class, int(argres)))
|
result.append(('img' + word_class, int(argres)))
|
||||||
word_class = ''
|
word_class = ''
|
||||||
|
|
||||||
elif name.endswith('region.img.src'):
|
elif name.endswith(b'region.img.src'):
|
||||||
result.append(('img' + word_class, int(argres)))
|
result.append(('img' + word_class, int(argres)))
|
||||||
|
|
||||||
if (sp_first != -1) and (sp_last != -1):
|
if (sp_first != -1) and (sp_last != -1):
|
||||||
|
@ -437,7 +440,7 @@ class DocParser(object):
|
||||||
|
|
||||||
classres = ''
|
classres = ''
|
||||||
if pclass :
|
if pclass :
|
||||||
classres = ' class="' + pclass + '"'
|
classres = ' class="' + pclass.decode('utf-8') + '"'
|
||||||
|
|
||||||
br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')
|
br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')
|
||||||
|
|
||||||
|
@ -470,8 +473,8 @@ class DocParser(object):
|
||||||
if (link > 0):
|
if (link > 0):
|
||||||
linktype = self.link_type[link-1]
|
linktype = self.link_type[link-1]
|
||||||
title = self.link_title[link-1]
|
title = self.link_title[link-1]
|
||||||
if (title == "") or (parares.rfind(title) < 0):
|
if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0):
|
||||||
title=parares[lstart:]
|
title=parares[lstart:].encode('utf-8')
|
||||||
if linktype == 'external' :
|
if linktype == 'external' :
|
||||||
linkhref = self.link_href[link-1]
|
linkhref = self.link_href[link-1]
|
||||||
linkhtml = '<a href="%s">' % linkhref
|
linkhtml = '<a href="%s">' % linkhref
|
||||||
|
@ -482,33 +485,34 @@ class DocParser(object):
|
||||||
else :
|
else :
|
||||||
# just link to the current page
|
# just link to the current page
|
||||||
linkhtml = '<a href="#' + self.id + '">'
|
linkhtml = '<a href="#' + self.id + '">'
|
||||||
linkhtml += title + '</a>'
|
linkhtml += title.decode('utf-8')
|
||||||
pos = parares.rfind(title)
|
linkhtml += '</a>'
|
||||||
|
pos = parares.rfind(title.decode('utf-8'))
|
||||||
if pos >= 0:
|
if pos >= 0:
|
||||||
parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
|
parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
|
||||||
else :
|
else :
|
||||||
parares += linkhtml
|
parares += linkhtml
|
||||||
lstart = len(parares)
|
lstart = len(parares)
|
||||||
if word == '_link_' : word = ''
|
if word == b'_link_' : word = b''
|
||||||
elif (link < 0) :
|
elif (link < 0) :
|
||||||
if word == '_link_' : word = ''
|
if word == b'_link_' : word = b''
|
||||||
|
|
||||||
if word == '_lb_':
|
if word == b'_lb_':
|
||||||
if ((num-1) in self.dehyphen_rootid ) or handle_links:
|
if ((num-1) in self.dehyphen_rootid ) or handle_links:
|
||||||
word = ''
|
word = b''
|
||||||
sep = ''
|
sep = ''
|
||||||
elif br_lb :
|
elif br_lb :
|
||||||
word = '<br />\n'
|
word = b'<br />\n'
|
||||||
sep = ''
|
sep = ''
|
||||||
else :
|
else :
|
||||||
word = '\n'
|
word = b'\n'
|
||||||
sep = ''
|
sep = ''
|
||||||
|
|
||||||
if num in self.dehyphen_rootid :
|
if num in self.dehyphen_rootid :
|
||||||
word = word[0:-1]
|
word = word[0:-1]
|
||||||
sep = ''
|
sep = ''
|
||||||
|
|
||||||
parares += word + sep
|
parares += word.decode('utf-8') + sep
|
||||||
|
|
||||||
elif wtype == 'img' :
|
elif wtype == 'img' :
|
||||||
sep = ''
|
sep = ''
|
||||||
|
@ -522,7 +526,9 @@ class DocParser(object):
|
||||||
|
|
||||||
elif wtype == 'svg' :
|
elif wtype == 'svg' :
|
||||||
sep = ''
|
sep = ''
|
||||||
parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
|
parares += '<img src="img/'
|
||||||
|
parares += self.id
|
||||||
|
parares += '_%04d.svg" alt="" />' % num
|
||||||
parares += sep
|
parares += sep
|
||||||
|
|
||||||
if len(sep) > 0 : parares = parares[0:-1]
|
if len(sep) > 0 : parares = parares[0:-1]
|
||||||
|
@ -545,7 +551,7 @@ class DocParser(object):
|
||||||
(wtype, num) = pdesc[j]
|
(wtype, num) = pdesc[j]
|
||||||
|
|
||||||
if wtype == 'ocr' :
|
if wtype == 'ocr' :
|
||||||
word = self.ocrtext[num]
|
word = self.ocrtext[num].decode('utf-8')
|
||||||
sep = ' '
|
sep = ' '
|
||||||
|
|
||||||
if handle_links:
|
if handle_links:
|
||||||
|
@ -553,7 +559,7 @@ class DocParser(object):
|
||||||
if (link > 0):
|
if (link > 0):
|
||||||
linktype = self.link_type[link-1]
|
linktype = self.link_type[link-1]
|
||||||
title = self.link_title[link-1]
|
title = self.link_title[link-1]
|
||||||
title = title.rstrip('. ')
|
title = title.rstrip(b'. ')
|
||||||
alt_title = parares[lstart:]
|
alt_title = parares[lstart:]
|
||||||
alt_title = alt_title.strip()
|
alt_title = alt_title.strip()
|
||||||
# now strip off the actual printed page number
|
# now strip off the actual printed page number
|
||||||
|
@ -607,38 +613,38 @@ class DocParser(object):
|
||||||
hlst = []
|
hlst = []
|
||||||
|
|
||||||
# get the ocr text
|
# get the ocr text
|
||||||
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
|
(pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1)
|
||||||
if argres : self.ocrtext = argres.split('|')
|
if argres : self.ocrtext = argres.split(b'|')
|
||||||
|
|
||||||
# get information to dehyphenate the text
|
# get information to dehyphenate the text
|
||||||
self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
|
self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1)
|
||||||
|
|
||||||
# determine if first paragraph is continued from previous page
|
# determine if first paragraph is continued from previous page
|
||||||
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
|
(pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1)
|
||||||
first_para_continued = (self.parastems_stemid != None)
|
first_para_continued = (self.parastems_stemid != None)
|
||||||
|
|
||||||
# determine if last paragraph is continued onto the next page
|
# determine if last paragraph is continued onto the next page
|
||||||
(pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
|
(pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1)
|
||||||
last_para_continued = (self.paracont_stemid != None)
|
last_para_continued = (self.paracont_stemid != None)
|
||||||
|
|
||||||
# collect link ids
|
# collect link ids
|
||||||
self.link_id = self.getData('info.word.link_id',0,-1)
|
self.link_id = self.getData(b'info.word.link_id',0,-1)
|
||||||
|
|
||||||
# collect link destination page numbers
|
# collect link destination page numbers
|
||||||
self.link_page = self.getData('info.links.page',0,-1)
|
self.link_page = self.getData(b'info.links.page',0,-1)
|
||||||
|
|
||||||
# collect link types (container versus external)
|
# collect link types (container versus external)
|
||||||
(pos, argres) = self.findinDoc('info.links.type',0,-1)
|
(pos, argres) = self.findinDoc(b'info.links.type',0,-1)
|
||||||
if argres : self.link_type = argres.split('|')
|
if argres : self.link_type = argres.split(b'|')
|
||||||
|
|
||||||
# collect link destinations
|
# collect link destinations
|
||||||
(pos, argres) = self.findinDoc('info.links.href',0,-1)
|
(pos, argres) = self.findinDoc(b'info.links.href',0,-1)
|
||||||
if argres : self.link_href = argres.split('|')
|
if argres : self.link_href = argres.split(b'|')
|
||||||
|
|
||||||
# collect link titles
|
# collect link titles
|
||||||
(pos, argres) = self.findinDoc('info.links.title',0,-1)
|
(pos, argres) = self.findinDoc(b'info.links.title',0,-1)
|
||||||
if argres :
|
if argres :
|
||||||
self.link_title = argres.split('|')
|
self.link_title = argres.split(b'|')
|
||||||
else:
|
else:
|
||||||
self.link_title.append('')
|
self.link_title.append('')
|
||||||
|
|
||||||
|
@ -662,51 +668,51 @@ class DocParser(object):
|
||||||
# set anchor for link target on this page
|
# set anchor for link target on this page
|
||||||
if not anchorSet and not first_para_continued:
|
if not anchorSet and not first_para_continued:
|
||||||
hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="')
|
hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="')
|
||||||
hlst.append(self.id + '" title="pagetype_' + pagetype + '"></div>\n')
|
hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '"></div>\n')
|
||||||
anchorSet = True
|
anchorSet = True
|
||||||
|
|
||||||
# handle groups of graphics with text captions
|
# handle groups of graphics with text captions
|
||||||
if (etype == 'grpbeg'):
|
if (etype == b'grpbeg'):
|
||||||
(pos, grptype) = self.findinDoc('group.type', start, end)
|
(pos, grptype) = self.findinDoc(b'group.type', start, end)
|
||||||
if grptype != None:
|
if grptype != None:
|
||||||
if grptype == 'graphic':
|
if grptype == b'graphic':
|
||||||
gcstr = ' class="' + grptype + '"'
|
gcstr = ' class="' + grptype.decode('utf-8') + '"'
|
||||||
hlst.append('<div' + gcstr + '>')
|
hlst.append('<div' + gcstr + '>')
|
||||||
inGroup = True
|
inGroup = True
|
||||||
|
|
||||||
elif (etype == 'grpend'):
|
elif (etype == b'grpend'):
|
||||||
if inGroup:
|
if inGroup:
|
||||||
hlst.append('</div>\n')
|
hlst.append('</div>\n')
|
||||||
inGroup = False
|
inGroup = False
|
||||||
|
|
||||||
else:
|
else:
|
||||||
(pos, regtype) = self.findinDoc('region.type',start,end)
|
(pos, regtype) = self.findinDoc(b'region.type',start,end)
|
||||||
|
|
||||||
if regtype == 'graphic' :
|
if regtype == b'graphic' :
|
||||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
(pos, simgsrc) = self.findinDoc(b'img.src',start,end)
|
||||||
if simgsrc:
|
if simgsrc:
|
||||||
if inGroup:
|
if inGroup:
|
||||||
hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc))
|
hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc))
|
||||||
else:
|
else:
|
||||||
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
|
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
|
||||||
|
|
||||||
elif regtype == 'chapterheading' :
|
elif regtype == b'chapterheading' :
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||||
if not breakSet:
|
if not breakSet:
|
||||||
hlst.append('<div style="page-break-after: always;">&nbsp;</div>\n')
|
hlst.append('<div style="page-break-after: always;">&nbsp;</div>\n')
|
||||||
breakSet = True
|
breakSet = True
|
||||||
tag = 'h1'
|
tag = 'h1'
|
||||||
if pclass and (len(pclass) >= 7):
|
if pclass and (len(pclass) >= 7):
|
||||||
if pclass[3:7] == 'ch1-' : tag = 'h1'
|
if pclass[3:7] == b'ch1-' : tag = 'h1'
|
||||||
if pclass[3:7] == 'ch2-' : tag = 'h2'
|
if pclass[3:7] == b'ch2-' : tag = 'h2'
|
||||||
if pclass[3:7] == 'ch3-' : tag = 'h3'
|
if pclass[3:7] == b'ch3-' : tag = 'h3'
|
||||||
hlst.append('<' + tag + ' class="' + pclass + '">')
|
hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
|
||||||
else:
|
else:
|
||||||
hlst.append('<' + tag + '>')
|
hlst.append('<' + tag + '>')
|
||||||
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
|
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
|
||||||
hlst.append('</' + tag + '>')
|
hlst.append('</' + tag + '>')
|
||||||
|
|
||||||
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
|
elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'):
|
||||||
ptype = 'full'
|
ptype = 'full'
|
||||||
# check to see if this is a continution from the previous page
|
# check to see if this is a continution from the previous page
|
||||||
if first_para_continued :
|
if first_para_continued :
|
||||||
|
@ -715,16 +721,16 @@ class DocParser(object):
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||||
if pclass and (len(pclass) >= 6) and (ptype == 'full'):
|
if pclass and (len(pclass) >= 6) and (ptype == 'full'):
|
||||||
tag = 'p'
|
tag = 'p'
|
||||||
if pclass[3:6] == 'h1-' : tag = 'h4'
|
if pclass[3:6] == b'h1-' : tag = 'h4'
|
||||||
if pclass[3:6] == 'h2-' : tag = 'h5'
|
if pclass[3:6] == b'h2-' : tag = 'h5'
|
||||||
if pclass[3:6] == 'h3-' : tag = 'h6'
|
if pclass[3:6] == b'h3-' : tag = 'h6'
|
||||||
hlst.append('<' + tag + ' class="' + pclass + '">')
|
hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
|
||||||
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
|
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
|
||||||
hlst.append('</' + tag + '>')
|
hlst.append('</' + tag + '>')
|
||||||
else :
|
else :
|
||||||
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
|
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
|
||||||
|
|
||||||
elif (regtype == 'tocentry') :
|
elif (regtype == b'tocentry') :
|
||||||
ptype = 'full'
|
ptype = 'full'
|
||||||
if first_para_continued :
|
if first_para_continued :
|
||||||
ptype = 'end'
|
ptype = 'end'
|
||||||
|
@ -733,7 +739,7 @@ class DocParser(object):
|
||||||
tocinfo += self.buildTOCEntry(pdesc)
|
tocinfo += self.buildTOCEntry(pdesc)
|
||||||
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
|
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
|
||||||
|
|
||||||
elif (regtype == 'vertical') or (regtype == 'table') :
|
elif (regtype == b'vertical') or (regtype == b'table') :
|
||||||
ptype = 'full'
|
ptype = 'full'
|
||||||
if inGroup:
|
if inGroup:
|
||||||
ptype = 'middle'
|
ptype = 'middle'
|
||||||
|
@ -744,19 +750,19 @@ class DocParser(object):
|
||||||
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
|
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
|
||||||
|
|
||||||
|
|
||||||
elif (regtype == 'synth_fcvr.center'):
|
elif (regtype == b'synth_fcvr.center'):
|
||||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
(pos, simgsrc) = self.findinDoc(b'img.src',start,end)
|
||||||
if simgsrc:
|
if simgsrc:
|
||||||
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
|
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
|
||||||
|
|
||||||
else :
|
else :
|
||||||
print(' Making region type', regtype, end=' ')
|
print(' Making region type', regtype, end=' ')
|
||||||
(pos, temp) = self.findinDoc('paragraph',start,end)
|
(pos, temp) = self.findinDoc(b'paragraph',start,end)
|
||||||
(pos2, temp) = self.findinDoc('span',start,end)
|
(pos2, temp) = self.findinDoc(b'span',start,end)
|
||||||
if pos != -1 or pos2 != -1:
|
if pos != -1 or pos2 != -1:
|
||||||
print(' a "text" region')
|
print(' a "text" region')
|
||||||
orig_regtype = regtype
|
orig_regtype = regtype
|
||||||
regtype = 'fixed'
|
regtype = b'fixed'
|
||||||
ptype = 'full'
|
ptype = 'full'
|
||||||
# check to see if this is a continution from the previous page
|
# check to see if this is a continution from the previous page
|
||||||
if first_para_continued :
|
if first_para_continued :
|
||||||
|
@ -764,23 +770,23 @@ class DocParser(object):
|
||||||
first_para_continued = False
|
first_para_continued = False
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||||
if not pclass:
|
if not pclass:
|
||||||
if orig_regtype.endswith('.right') : pclass = 'cl-right'
|
if orig_regtype.endswith(b'.right') : pclass = 'cl-right'
|
||||||
elif orig_regtype.endswith('.center') : pclass = 'cl-center'
|
elif orig_regtype.endswith(b'.center') : pclass = 'cl-center'
|
||||||
elif orig_regtype.endswith('.left') : pclass = 'cl-left'
|
elif orig_regtype.endswith(b'.left') : pclass = 'cl-left'
|
||||||
elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
|
elif orig_regtype.endswith(b'.justify') : pclass = 'cl-justify'
|
||||||
if pclass and (ptype == 'full') and (len(pclass) >= 6):
|
if pclass and (ptype == 'full') and (len(pclass) >= 6):
|
||||||
tag = 'p'
|
tag = 'p'
|
||||||
if pclass[3:6] == 'h1-' : tag = 'h4'
|
if pclass[3:6] == b'h1-' : tag = 'h4'
|
||||||
if pclass[3:6] == 'h2-' : tag = 'h5'
|
if pclass[3:6] == b'h2-' : tag = 'h5'
|
||||||
if pclass[3:6] == 'h3-' : tag = 'h6'
|
if pclass[3:6] == b'h3-' : tag = 'h6'
|
||||||
hlst.append('<' + tag + ' class="' + pclass + '">')
|
hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
|
||||||
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
|
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
|
||||||
hlst.append('</' + tag + '>')
|
hlst.append('</' + tag + '>')
|
||||||
else :
|
else :
|
||||||
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
|
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
|
||||||
else :
|
else :
|
||||||
print(' a "graphic" region')
|
print(' a "graphic" region')
|
||||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
(pos, simgsrc) = self.findinDoc(b'img.src',start,end)
|
||||||
if simgsrc:
|
if simgsrc:
|
||||||
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
|
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ from struct import unpack
|
||||||
class PParser(object):
|
class PParser(object):
|
||||||
def __init__(self, gd, flatxml, meta_array):
|
def __init__(self, gd, flatxml, meta_array):
|
||||||
self.gd = gd
|
self.gd = gd
|
||||||
self.flatdoc = flatxml.split('\n')
|
self.flatdoc = flatxml.split(b'\n')
|
||||||
self.docSize = len(self.flatdoc)
|
self.docSize = len(self.flatdoc)
|
||||||
self.temp = []
|
self.temp = []
|
||||||
|
|
||||||
|
@ -58,11 +58,11 @@ class PParser(object):
|
||||||
def lineinDoc(self, pos) :
|
def lineinDoc(self, pos) :
|
||||||
if (pos >= 0) and (pos < self.docSize) :
|
if (pos >= 0) and (pos < self.docSize) :
|
||||||
item = self.flatdoc[pos]
|
item = self.flatdoc[pos]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argres) = item.split('=',1)
|
(name, argres) = item.split(b'=',1)
|
||||||
else :
|
else :
|
||||||
name = item
|
name = item
|
||||||
argres = ''
|
argres = b''
|
||||||
return name, argres
|
return name, argres
|
||||||
|
|
||||||
# find tag in doc if within pos to end inclusive
|
# find tag in doc if within pos to end inclusive
|
||||||
|
@ -75,11 +75,13 @@ class PParser(object):
|
||||||
foundat = -1
|
foundat = -1
|
||||||
for j in range(pos, end):
|
for j in range(pos, end):
|
||||||
item = self.flatdoc[j]
|
item = self.flatdoc[j]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argres) = item.split('=',1)
|
(name, argres) = item.split(b'=',1)
|
||||||
else :
|
else :
|
||||||
name = item
|
name = item
|
||||||
argres = ''
|
argres = b''
|
||||||
|
if (isinstance(tagpath,str)):
|
||||||
|
tagpath = tagpath.encode('utf-8')
|
||||||
if name.endswith(tagpath) :
|
if name.endswith(tagpath) :
|
||||||
result = argres
|
result = argres
|
||||||
foundat = j
|
foundat = j
|
||||||
|
@ -103,9 +105,9 @@ class PParser(object):
|
||||||
cnt = len(self.flatdoc)
|
cnt = len(self.flatdoc)
|
||||||
for j in range(cnt):
|
for j in range(cnt):
|
||||||
item = self.flatdoc[j]
|
item = self.flatdoc[j]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argt) = item.split('=')
|
(name, argt) = item.split(b'=')
|
||||||
argres = argt.split('|')
|
argres = argt.split(b'|')
|
||||||
else:
|
else:
|
||||||
name = item
|
name = item
|
||||||
argres = []
|
argres = []
|
||||||
|
@ -120,15 +122,17 @@ class PParser(object):
|
||||||
def getDataatPos(self, path, pos):
|
def getDataatPos(self, path, pos):
|
||||||
result = None
|
result = None
|
||||||
item = self.flatdoc[pos]
|
item = self.flatdoc[pos]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argt) = item.split('=')
|
(name, argt) = item.split(b'=')
|
||||||
argres = argt.split('|')
|
argres = argt.split(b'|')
|
||||||
else:
|
else:
|
||||||
name = item
|
name = item
|
||||||
argres = []
|
argres = []
|
||||||
if (len(argres) > 0) :
|
if (len(argres) > 0) :
|
||||||
for j in range(0,len(argres)):
|
for j in range(0,len(argres)):
|
||||||
argres[j] = int(argres[j])
|
argres[j] = int(argres[j])
|
||||||
|
if (isinstance(path,str)):
|
||||||
|
path = path.encode('utf-8')
|
||||||
if (name.endswith(path)):
|
if (name.endswith(path)):
|
||||||
result = argres
|
result = argres
|
||||||
return result
|
return result
|
||||||
|
@ -138,12 +142,14 @@ class PParser(object):
|
||||||
cnt = len(self.temp)
|
cnt = len(self.temp)
|
||||||
for j in range(cnt):
|
for j in range(cnt):
|
||||||
item = self.temp[j]
|
item = self.temp[j]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argt) = item.split('=')
|
(name, argt) = item.split(b'=')
|
||||||
argres = argt.split('|')
|
argres = argt.split(b'|')
|
||||||
else:
|
else:
|
||||||
name = item
|
name = item
|
||||||
argres = []
|
argres = []
|
||||||
|
if (isinstance(path,str)):
|
||||||
|
path = path.encode('utf-8')
|
||||||
if (name.endswith(path)):
|
if (name.endswith(path)):
|
||||||
result = argres
|
result = argres
|
||||||
self.temp.pop(j)
|
self.temp.pop(j)
|
||||||
|
|
|
@ -44,10 +44,10 @@ if inCalibre :
|
||||||
from calibre_plugins.dedrm import flatxml2svg
|
from calibre_plugins.dedrm import flatxml2svg
|
||||||
from calibre_plugins.dedrm import stylexml2css
|
from calibre_plugins.dedrm import stylexml2css
|
||||||
else :
|
else :
|
||||||
from . import convert2xml
|
import convert2xml
|
||||||
from . import flatxml2html
|
import flatxml2html
|
||||||
from . import flatxml2svg
|
import flatxml2svg
|
||||||
from . import stylexml2css
|
import stylexml2css
|
||||||
|
|
||||||
# global switch
|
# global switch
|
||||||
buildXML = False
|
buildXML = False
|
||||||
|
@ -117,10 +117,10 @@ class Dictionary(object):
|
||||||
self.stable.append(self.escapestr(readString(self.fo)))
|
self.stable.append(self.escapestr(readString(self.fo)))
|
||||||
self.pos = 0
|
self.pos = 0
|
||||||
def escapestr(self, str):
|
def escapestr(self, str):
|
||||||
str = str.replace('&','&amp;')
|
str = str.replace(b'&',b'&amp;')
|
||||||
str = str.replace('<','&lt;')
|
str = str.replace(b'<',b'&lt;')
|
||||||
str = str.replace('>','&gt;')
|
str = str.replace(b'>',b'&gt;')
|
||||||
str = str.replace('=','&#61;')
|
str = str.replace(b'=',b'&#61;')
|
||||||
return str
|
return str
|
||||||
def lookup(self,val):
|
def lookup(self,val):
|
||||||
if ((val >= 0) and (val < self.size)) :
|
if ((val >= 0) and (val < self.size)) :
|
||||||
|
@ -138,7 +138,7 @@ class Dictionary(object):
|
||||||
|
|
||||||
class PageDimParser(object):
|
class PageDimParser(object):
|
||||||
def __init__(self, flatxml):
|
def __init__(self, flatxml):
|
||||||
self.flatdoc = flatxml.split('\n')
|
self.flatdoc = flatxml.split(b'\n')
|
||||||
# find tag if within pos to end inclusive
|
# find tag if within pos to end inclusive
|
||||||
def findinDoc(self, tagpath, pos, end) :
|
def findinDoc(self, tagpath, pos, end) :
|
||||||
result = None
|
result = None
|
||||||
|
@ -151,8 +151,8 @@ class PageDimParser(object):
|
||||||
foundat = -1
|
foundat = -1
|
||||||
for j in range(pos, end):
|
for j in range(pos, end):
|
||||||
item = docList[j]
|
item = docList[j]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argres) = item.split('=')
|
(name, argres) = item.split(b'=')
|
||||||
else :
|
else :
|
||||||
name = item
|
name = item
|
||||||
argres = ''
|
argres = ''
|
||||||
|
@ -162,8 +162,8 @@ class PageDimParser(object):
|
||||||
break
|
break
|
||||||
return foundat, result
|
return foundat, result
|
||||||
def process(self):
|
def process(self):
|
||||||
(pos, sph) = self.findinDoc('page.h',0,-1)
|
(pos, sph) = self.findinDoc(b'page.h',0,-1)
|
||||||
(pos, spw) = self.findinDoc('page.w',0,-1)
|
(pos, spw) = self.findinDoc(b'page.w',0,-1)
|
||||||
if (sph == None): sph = '-1'
|
if (sph == None): sph = '-1'
|
||||||
if (spw == None): spw = '-1'
|
if (spw == None): spw = '-1'
|
||||||
return sph, spw
|
return sph, spw
|
||||||
|
@ -176,21 +176,21 @@ def getPageDim(flatxml):
|
||||||
|
|
||||||
class GParser(object):
|
class GParser(object):
|
||||||
def __init__(self, flatxml):
|
def __init__(self, flatxml):
|
||||||
self.flatdoc = flatxml.split('\n')
|
self.flatdoc = flatxml.split(b'\n')
|
||||||
self.dpi = 1440
|
self.dpi = 1440
|
||||||
self.gh = self.getData('info.glyph.h')
|
self.gh = self.getData(b'info.glyph.h')
|
||||||
self.gw = self.getData('info.glyph.w')
|
self.gw = self.getData(b'info.glyph.w')
|
||||||
self.guse = self.getData('info.glyph.use')
|
self.guse = self.getData(b'info.glyph.use')
|
||||||
if self.guse :
|
if self.guse :
|
||||||
self.count = len(self.guse)
|
self.count = len(self.guse)
|
||||||
else :
|
else :
|
||||||
self.count = 0
|
self.count = 0
|
||||||
self.gvtx = self.getData('info.glyph.vtx')
|
self.gvtx = self.getData(b'info.glyph.vtx')
|
||||||
self.glen = self.getData('info.glyph.len')
|
self.glen = self.getData(b'info.glyph.len')
|
||||||
self.gdpi = self.getData('info.glyph.dpi')
|
self.gdpi = self.getData(b'info.glyph.dpi')
|
||||||
self.vx = self.getData('info.vtx.x')
|
self.vx = self.getData(b'info.vtx.x')
|
||||||
self.vy = self.getData('info.vtx.y')
|
self.vy = self.getData(b'info.vtx.y')
|
||||||
self.vlen = self.getData('info.len.n')
|
self.vlen = self.getData(b'info.len.n')
|
||||||
if self.vlen :
|
if self.vlen :
|
||||||
self.glen.append(len(self.vlen))
|
self.glen.append(len(self.vlen))
|
||||||
elif self.glen:
|
elif self.glen:
|
||||||
|
@ -204,9 +204,9 @@ class GParser(object):
|
||||||
cnt = len(self.flatdoc)
|
cnt = len(self.flatdoc)
|
||||||
for j in range(cnt):
|
for j in range(cnt):
|
||||||
item = self.flatdoc[j]
|
item = self.flatdoc[j]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argt) = item.split('=')
|
(name, argt) = item.split(b'=')
|
||||||
argres = argt.split('|')
|
argres = argt.split(b'|')
|
||||||
else:
|
else:
|
||||||
name = item
|
name = item
|
||||||
argres = []
|
argres = []
|
||||||
|
@ -431,7 +431,7 @@ def generateBook(bookDir, raw, fixedimage):
|
||||||
|
|
||||||
# now get the css info
|
# now get the css info
|
||||||
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
|
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
|
||||||
open(xname, 'wb').write(cssstr)
|
open(xname, 'w').write(cssstr)
|
||||||
if buildXML:
|
if buildXML:
|
||||||
xname = os.path.join(xmlDir, 'other0000.xml')
|
xname = os.path.join(xmlDir, 'other0000.xml')
|
||||||
open(xname, 'wb').write(convert2xml.getXML(dict, otherFile))
|
open(xname, 'wb').write(convert2xml.getXML(dict, otherFile))
|
||||||
|
@ -525,7 +525,7 @@ def generateBook(bookDir, raw, fixedimage):
|
||||||
hlst.append('</body>\n</html>\n')
|
hlst.append('</body>\n</html>\n')
|
||||||
htmlstr = "".join(hlst)
|
htmlstr = "".join(hlst)
|
||||||
hlst = None
|
hlst = None
|
||||||
open(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
|
open(os.path.join(bookDir, htmlFileName), 'w').write(htmlstr)
|
||||||
|
|
||||||
print(" ")
|
print(" ")
|
||||||
print('Extracting Table of Contents from Amazon OCR')
|
print('Extracting Table of Contents from Amazon OCR')
|
||||||
|
@ -571,7 +571,7 @@ def generateBook(bookDir, raw, fixedimage):
|
||||||
tlst.append('</body>\n')
|
tlst.append('</body>\n')
|
||||||
tlst.append('</html>\n')
|
tlst.append('</html>\n')
|
||||||
tochtml = "".join(tlst)
|
tochtml = "".join(tlst)
|
||||||
open(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
|
open(os.path.join(svgDir, 'toc.xhtml'), 'w').write(tochtml)
|
||||||
|
|
||||||
|
|
||||||
# now create index_svg.xhtml that points to all required files
|
# now create index_svg.xhtml that points to all required files
|
||||||
|
@ -608,7 +608,7 @@ def generateBook(bookDir, raw, fixedimage):
|
||||||
flst = []
|
flst = []
|
||||||
for page in pagelst:
|
for page in pagelst:
|
||||||
flst.append(xmllst[page])
|
flst.append(xmllst[page])
|
||||||
flat_svg = "".join(flst)
|
flat_svg = b"".join(flst)
|
||||||
flst=None
|
flst=None
|
||||||
svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
|
svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
|
||||||
if (raw) :
|
if (raw) :
|
||||||
|
@ -626,7 +626,7 @@ def generateBook(bookDir, raw, fixedimage):
|
||||||
slst.append('</body>\n</html>\n')
|
slst.append('</body>\n</html>\n')
|
||||||
svgindex = "".join(slst)
|
svgindex = "".join(slst)
|
||||||
slst = None
|
slst = None
|
||||||
open(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
|
open(os.path.join(bookDir, 'index_svg.xhtml'), 'w').write(svgindex)
|
||||||
|
|
||||||
print(" ")
|
print(" ")
|
||||||
|
|
||||||
|
@ -637,16 +637,16 @@ def generateBook(bookDir, raw, fixedimage):
|
||||||
olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n')
|
olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n')
|
||||||
# adding metadata
|
# adding metadata
|
||||||
olst.append(' <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n')
|
olst.append(' <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n')
|
||||||
if 'GUID' in meta_array:
|
if b'GUID' in meta_array:
|
||||||
olst.append(' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array['GUID'] + '</dc:identifier>\n')
|
olst.append(' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array[b'GUID'].decode('utf-8') + '</dc:identifier>\n')
|
||||||
if 'ASIN' in meta_array:
|
if b'ASIN' in meta_array:
|
||||||
olst.append(' <dc:identifier opf:scheme="ASIN">' + meta_array['ASIN'] + '</dc:identifier>\n')
|
olst.append(' <dc:identifier opf:scheme="ASIN">' + meta_array[b'ASIN'].decode('utf-8') + '</dc:identifier>\n')
|
||||||
if 'oASIN' in meta_array:
|
if b'oASIN' in meta_array:
|
||||||
olst.append(' <dc:identifier opf:scheme="oASIN">' + meta_array['oASIN'] + '</dc:identifier>\n')
|
olst.append(' <dc:identifier opf:scheme="oASIN">' + meta_array[b'oASIN'].decode('utf-8') + '</dc:identifier>\n')
|
||||||
olst.append(' <dc:title>' + meta_array['Title'] + '</dc:title>\n')
|
olst.append(' <dc:title>' + meta_array[b'Title'].decode('utf-8') + '</dc:title>\n')
|
||||||
olst.append(' <dc:creator opf:role="aut">' + meta_array['Authors'] + '</dc:creator>\n')
|
olst.append(' <dc:creator opf:role="aut">' + meta_array[b'Authors'].decode('utf-8') + '</dc:creator>\n')
|
||||||
olst.append(' <dc:language>en</dc:language>\n')
|
olst.append(' <dc:language>en</dc:language>\n')
|
||||||
olst.append(' <dc:date>' + meta_array['UpdateTime'] + '</dc:date>\n')
|
olst.append(' <dc:date>' + meta_array[b'UpdateTime'].decode('utf-8') + '</dc:date>\n')
|
||||||
if isCover:
|
if isCover:
|
||||||
olst.append(' <meta name="cover" content="bookcover"/>\n')
|
olst.append(' <meta name="cover" content="bookcover"/>\n')
|
||||||
olst.append(' </metadata>\n')
|
olst.append(' </metadata>\n')
|
||||||
|
@ -675,7 +675,7 @@ def generateBook(bookDir, raw, fixedimage):
|
||||||
olst.append('</package>\n')
|
olst.append('</package>\n')
|
||||||
opfstr = "".join(olst)
|
opfstr = "".join(olst)
|
||||||
olst = None
|
olst = None
|
||||||
open(opfname, 'wb').write(opfstr)
|
open(opfname, 'w').write(opfstr)
|
||||||
|
|
||||||
print('Processing Complete')
|
print('Processing Complete')
|
||||||
|
|
||||||
|
|
|
@ -49,14 +49,15 @@ def SHA1(message):
|
||||||
|
|
||||||
|
|
||||||
# Encode the bytes in data with the characters in map
|
# Encode the bytes in data with the characters in map
|
||||||
|
# data and map should be byte arrays
|
||||||
def encode(data, map):
|
def encode(data, map):
|
||||||
result = ''
|
result = b''
|
||||||
for char in data:
|
for char in data:
|
||||||
value = ord(char)
|
value = char
|
||||||
Q = (value ^ 0x80) // len(map)
|
Q = (value ^ 0x80) // len(map)
|
||||||
R = value % len(map)
|
R = value % len(map)
|
||||||
result += map[Q]
|
result += bytes([map[Q]])
|
||||||
result += map[R]
|
result += bytes([map[R]])
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# Hash the bytes in data and then encode the digest with the characters in map
|
# Hash the bytes in data and then encode the digest with the characters in map
|
||||||
|
@ -117,7 +118,7 @@ def generatePidEncryptionTable() :
|
||||||
def generatePidSeed(table,dsn) :
|
def generatePidSeed(table,dsn) :
|
||||||
value = 0
|
value = 0
|
||||||
for counter in range (0,4) :
|
for counter in range (0,4) :
|
||||||
index = (ord(dsn[counter]) ^ value) &0xFF
|
index = (dsn[counter] ^ value) & 0xFF
|
||||||
value = (value >> 8) ^ table[index]
|
value = (value >> 8) ^ table[index]
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
@ -129,7 +130,7 @@ def generateDevicePID(table,dsn,nbRoll):
|
||||||
pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
|
pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
|
||||||
index = 0
|
index = 0
|
||||||
for counter in range (0,nbRoll):
|
for counter in range (0,nbRoll):
|
||||||
pid[index] = pid[index] ^ ord(dsn[counter])
|
pid[index] = pid[index] ^ dsn[counter]
|
||||||
index = (index+1) %8
|
index = (index+1) %8
|
||||||
for counter in range (0,8):
|
for counter in range (0,8):
|
||||||
index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
|
index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
|
||||||
|
@ -205,7 +206,7 @@ def getK4Pids(rec209, token, kindleDatabase):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get the kindle account token, if present
|
# Get the kindle account token, if present
|
||||||
kindleAccountToken = bytearray.fromhex((kindleDatabase[1])[b'kindle.account.tokens']).decode()
|
kindleAccountToken = bytearray.fromhex((kindleDatabase[1])['kindle.account.tokens'])
|
||||||
|
|
||||||
except KeyError:
|
except KeyError:
|
||||||
kindleAccountToken=""
|
kindleAccountToken=""
|
||||||
|
@ -213,30 +214,30 @@ def getK4Pids(rec209, token, kindleDatabase):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get the DSN token, if present
|
# Get the DSN token, if present
|
||||||
DSN = bytearray.fromhex((kindleDatabase[1])['DSN']).decode()
|
DSN = bytearray.fromhex((kindleDatabase[1])['DSN'])
|
||||||
print("Got DSN key from database {0}".format(kindleDatabase[0]))
|
print("Got DSN key from database {0}".format(kindleDatabase[0]))
|
||||||
except KeyError:
|
except KeyError:
|
||||||
# See if we have the info to generate the DSN
|
# See if we have the info to generate the DSN
|
||||||
try:
|
try:
|
||||||
# Get the Mazama Random number
|
# Get the Mazama Random number
|
||||||
MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])[b'MazamaRandomNumber']).decode()
|
MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])['MazamaRandomNumber'])
|
||||||
#print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0])
|
#print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get the SerialNumber token, if present
|
# Get the SerialNumber token, if present
|
||||||
IDString = bytearray.fromhex((kindleDatabase[1])[b'SerialNumber']).decode()
|
IDString = bytearray.fromhex((kindleDatabase[1])['SerialNumber'])
|
||||||
print("Got SerialNumber from database {0}".format(kindleDatabase[0]))
|
print("Got SerialNumber from database {0}".format(kindleDatabase[0]))
|
||||||
except KeyError:
|
except KeyError:
|
||||||
# Get the IDString we added
|
# Get the IDString we added
|
||||||
IDString = bytearray.fromhex((kindleDatabase[1])[b'IDString']).decode()
|
IDString = bytearray.fromhex((kindleDatabase[1])['IDString'])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get the UsernameHash token, if present
|
# Get the UsernameHash token, if present
|
||||||
encodedUsername = bytearray.fromhex((kindleDatabase[1])[b'UsernameHash']).decode()
|
encodedUsername = bytearray.fromhex((kindleDatabase[1])['UsernameHash'])
|
||||||
print("Got UsernameHash from database {0}".format(kindleDatabase[0]))
|
print("Got UsernameHash from database {0}".format(kindleDatabase[0]))
|
||||||
except KeyError:
|
except KeyError:
|
||||||
# Get the UserName we added
|
# Get the UserName we added
|
||||||
UserName = bytearray.fromhex((kindleDatabase[1])[b'UserName']).decode()
|
UserName = bytearray.fromhex((kindleDatabase[1])['UserName'])
|
||||||
# encode it
|
# encode it
|
||||||
encodedUsername = encodeHash(UserName,charMap1)
|
encodedUsername = encodeHash(UserName,charMap1)
|
||||||
#print "encodedUsername",encodedUsername.encode('hex')
|
#print "encodedUsername",encodedUsername.encode('hex')
|
||||||
|
@ -266,19 +267,19 @@ def getK4Pids(rec209, token, kindleDatabase):
|
||||||
# Compute book PIDs
|
# Compute book PIDs
|
||||||
|
|
||||||
# book pid
|
# book pid
|
||||||
pidHash = SHA1(DSN.encode()+kindleAccountToken.encode()+rec209+token)
|
pidHash = SHA1(DSN+kindleAccountToken+rec209+token)
|
||||||
bookPID = encodePID(pidHash)
|
bookPID = encodePID(pidHash)
|
||||||
bookPID = checksumPid(bookPID)
|
bookPID = checksumPid(bookPID)
|
||||||
pids.append(bookPID)
|
pids.append(bookPID)
|
||||||
|
|
||||||
# variant 1
|
# variant 1
|
||||||
pidHash = SHA1(kindleAccountToken.encode()+rec209+token)
|
pidHash = SHA1(kindleAccountToken+rec209+token)
|
||||||
bookPID = encodePID(pidHash)
|
bookPID = encodePID(pidHash)
|
||||||
bookPID = checksumPid(bookPID)
|
bookPID = checksumPid(bookPID)
|
||||||
pids.append(bookPID)
|
pids.append(bookPID)
|
||||||
|
|
||||||
# variant 2
|
# variant 2
|
||||||
pidHash = SHA1(DSN.encode()+rec209+token)
|
pidHash = SHA1(DSN+rec209+token)
|
||||||
bookPID = encodePID(pidHash)
|
bookPID = encodePID(pidHash)
|
||||||
bookPID = checksumPid(bookPID)
|
bookPID = checksumPid(bookPID)
|
||||||
pids.append(bookPID)
|
pids.append(bookPID)
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
|
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__version__ = "1.00"
|
__version__ = "1.0"
|
||||||
|
|
||||||
# This is a python script. You need a Python interpreter to run it.
|
# This is a python script. You need a Python interpreter to run it.
|
||||||
# For example, ActiveState Python, which exists for windows.
|
# For example, ActiveState Python, which exists for windows.
|
||||||
|
@ -73,7 +73,7 @@ __version__ = "1.00"
|
||||||
# 0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility
|
# 0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility
|
||||||
# 0.41 - Fixed potential unicode problem in command line calls
|
# 0.41 - Fixed potential unicode problem in command line calls
|
||||||
# 0.42 - Added GPL v3 licence. updated/removed some print statements
|
# 0.42 - Added GPL v3 licence. updated/removed some print statements
|
||||||
# 1.00 - Python 3 compatibility for calibre 5.0
|
# 1.0 - Python 3 compatibility for calibre 5.0
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
@ -330,7 +330,7 @@ class MobiBook:
|
||||||
}
|
}
|
||||||
title = ''
|
title = ''
|
||||||
codec = 'windows-1252'
|
codec = 'windows-1252'
|
||||||
if self.magic == 'BOOKMOBI':
|
if self.magic == b'BOOKMOBI':
|
||||||
if 503 in self.meta_array:
|
if 503 in self.meta_array:
|
||||||
title = self.meta_array[503]
|
title = self.meta_array[503]
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -15,36 +15,36 @@ debug = False
|
||||||
|
|
||||||
class DocParser(object):
|
class DocParser(object):
|
||||||
def __init__(self, flatxml, fontsize, ph, pw):
|
def __init__(self, flatxml, fontsize, ph, pw):
|
||||||
self.flatdoc = flatxml.split('\n')
|
self.flatdoc = flatxml.split(b'\n')
|
||||||
self.fontsize = int(fontsize)
|
self.fontsize = int(fontsize)
|
||||||
self.ph = int(ph) * 1.0
|
self.ph = int(ph) * 1.0
|
||||||
self.pw = int(pw) * 1.0
|
self.pw = int(pw) * 1.0
|
||||||
|
|
||||||
stags = {
|
stags = {
|
||||||
'paragraph' : 'p',
|
b'paragraph' : 'p',
|
||||||
'graphic' : '.graphic'
|
b'graphic' : '.graphic'
|
||||||
}
|
}
|
||||||
|
|
||||||
attr_val_map = {
|
attr_val_map = {
|
||||||
'hang' : 'text-indent: ',
|
b'hang' : 'text-indent: ',
|
||||||
'indent' : 'text-indent: ',
|
b'indent' : 'text-indent: ',
|
||||||
'line-space' : 'line-height: ',
|
b'line-space' : 'line-height: ',
|
||||||
'margin-bottom' : 'margin-bottom: ',
|
b'margin-bottom' : 'margin-bottom: ',
|
||||||
'margin-left' : 'margin-left: ',
|
b'margin-left' : 'margin-left: ',
|
||||||
'margin-right' : 'margin-right: ',
|
b'margin-right' : 'margin-right: ',
|
||||||
'margin-top' : 'margin-top: ',
|
b'margin-top' : 'margin-top: ',
|
||||||
'space-after' : 'padding-bottom: ',
|
b'space-after' : 'padding-bottom: ',
|
||||||
}
|
}
|
||||||
|
|
||||||
attr_str_map = {
|
attr_str_map = {
|
||||||
'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
|
b'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
|
||||||
'align-left' : 'text-align: left;',
|
b'align-left' : 'text-align: left;',
|
||||||
'align-right' : 'text-align: right;',
|
b'align-right' : 'text-align: right;',
|
||||||
'align-justify' : 'text-align: justify;',
|
b'align-justify' : 'text-align: justify;',
|
||||||
'display-inline' : 'display: inline;',
|
b'display-inline' : 'display: inline;',
|
||||||
'pos-left' : 'text-align: left;',
|
b'pos-left' : 'text-align: left;',
|
||||||
'pos-right' : 'text-align: right;',
|
b'pos-right' : 'text-align: right;',
|
||||||
'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
|
b'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -60,11 +60,13 @@ class DocParser(object):
|
||||||
foundat = -1
|
foundat = -1
|
||||||
for j in range(pos, end):
|
for j in range(pos, end):
|
||||||
item = docList[j]
|
item = docList[j]
|
||||||
if item.find('=') >= 0:
|
if item.find(b'=') >= 0:
|
||||||
(name, argres) = item.split('=',1)
|
(name, argres) = item.split(b'=',1)
|
||||||
else :
|
else :
|
||||||
name = item
|
name = item
|
||||||
argres = ''
|
argres = b''
|
||||||
|
if (isinstance(tagpath,str)):
|
||||||
|
tagpath = tagpath.encode('utf-8')
|
||||||
if name.endswith(tagpath) :
|
if name.endswith(tagpath) :
|
||||||
result = argres
|
result = argres
|
||||||
foundat = j
|
foundat = j
|
||||||
|
@ -76,7 +78,7 @@ class DocParser(object):
|
||||||
def posinDoc(self, tagpath):
|
def posinDoc(self, tagpath):
|
||||||
startpos = []
|
startpos = []
|
||||||
pos = 0
|
pos = 0
|
||||||
res = ""
|
res = b""
|
||||||
while res != None :
|
while res != None :
|
||||||
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
|
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
|
||||||
if res != None :
|
if res != None :
|
||||||
|
@ -87,11 +89,11 @@ class DocParser(object):
|
||||||
# returns a vector of integers for the tagpath
|
# returns a vector of integers for the tagpath
|
||||||
def getData(self, tagpath, pos, end, clean=False):
|
def getData(self, tagpath, pos, end, clean=False):
|
||||||
if clean:
|
if clean:
|
||||||
digits_only = re.compile(r'''([0-9]+)''')
|
digits_only = re.compile(rb'''([0-9]+)''')
|
||||||
argres=[]
|
argres=[]
|
||||||
(foundat, argt) = self.findinDoc(tagpath, pos, end)
|
(foundat, argt) = self.findinDoc(tagpath, pos, end)
|
||||||
if (argt != None) and (len(argt) > 0) :
|
if (argt != None) and (len(argt) > 0) :
|
||||||
argList = argt.split('|')
|
argList = argt.split(b'|')
|
||||||
for strval in argList:
|
for strval in argList:
|
||||||
if clean:
|
if clean:
|
||||||
m = re.search(digits_only, strval)
|
m = re.search(digits_only, strval)
|
||||||
|
@ -109,7 +111,7 @@ class DocParser(object):
|
||||||
csspage += '.cl-justify { text-align: justify; }\n'
|
csspage += '.cl-justify { text-align: justify; }\n'
|
||||||
|
|
||||||
# generate a list of each <style> starting point in the stylesheet
|
# generate a list of each <style> starting point in the stylesheet
|
||||||
styleList= self.posinDoc('book.stylesheet.style')
|
styleList= self.posinDoc(b'book.stylesheet.style')
|
||||||
stylecnt = len(styleList)
|
stylecnt = len(styleList)
|
||||||
styleList.append(-1)
|
styleList.append(-1)
|
||||||
|
|
||||||
|
@ -121,30 +123,30 @@ class DocParser(object):
|
||||||
start = styleList[j]
|
start = styleList[j]
|
||||||
end = styleList[j+1]
|
end = styleList[j+1]
|
||||||
|
|
||||||
(pos, tag) = self.findinDoc('style._tag',start,end)
|
(pos, tag) = self.findinDoc(b'style._tag',start,end)
|
||||||
if tag == None :
|
if tag == None :
|
||||||
(pos, tag) = self.findinDoc('style.type',start,end)
|
(pos, tag) = self.findinDoc(b'style.type',start,end)
|
||||||
|
|
||||||
# Is this something we know how to convert to css
|
# Is this something we know how to convert to css
|
||||||
if tag in self.stags :
|
if tag in self.stags :
|
||||||
|
|
||||||
# get the style class
|
# get the style class
|
||||||
(pos, sclass) = self.findinDoc('style.class',start,end)
|
(pos, sclass) = self.findinDoc(b'style.class',start,end)
|
||||||
if sclass != None:
|
if sclass != None:
|
||||||
sclass = sclass.replace(' ','-')
|
sclass = sclass.replace(b' ',b'-')
|
||||||
sclass = '.cl-' + sclass.lower()
|
sclass = b'.cl-' + sclass.lower()
|
||||||
else :
|
else :
|
||||||
sclass = ''
|
sclass = b''
|
||||||
|
|
||||||
if debug: print('sclass', sclass)
|
if debug: print('sclass', sclass)
|
||||||
|
|
||||||
# check for any "after class" specifiers
|
# check for any "after class" specifiers
|
||||||
(pos, aftclass) = self.findinDoc('style._after_class',start,end)
|
(pos, aftclass) = self.findinDoc(b'style._after_class',start,end)
|
||||||
if aftclass != None:
|
if aftclass != None:
|
||||||
aftclass = aftclass.replace(' ','-')
|
aftclass = aftclass.replace(b' ',b'-')
|
||||||
aftclass = '.cl-' + aftclass.lower()
|
aftclass = b'.cl-' + aftclass.lower()
|
||||||
else :
|
else :
|
||||||
aftclass = ''
|
aftclass = b''
|
||||||
|
|
||||||
if debug: print('aftclass', aftclass)
|
if debug: print('aftclass', aftclass)
|
||||||
|
|
||||||
|
@ -152,34 +154,37 @@ class DocParser(object):
|
||||||
|
|
||||||
while True :
|
while True :
|
||||||
|
|
||||||
(pos1, attr) = self.findinDoc('style.rule.attr', start, end)
|
(pos1, attr) = self.findinDoc(b'style.rule.attr', start, end)
|
||||||
(pos2, val) = self.findinDoc('style.rule.value', start, end)
|
(pos2, val) = self.findinDoc(b'style.rule.value', start, end)
|
||||||
|
|
||||||
if debug: print('attr', attr)
|
if debug: print('attr', attr)
|
||||||
if debug: print('val', val)
|
if debug: print('val', val)
|
||||||
|
|
||||||
if attr == None : break
|
if attr == None : break
|
||||||
|
|
||||||
if (attr == 'display') or (attr == 'pos') or (attr == 'align'):
|
if (attr == b'display') or (attr == b'pos') or (attr == b'align'):
|
||||||
# handle text based attributess
|
# handle text based attributess
|
||||||
attr = attr + '-' + val
|
attr = attr + b'-' + val
|
||||||
if attr in self.attr_str_map :
|
if attr in self.attr_str_map :
|
||||||
cssargs[attr] = (self.attr_str_map[attr], '')
|
cssargs[attr] = (self.attr_str_map[attr], b'')
|
||||||
else :
|
else :
|
||||||
# handle value based attributes
|
# handle value based attributes
|
||||||
if attr in self.attr_val_map :
|
if attr in self.attr_val_map :
|
||||||
name = self.attr_val_map[attr]
|
name = self.attr_val_map[attr]
|
||||||
if attr in ('margin-bottom', 'margin-top', 'space-after') :
|
if attr in (b'margin-bottom', b'margin-top', b'space-after') :
|
||||||
scale = self.ph
|
scale = self.ph
|
||||||
elif attr in ('margin-right', 'indent', 'margin-left', 'hang') :
|
elif attr in (b'margin-right', b'indent', b'margin-left', b'hang') :
|
||||||
scale = self.pw
|
scale = self.pw
|
||||||
elif attr == 'line-space':
|
elif attr == b'line-space':
|
||||||
scale = self.fontsize * 2.0
|
scale = self.fontsize * 2.0
|
||||||
|
else:
|
||||||
|
print("Scale not defined!")
|
||||||
|
scale = 1.0
|
||||||
|
|
||||||
if val == "":
|
if val == "":
|
||||||
val = 0
|
val = 0
|
||||||
|
|
||||||
if not ((attr == 'hang') and (int(val) == 0)):
|
if not ((attr == b'hang') and (int(val) == 0)):
|
||||||
try:
|
try:
|
||||||
f = float(val)
|
f = float(val)
|
||||||
except:
|
except:
|
||||||
|
@ -198,32 +203,32 @@ class DocParser(object):
|
||||||
if debug: print('keeping style')
|
if debug: print('keeping style')
|
||||||
# make sure line-space does not go below 100% or above 300% since
|
# make sure line-space does not go below 100% or above 300% since
|
||||||
# it can be wacky in some styles
|
# it can be wacky in some styles
|
||||||
if 'line-space' in cssargs:
|
if b'line-space' in cssargs:
|
||||||
seg = cssargs['line-space'][0]
|
seg = cssargs[b'line-space'][0]
|
||||||
val = cssargs['line-space'][1]
|
val = cssargs[b'line-space'][1]
|
||||||
if val < 1.0: val = 1.0
|
if val < 1.0: val = 1.0
|
||||||
if val > 3.0: val = 3.0
|
if val > 3.0: val = 3.0
|
||||||
del cssargs['line-space']
|
del cssargs[b'line-space']
|
||||||
cssargs['line-space'] = (self.attr_val_map['line-space'], val)
|
cssargs[b'line-space'] = (self.attr_val_map[b'line-space'], val)
|
||||||
|
|
||||||
|
|
||||||
# handle modifications for css style hanging indents
|
# handle modifications for css style hanging indents
|
||||||
if 'hang' in cssargs:
|
if b'hang' in cssargs:
|
||||||
hseg = cssargs['hang'][0]
|
hseg = cssargs[b'hang'][0]
|
||||||
hval = cssargs['hang'][1]
|
hval = cssargs[b'hang'][1]
|
||||||
del cssargs['hang']
|
del cssargs[b'hang']
|
||||||
cssargs['hang'] = (self.attr_val_map['hang'], -hval)
|
cssargs[b'hang'] = (self.attr_val_map[b'hang'], -hval)
|
||||||
mval = 0
|
mval = 0
|
||||||
mseg = 'margin-left: '
|
mseg = 'margin-left: '
|
||||||
mval = hval
|
mval = hval
|
||||||
if 'margin-left' in cssargs:
|
if b'margin-left' in cssargs:
|
||||||
mseg = cssargs['margin-left'][0]
|
mseg = cssargs[b'margin-left'][0]
|
||||||
mval = cssargs['margin-left'][1]
|
mval = cssargs[b'margin-left'][1]
|
||||||
if mval < 0: mval = 0
|
if mval < 0: mval = 0
|
||||||
mval = hval + mval
|
mval = hval + mval
|
||||||
cssargs['margin-left'] = (mseg, mval)
|
cssargs[b'margin-left'] = (mseg, mval)
|
||||||
if 'indent' in cssargs:
|
if b'indent' in cssargs:
|
||||||
del cssargs['indent']
|
del cssargs[b'indent']
|
||||||
|
|
||||||
cssline = sclass + ' { '
|
cssline = sclass + ' { '
|
||||||
for key in iter(cssargs):
|
for key in iter(cssargs):
|
||||||
|
|
|
@ -173,7 +173,7 @@ def decryptRecord(data,PID):
|
||||||
def decryptDkeyRecord(data,PID):
|
def decryptDkeyRecord(data,PID):
|
||||||
record = decryptRecord(data,PID)
|
record = decryptRecord(data,PID)
|
||||||
fields = unpack('3sB8sB8s3s',record)
|
fields = unpack('3sB8sB8s3s',record)
|
||||||
if fields[0] != 'PID' or fields[5] != 'pid' :
|
if fields[0] != b'PID' or fields[5] != b'pid' :
|
||||||
raise DrmException("Didn't find PID magic numbers in record")
|
raise DrmException("Didn't find PID magic numbers in record")
|
||||||
elif fields[1] != 8 or fields[3] != 8 :
|
elif fields[1] != 8 or fields[3] != 8 :
|
||||||
raise DrmException("Record didn't contain correct length fields")
|
raise DrmException("Record didn't contain correct length fields")
|
||||||
|
@ -183,11 +183,11 @@ def decryptDkeyRecord(data,PID):
|
||||||
|
|
||||||
# Decrypt all dkey records (contain the book PID)
|
# Decrypt all dkey records (contain the book PID)
|
||||||
def decryptDkeyRecords(data,PID):
|
def decryptDkeyRecords(data,PID):
|
||||||
nbKeyRecords = ord(data[0])
|
nbKeyRecords = data[0]
|
||||||
records = []
|
records = []
|
||||||
data = data[1:]
|
data = data[1:]
|
||||||
for i in range (0,nbKeyRecords):
|
for i in range (0,nbKeyRecords):
|
||||||
length = ord(data[0])
|
length = data[0]
|
||||||
try:
|
try:
|
||||||
key = decryptDkeyRecord(data[1:length+1],PID)
|
key = decryptDkeyRecord(data[1:length+1],PID)
|
||||||
records.append(key)
|
records.append(key)
|
||||||
|
@ -209,7 +209,7 @@ class TopazBook:
|
||||||
self.bookMetadata = {}
|
self.bookMetadata = {}
|
||||||
self.bookKey = None
|
self.bookKey = None
|
||||||
magic = unpack('4s',self.fo.read(4))[0]
|
magic = unpack('4s',self.fo.read(4))[0]
|
||||||
if magic != 'TPZ0':
|
if magic != b'TPZ0':
|
||||||
raise DrmException("Parse Error : Invalid Header, not a Topaz file")
|
raise DrmException("Parse Error : Invalid Header, not a Topaz file")
|
||||||
self.parseTopazHeaders()
|
self.parseTopazHeaders()
|
||||||
self.parseMetadata()
|
self.parseMetadata()
|
||||||
|
@ -244,9 +244,9 @@ class TopazBook:
|
||||||
|
|
||||||
def parseMetadata(self):
|
def parseMetadata(self):
|
||||||
# Parse the metadata record from the book payload and return a list of [key,values]
|
# Parse the metadata record from the book payload and return a list of [key,values]
|
||||||
self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords['metadata'][0][0])
|
self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords[b'metadata'][0][0])
|
||||||
tag = bookReadString(self.fo)
|
tag = bookReadString(self.fo)
|
||||||
if tag != 'metadata' :
|
if tag != b'metadata' :
|
||||||
raise DrmException("Parse Error : Record Names Don't Match")
|
raise DrmException("Parse Error : Record Names Don't Match")
|
||||||
flags = ord(self.fo.read(1))
|
flags = ord(self.fo.read(1))
|
||||||
nbRecords = ord(self.fo.read(1))
|
nbRecords = ord(self.fo.read(1))
|
||||||
|
@ -260,18 +260,18 @@ class TopazBook:
|
||||||
return self.bookMetadata
|
return self.bookMetadata
|
||||||
|
|
||||||
def getPIDMetaInfo(self):
|
def getPIDMetaInfo(self):
|
||||||
keysRecord = self.bookMetadata.get('keys','')
|
keysRecord = self.bookMetadata.get(b'keys',b'')
|
||||||
keysRecordRecord = ''
|
keysRecordRecord = b''
|
||||||
if keysRecord != '':
|
if keysRecord != b'':
|
||||||
keylst = keysRecord.split(',')
|
keylst = keysRecord.split(b',')
|
||||||
for keyval in keylst:
|
for keyval in keylst:
|
||||||
keysRecordRecord += self.bookMetadata.get(keyval,'')
|
keysRecordRecord += self.bookMetadata.get(keyval,b'')
|
||||||
return keysRecord, keysRecordRecord
|
return keysRecord, keysRecordRecord
|
||||||
|
|
||||||
def getBookTitle(self):
|
def getBookTitle(self):
|
||||||
title = ''
|
title = b''
|
||||||
if 'Title' in self.bookMetadata:
|
if b'Title' in self.bookMetadata:
|
||||||
title = self.bookMetadata['Title']
|
title = self.bookMetadata[b'Title']
|
||||||
return title.decode('utf-8')
|
return title.decode('utf-8')
|
||||||
|
|
||||||
def setBookKey(self, key):
|
def setBookKey(self, key):
|
||||||
|
@ -323,7 +323,7 @@ class TopazBook:
|
||||||
raw = 0
|
raw = 0
|
||||||
fixedimage=True
|
fixedimage=True
|
||||||
try:
|
try:
|
||||||
keydata = self.getBookPayloadRecord('dkey', 0)
|
keydata = self.getBookPayloadRecord(b'dkey', 0)
|
||||||
except DrmException as e:
|
except DrmException as e:
|
||||||
print("no dkey record found, book may not be encrypted")
|
print("no dkey record found, book may not be encrypted")
|
||||||
print("attempting to extrct files without a book key")
|
print("attempting to extrct files without a book key")
|
||||||
|
@ -354,7 +354,7 @@ class TopazBook:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
bookKey = bookKeys[0]
|
bookKey = bookKeys[0]
|
||||||
print("Book Key Found! ({0})".format(bookKey.encode('hex')))
|
print("Book Key Found! ({0})".format(bookKey.hex()))
|
||||||
break
|
break
|
||||||
|
|
||||||
if not bookKey:
|
if not bookKey:
|
||||||
|
@ -396,26 +396,26 @@ class TopazBook:
|
||||||
outdir = self.outdir
|
outdir = self.outdir
|
||||||
for headerRecord in self.bookHeaderRecords:
|
for headerRecord in self.bookHeaderRecords:
|
||||||
name = headerRecord
|
name = headerRecord
|
||||||
if name != 'dkey':
|
if name != b'dkey':
|
||||||
ext = ".dat"
|
ext = ".dat"
|
||||||
if name == 'img': ext = ".jpg"
|
if name == b'img': ext = ".jpg"
|
||||||
if name == 'color' : ext = ".jpg"
|
if name == b'color' : ext = ".jpg"
|
||||||
print("Processing Section: {0}\n. . .".format(name), end=' ')
|
print("Processing Section: {0}\n. . .".format(name.decode('utf-8')), end=' ')
|
||||||
for index in range (0,len(self.bookHeaderRecords[name])) :
|
for index in range (0,len(self.bookHeaderRecords[name])) :
|
||||||
fname = "{0}{1:04d}{2}".format(name,index,ext)
|
fname = "{0}{1:04d}{2}".format(name.decode('utf-8'),index,ext)
|
||||||
destdir = outdir
|
destdir = outdir
|
||||||
if name == 'img':
|
if name == b'img':
|
||||||
destdir = os.path.join(outdir,"img")
|
destdir = os.path.join(outdir,"img")
|
||||||
if name == 'color':
|
if name == b'color':
|
||||||
destdir = os.path.join(outdir,"color_img")
|
destdir = os.path.join(outdir,"color_img")
|
||||||
if name == 'page':
|
if name == b'page':
|
||||||
destdir = os.path.join(outdir,"page")
|
destdir = os.path.join(outdir,"page")
|
||||||
if name == 'glyphs':
|
if name == b'glyphs':
|
||||||
destdir = os.path.join(outdir,"glyphs")
|
destdir = os.path.join(outdir,"glyphs")
|
||||||
outputFile = os.path.join(destdir,fname)
|
outputFile = os.path.join(destdir,fname)
|
||||||
print(".", end=' ')
|
print(".", end=' ')
|
||||||
record = self.getBookPayloadRecord(name,index)
|
record = self.getBookPayloadRecord(name,index)
|
||||||
if record != '':
|
if record != b'':
|
||||||
open(outputFile, 'wb').write(record)
|
open(outputFile, 'wb').write(record)
|
||||||
print(" ")
|
print(" ")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue