#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import with_statement
import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack

# Python 3 has no xrange; alias it so the lazy ranges below run on both 2 and 3.
try:
    xrange
except NameError:
    xrange = range


class DocParser(object):
    """Parser for one flattened xml page description.

    The document is held as a list of text lines, each either ``name`` or
    ``name=value``, where ``name`` is a dotted tag path.
    """

    def __init__(self, flatxml, classlst, fileid):
        """Split the inputs into lines and record the known css class names.

        flatxml  -- the flattened xml text, one ``name=value`` item per line
        classlst -- css class names, one per line, each with a leading period
        fileid   -- path of the source file; its basename minus '.dat' is the id
        """
        self.id = os.path.basename(fileid).replace('.dat','')
        self.docList = flatxml.split('\n')
        self.docSize = len(self.docList)
        self.classList = {}
        for pclass in classlst.split('\n'):
            if pclass != '':
                # remove the leading period from the css name
                cname = pclass[1:]
                self.classList[cname] = True
        # per-document tables; nothing in this part of the file fills them in
        self.ocrtext = []
        self.link_id = []
        self.link_title = []
        self.link_page = []
        self.dehyphen_rootid = []
        self.paracont_stemid = []
        self.parastems_stemid = []

    # return tag at line pos in document
    def lineinDoc(self, pos) :
        """Return (name, value) for the document line at index pos.

        value is '' when the line has no '='.  A pos outside the document
        returns ('', '') (previously this raised UnboundLocalError).
        """
        if (pos < 0) or (pos >= self.docSize) :
            return '', ''
        (name, _, argres) = self.docList[pos].partition('=')
        return name, argres

    # find tag in doc within pos (inclusive) to end (exclusive)
    def findinDoc(self, tagpath, pos, end) :
        """Find the first line in [pos, end) whose name ends with tagpath.

        An end of -1 means "to the end of the document"; any other end is
        clamped to the document size.  Returns (line index, value), or
        (-1, None) when no line matches.
        """
        if end == -1 :
            end = self.docSize
        else:
            end = min(self.docSize, end)
        for j in xrange(pos, end):
            (name, _, argres) = self.docList[j].partition('=')
            if name.endswith(tagpath) :
                return j, argres
        return -1, None

    # return list of start positions for the tagpath
    def posinDoc(self, tagpath):
        """Return the index of every line whose name ends with tagpath."""
        startpos = []
        pos = 0
        while True :
            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
            if res is None :
                break
            startpos.append(foundpos)
            pos = foundpos + 1
        return startpos

    # turn a raw paragraph class name into the css class string to emit
    def _cssClassFor(self, pclass):
        # class names are an issue given topaz may start them with numerals
        # (not allowed), use a mix of cases (which causes some browsers
        # problems), and attach numbers after "_reclustered*" to the end to
        # deal with classes that inherit from a base class (but then do not
        # actually provide all of these _reclustered classes in the
        # stylesheet!).  So we clean this up by lowercasing, prepending 'cl-',
        # and emitting any base class that exists in the stylesheet first,
        # followed by this specific class.
        pclass = 'cl-' + pclass.lower()
        p = pclass.find('_')
        if p > 0 :
            baseclass = pclass[0:p]
            if baseclass in self.classList:
                return baseclass + ' ' + pclass
        return pclass

    # expand a firstWord/lastWord pair of lines into ('ocr', n) entries
    def _pairedWords(self, line, sfirst, lasttag, label):
        # sfirst is the firstWord value read from `line`; the matching
        # lastWord is expected on the following line
        first = int(sfirst)
        (name, slast) = self.lineinDoc(line + 1)
        if name.endswith(lasttag):
            last = int(slast)
        else:
            print('Error: - incorrect %s ordering inside paragraph' % label)
            try:
                last = int(slast)
            except ValueError:
                # no usable end value follows (e.g. the document ended):
                # contribute no words instead of crashing
                return []
        return [('ocr', wordnum) for wordnum in xrange(first, last)]

    # build a description of the paragraph
    def getParaDescription(self, start, end):
        """Describe the paragraph found on lines start..end of the document.

        An end of -1 means "to the end of the document".  Returns
        (pclass, result): pclass is the css class string (or None when no
        paragraph.class line is found) and result is a list of (kind, number)
        tuples, where kind is 'ocr', 'img' or 'imgsa'.
        """
        result = []

        # paragraph
        (_, pclass) = self.findinDoc('paragraph.class',start,end)
        if pclass is not None :
            pclass = self._cssClassFor(pclass)

        # build up a description of the paragraph in result and return it
        # first check for the basic - all words paragraph
        (_, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
        (_, slast) = self.findinDoc('paragraph.lastWord',start,end)
        if (sfirst is not None) and (slast is not None) :
            for wordnum in xrange(int(sfirst), int(slast)):
                result.append(('ocr', wordnum))
            return pclass, result

        # this type of paragraph may be made up of multiple _spans, inline
        # word monograms (images) and words with semantic meaning
        # and now a new type "span" versus the old "_span"
        # need to parse this type line by line
        line = start + 1
        word_class = ''

        # if end is -1 then we must search to end of document; otherwise
        # clamp to the document size, as findinDoc does
        if end == -1 :
            end = self.docSize
        else :
            end = min(self.docSize, end)

        while (line < end) :

            (name, argres) = self.lineinDoc(line)

            if name.endswith('span.firstWord') :
                result.extend(self._pairedWords(line, argres, 'span.lastWord', '_span'))
                # the lastWord line has been consumed as well
                line += 1

            elif name.endswith('word.class'):
                # only the "spaceafter-N" word class is handled; a value
                # without '-' or with a non-numeric N is ignored (the old
                # tuple unpacking of split('-',1) raised ValueError here)
                (cname, _, space) = argres.partition('-')
                if cname == 'spaceafter' :
                    try:
                        gap = int(space or '0')
                    except ValueError:
                        gap = 0
                    if gap > 0 :
                        word_class = 'sa'

            elif name.endswith('word.img.src'):
                result.append(('img' + word_class, int(argres)))
                word_class = ''

            elif name.endswith('word_semantic.firstWord'):
                result.extend(self._pairedWords(line, argres, 'word_semantic.lastWord', 'word_semantic'))
                # the lastWord line has been consumed as well
                line += 1

            line += 1

        return pclass, result


    def buildParagraph(self, pclass, pdesc, type, regtype) :
        parares = ''
        sep =''

        classres = ''
        if pclass :
            classres = ' class="' + pclass + '"'

        br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')

        handle_links = len(self.link_id) > 0

        # NOTE(review): in this copy of the file the literal below was empty
        # and split across two lines; its content appears to have been lost,
        # as does the indentation of the rest of this method. Restore both
        # from a pristine copy of the source.
        if (type == 'full') or (type == 'begin') :
            parares += ''
if (type == 'end'):
parares += ' '
cnt = len(pdesc)
for j in xrange( 0, cnt) :
(wtype, num) = pdesc[j]
if wtype == 'ocr' :
word = self.ocrtext[num]
sep = ' '
if handle_links:
link = self.link_id[num]
if (link > 0):
title = self.link_title[link-1]
if (title == "") or (parares.rfind(title) < 0):
title='_link_'
ptarget = self.link_page[link-1] - 1
linkhtml = '' % ptarget
linkhtml += title + ''
pos = parares.rfind(title)
if pos >= 0:
parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
else :
parares += linkhtml
if word == '_link_' : word = ''
elif (link < 0) :
if word == '_link_' : word = ''
if word == '_lb_':
if ((num-1) in self.dehyphen_rootid ) or handle_links:
word = ''
sep = ''
elif br_lb :
word = '
\n'
sep = ''
else :
word = '\n'
sep = ''
if num in self.dehyphen_rootid :
word = word[0:-1]
sep = ''
parares += word + sep
elif wtype == 'img' :
sep = ''
parares += '' % num
parares += sep
elif wtype == 'imgsa' :
sep = ' '
parares += '' % num
parares += sep
if len(sep) > 0 : parares = parares[0:-1]
if (type == 'full') or (type == 'end') :
parares += '