diff --git a/hocrtotei b/hocrtotei index 19aefe2..6635c65 100755 --- a/hocrtotei +++ b/hocrtotei @@ -1,54 +1,57 @@ #!/usr/bin/env python3.7 # coding=utf-8 -""""Merges hOCR files into a TEI file.""" +""""Convert hOCR to TEI XML.""" from xml.sax.saxutils import escape from argparse import ArgumentParser import re import xml.etree.ElementTree as ET -parser = ArgumentParser(description='Merges hOCR files into a TEI file.') -parser.add_argument('i', metavar='hOCR-sourcefile') -parser.add_argument('o', metavar='TEI-destfile') +parser = ArgumentParser(description='Convert hOCR to TEI XML.') +parser.add_argument('i', metavar='Path to hOCR input file') +parser.add_argument('o', metavar='Path to TEI output file') args = parser.parse_args() -output_file = open(args.o, 'w') -output_file.write( - '\n' - + '\n' - + ' \n' - + ' \n' - + ' \n' - + ' \n' - + ' \n' - + ' \n' - + ' \n' - + ' \n' - + ' \n' - + ' \n' - + ' \n' -) -tree = ET.parse(args.i) -for page in tree.findall('.//*[@class="ocr_page"]'): +tei = '' +tei += '\n' +tei += ' \n' +tei += ' \n' +tei += ' \n' +tei += ' \n' +tei += ' \n' +tei += ' \n' +tei += '

\n' +tei += '
\n' +tei += ' \n' +tei += '

\n' +tei += '
\n' +tei += '
\n' +tei += '
\n' +tei += ' \n' +tei += ' \n' +# Conversion start +hocr = ET.parse(args.i) +for page in hocr.findall('.//*[@class="ocr_page"]'): page_properties = page.attrib.get('title') facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1) page_number = re.search(r'ppageno (\d+)', page_properties).group(1) - output_file.write(' \n' % (facsimile, page_number)) # noqa + tei += ' \n'.format(facsimile, page_number) for para in page.findall('.//*[@class="ocr_par"]'): - output_file.write('

\n') + tei += '

\n' for line in para.findall('.//*[@class="ocr_line"]'): - output_file.write(' ') + tei += ' ' indent = '' for word in line.findall('.//*[@class="ocrx_word"]'): if word.text is not None: - output_file.write(indent + escape(word.text.strip())) + tei += indent + escape(word.text.strip()) indent = ' ' - output_file.write('\n') - output_file.write('

\n') -output_file.write( - ' \n' - + '
\n' - + '
' -) -output_file.close() + tei += '\n' + tei += '

\n' +# Conversion end +tei += ' \n' +tei += '
\n' +tei += '
\n' + +with open(args.o, 'w') as tei_file: + tei_file.write(tei) diff --git a/ocr b/ocr index 71927a2..262a758 100755 --- a/ocr +++ b/ocr @@ -1,7 +1,7 @@ #!/usr/bin/env python2.7 # coding=utf-8 -"""An OCR pipeline for PDF file processing.""" +"""OCR pipeline for PDF file processing.""" __author__ = 'Patrick Jentsch ,' \ 'Stephan Porada '