diff --git a/hocrtotei b/hocrtotei index ee492d7..a6e4963 100755 --- a/hocrtotei +++ b/hocrtotei @@ -4,6 +4,7 @@ import xml.etree.ElementTree as ET from xml.sax.saxutils import escape import os +import re import sys input_files = filter(lambda x: x.endswith(".hocr"), sorted(os.listdir(sys.argv[1]))) @@ -25,7 +26,8 @@ output_file.write('\n' + for input_file in input_files: tree = ET.parse(os.path.join(sys.argv[1], input_file)) - output_file.write(' \n' % (input_file.split(".")[0])) + page_number = int(re.search(r'\d+', input_file.split(".")[0]).group(0)) + output_file.write(' \n' % (page_number)) for para in tree.findall(".//*[@class='ocr_par']"): output_file.write('

\n') for line in para.findall(".//*[@class='ocr_line']"): @@ -41,4 +43,4 @@ for input_file in input_files: output_file.write(' \n' + ' \n' + '') -output_file.close() \ No newline at end of file +output_file.close() diff --git a/ocr b/ocr index 4e70c4d..07dc134 100755 --- a/ocr +++ b/ocr @@ -2,7 +2,6 @@ # coding=utf-8 - """ ocr @@ -19,16 +18,20 @@ from pyflow import WorkflowRunner ''' TODO: - ' Implement --end-page: Last page to ocr - ' Implement --memMb: Total amount of memory (RAM) available for this workflow. Default: 2048 * nCores - ' Implement --rotate: Rotate pages from input (90, 180, 270) - ' Implement --split-pages: Split pages in half after possible rotation - ' Implement --start-page: First page to ocr +' Implement --end-page: Last page to ocr +' Implement --memMb: Total amount of memory (RAM) available for this workflow. Default: 2048 * nCores +' Implement --rotate: Rotate pages from input (90, 180, 270) +' Implement --split-pages: Split pages in half after possible rotation +' Implement --start-page: First page to ocr ''' def parse_arguments(): - parser = argparse.ArgumentParser("Performs OCR of (historical) documents utilizing OCRopus for preprocessing and Tesseract OCR \ - for OCR. Available outputs are HOCR, PDF, shrinked PDF, and simple DTAbf \ - (TEI P5 compliant). Software requirements: imagemagick, ocropus, pdftk, pdftoppm, poppler-utils, pyflow, python2.7, tesseract") + parser = argparse.ArgumentParser( + "Performs OCR of (historical) documents utilizing OCRopus for \ + preprocessing and Tesseract OCR for OCR. Available outputs are HOCR, \ + PDF, shrinked PDF, and simple DTAbf (TEI P5 compliant). Software \ + requirements: imagemagick, ocropus, pdftk, pdftoppm, poppler-utils, \ + pyflow, python2.7, tesseract" + ) parser.add_argument("-i", dest="inputDir", @@ -81,12 +84,7 @@ class OCRWorkflow(WorkflowRunner): mkdir_job_number = 0 for job in self.jobs: mkdir_job_number += 1 - cmd = 'mkdir -p "%s" "%s" "%s" "%s"' % ( - os.path.join(job["output_dir"], "hocr_files"), - os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), - os.path.join(job["output_dir"], "tmp", "tesseract"), - os.path.join(job["output_dir"], "tmp", "tiff_files") - ) + cmd = 'mkdir -p "%s"' % (os.path.join(job["output_dir"], "tmp")) mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd)) ### @@ -102,20 +100,20 @@ class OCRWorkflow(WorkflowRunner): ''' cmd = 'convert "%s" "%s"' % ( job["path"], - os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-%sd.tif" % ("%"))) + os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + "-%sd.tif" % ("%"))) ''' # WORKAROUND cmd = 'tiff2pdf -o "%s" "%s" && pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox && rm "%s"' % ( - os.path.join(job["output_dir"], "tmp", "tiff_files", job["basename"].rsplit(".", 1)[0] + ".pdf"), + os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf"), job["path"], - os.path.join(job["output_dir"], "tmp", "tiff_files", job["basename"].rsplit(".", 1)[0] + ".pdf"), - os.path.join(job["output_dir"], "tmp", "tiff_files", job["basename"].rsplit(".", 1)[0]), - os.path.join(job["output_dir"], "tmp", "tiff_files", job["basename"].rsplit(".", 1)[0] + ".pdf") + os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf"), + os.path.join(job["output_dir"], "tmp", "page"), + os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf") ) else: cmd = 'pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox' % ( job["path"], - os.path.join(job["output_dir"], "tmp", "tiff_files", "page") + os.path.join(job["output_dir"], "tmp", "page") ) split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) @@ -123,95 +121,108 @@ class OCRWorkflow(WorkflowRunner): # Task "ocropus_nlbin_job": binarize tiff files from previous split # Dependencies: split_jobs ### - ocropusnlbin_jobs = [] - if (not self.skipBinarization): - self.waitForTasks() - ocropusnlbin_job_number = 0 + binarization_jobs = [] + binarization_job_number = 0 + ''' + ' We run ocropus-nlbin with either four or, if there are less then four + ' cores available for this workflow, the available core number. + ''' + binarization_job_nCores = min(4, self.nCores) + if not self.skipBinarization: for job in self.jobs: - ocropusnlbin_job_number += 1 - cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % ( - max(1, int(self.nCores / len(self.jobs))), - os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), - os.path.join(job["output_dir"], "tmp", "tiff_files") + binarization_job_number += 1 + cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*.tif' % ( + binarization_job_nCores, + os.path.join(job["output_dir"], "tmp"), + os.path.join(job["output_dir"], "tmp") ) - ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs))))) + binarization_jobs.append(self.addTask(label="binarization_job_-_%i" % (binarization_job_number), command=cmd, dependencies=split_jobs, nCores=binarization_job_nCores)) ### - # Task "tesseract_job": perform OCR on binarized images - # Dependencies: ocropusnlbin_jobs + # Task "post_binarization_job": Normalize file names from binarization + # Dependencies: binarization_jobs ### self.waitForTasks() - tesseract_jobs = [] - tesseract_job_number = 0 + post_binarization_jobs = [] + post_binarization_job_number = 0 + if not self.skipBinarization: + for job in self.jobs: + for file in filter(lambda x: x.endswith(".bin.png") or x.endswith(".nrm.png"), os.listdir(os.path.join(job["output_dir"], "tmp"))): + post_binarization_job_number += 1 + cmd = 'mv "%s" "%s"' % ( + os.path.join(job["output_dir"], "tmp", file), + os.path.join(job["output_dir"], "tmp", "page-%i.%s" % (int(file.split(".", 1)[0]), file.split(".", 1)[1])), + ) + post_binarization_jobs.append(self.addTask(label="post_binarization_job_-_%i" % (post_binarization_job_number), command=cmd, dependencies=binarization_jobs)) + + ### + # Task "ocr_job": perform OCR + # Dependencies: waitForTasks + ### + self.waitForTasks() + ocr_jobs = [] + ocr_job_number = 0 + ''' + ' Tesseract runs fastest with four cores. So we run it with either four + ' or, if there are less then four cores available for this workflow, the + ' available core number. + ''' + ocr_job_nCores = min(4, self.nCores) for job in self.jobs: - # This list is empty if you don't wait for ocropus_nlbin_jobs to complete - for file in filter(lambda x: self.skipBinarization or x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin"))): - tesseract_job_number += 1 + for file in filter(lambda x: x.endswith(".tif") if self.skipBinarization else x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp"))): + ocr_job_number += 1 cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( - os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin", file), - os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 1 if self.skipBinarization else 2)[0]), + os.path.join(job["output_dir"], "tmp", file), + os.path.join(job["output_dir"], "tmp", file.rsplit(".", 1 if self.skipBinarization else 2)[0]), self.lang ) - tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores))) + ocr_jobs.append(self.addTask(label="ocr_job_-_%i" % (ocr_job_number), command=cmd, dependencies=post_binarization_jobs, nCores=ocr_job_nCores)) ### # Task "hocr_to_teip_job": create TEI P5 file from hocr files - # Dependencies: tesseract_jobs + # Dependencies: ocr_jobs ### hocr_to_tei_jobs = [] hocr_to_tei_job_number = 0 for job in self.jobs: hocr_to_tei_job_number += 1 cmd = 'hocrtotei "%s" "%s"' % ( - os.path.join(job["output_dir"], "tmp", "tesseract"), + os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml") ) - hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=tesseract_jobs)) - - ### - # Task "move_hocr_job": move hocr files from /tmp/tesseract to /hocr_files - # Dependencies: hocr_to_teip5_jobs - ### - move_hocr_jobs = [] - move_hocr_job_number = 0 - for job in self.jobs: - move_hocr_job_number += 1 - cmd = 'mv "%s"/*.hocr "%s"' % ( - os.path.join(job["output_dir"], "tmp", "tesseract"), - os.path.join(job["output_dir"], "hocr_files") - ) - move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_tei_jobs)) + hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=ocr_jobs)) ### # Task "pdf_merge_job": Merge PDF files - # Dependencies: tesseract_jobs + # Dependencies: ocr_jobs ### pdf_merge_jobs = [] pdf_merge_job_number = 0 for job in self.jobs: pdf_merge_job_number += 1 cmd = 'pdftk "%s"/*.pdf cat output "%s"' % ( - os.path.join(job["output_dir"], "tmp", "tesseract"), + os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".pdf") ) - pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs)) + pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=ocr_jobs)) ### - # Task "pdf_to_txt_job": - # Dependencies: pdf_merge_jobs + # Task "txt_merge_job": Merge .txt files + # Dependencies: ocr_jobs ### - pdf_to_txt_jobs = [] - pdf_to_txt_job_number = 0 + txt_merge_jobs = [] + txt_merge_job_number = 0 for job in self.jobs: - pdf_to_txt_job_number += 1 - cmd = 'pdftotext -raw "%s"' % ( - os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".pdf") + txt_merge_job_number += 1 + cmd = 'cat "%s"/*.txt > "%s"' % ( + os.path.join(job["output_dir"], "tmp"), + os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".txt") ) - pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs)) + txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=ocr_jobs)) ### # Task "cleanup_job": remove temporary files - # Dependencies: hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs + # Dependencies: hocr_to_teip5_jobs + pdf_merge_jobs + txt_merge_jobs ### cleanup_jobs = [] cleanup_job_counter = 0 @@ -221,7 +232,7 @@ class OCRWorkflow(WorkflowRunner): cmd = 'rm -r "%s"' % ( os.path.join(job["output_dir"], "tmp") ) - cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs)) + cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs)) def analyze_jobs(inputDir, outputDir, level=1): @@ -259,4 +270,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()