diff --git a/Dockerfile b/Dockerfile index f97b123..0fd3d6c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -55,9 +55,7 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata -RUN mkdir files_for_ocr files_from_ocr - -COPY ocr_pyflow /usr/local/bin -COPY parse_hocr /usr/local/bin +COPY ocr /usr/local/bin +COPY hocrtotei /usr/local/bin CMD ["/bin/bash"] \ No newline at end of file diff --git a/parse_hocr b/hocrtotei similarity index 100% rename from parse_hocr rename to hocrtotei diff --git a/ocr_pyflow b/ocr similarity index 95% rename from ocr_pyflow rename to ocr index 1ac635e..c7b50ff 100755 --- a/ocr_pyflow +++ b/ocr @@ -4,7 +4,7 @@ """ -ocr_pyflow.py +ocr Usage: For usage instructions run with option --help Author: Patrick Jentsch @@ -148,15 +148,15 @@ class OCRWorkflow(WorkflowRunner): # Task "hocr_to_teip5_job": create TEI P5 file from hocr files # Dependencies: tesseract_jobs ### - hocr_to_teip5_jobs = [] - hocr_to_teip5_job_number = 0 + hocr_to_tei_jobs = [] + hocr_to_tei_job_number = 0 for job in self.jobs: - hocr_to_teip5_job_number += 1 - cmd = 'parse_hocr "%s" "%s"' % ( + hocr_to_tei_job_number += 1 + cmd = 'hocrtotei "%s" "%s"' % ( os.path.join(job["output_dir"], "tmp", "tesseract"), os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml") ) - hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=tesseract_jobs)) + hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=tesseract_jobs)) ### # Task "move_hocr_job": move hocr files from /tmp/tesseract to /hocr_files @@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner): 
os.path.join(job["output_dir"], "tmp", "tesseract"), os.path.join(job["output_dir"], "hocr_files") ) - move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_teip5_jobs)) + move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_tei_jobs)) ### # Task "pdf_merge_job": Merge PDF files @@ -211,7 +211,7 @@ class OCRWorkflow(WorkflowRunner): cmd = 'rm -r "%s"' % ( os.path.join(job["output_dir"], "tmp") ) - cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs)) + cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs)) def analyze_jobs(inputDir, outputDir, level=1):