From 26757eda03ac7a18f165e8acedd2878e742397c4 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Sun, 10 Mar 2019 20:59:30 +0100
Subject: [PATCH] Some renaming and cleanup.
---
Dockerfile | 6 ++----
parse_hocr => hocrtotei | 0
ocr_pyflow => ocr | 16 ++++++++--------
3 files changed, 10 insertions(+), 12 deletions(-)
rename parse_hocr => hocrtotei (100%)
rename ocr_pyflow => ocr (95%)
diff --git a/Dockerfile b/Dockerfile
index f97b123..0fd3d6c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -55,9 +55,7 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
-RUN mkdir files_for_ocr files_from_ocr
-
-COPY ocr_pyflow /usr/local/bin
-COPY parse_hocr /usr/local/bin
+COPY ocr /usr/local/bin
+COPY hocrtotei /usr/local/bin
CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/parse_hocr b/hocrtotei
similarity index 100%
rename from parse_hocr
rename to hocrtotei
diff --git a/ocr_pyflow b/ocr
similarity index 95%
rename from ocr_pyflow
rename to ocr
index 1ac635e..c7b50ff 100755
--- a/ocr_pyflow
+++ b/ocr
@@ -4,7 +4,7 @@
"""
-ocr_pyflow.py
+ocr
Usage: For usage instructions run with option --help
Author: Patrick Jentsch
@@ -148,15 +148,15 @@ class OCRWorkflow(WorkflowRunner):
# Task "hocr_to_teip5_job": create TEI P5 file from hocr files
# Dependencies: tesseract_jobs
###
- hocr_to_teip5_jobs = []
- hocr_to_teip5_job_number = 0
+ hocr_to_tei_jobs = []
+ hocr_to_tei_job_number = 0
for job in self.jobs:
- hocr_to_teip5_job_number += 1
- cmd = 'parse_hocr "%s" "%s"' % (
+ hocr_to_tei_job_number += 1
+ cmd = 'hocrtotei "%s" "%s"' % (
os.path.join(job["output_dir"], "tmp", "tesseract"),
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
)
- hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=tesseract_jobs))
+ hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=tesseract_jobs))
###
# Task "move_hocr_job": move hocr files from /tmp/tesseract to /hocr_files
@@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job["output_dir"], "tmp", "tesseract"),
os.path.join(job["output_dir"], "hocr_files")
)
- move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_teip5_jobs))
+ move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_tei_jobs))
###
# Task "pdf_merge_job": Merge PDF files
@@ -211,7 +211,7 @@ class OCRWorkflow(WorkflowRunner):
cmd = 'rm -r "%s"' % (
os.path.join(job["output_dir"], "tmp")
)
- cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs))
+ cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs))
def analyze_jobs(inputDir, outputDir, level=1):