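Some renaming and cleanup: the workflow script is now named `ocr` (was `ocr_pyflow.py`), the `hocr_to_teip5_*` job variables and task labels become `hocr_to_tei_*`, and the Dockerfile's COPY lines are updated for the renamed scripts.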

This commit is contained in:
Patrick Jentsch 2019-03-10 20:59:30 +01:00
parent 8840217829
commit 26757eda03
3 changed files with 10 additions and 12 deletions

View File

@ -55,9 +55,7 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
RUN mkdir files_for_ocr files_from_ocr COPY ocr /usr/local/bin
COPY hocr2tei /usr/local/bin
COPY ocr_pyflow /usr/local/bin
COPY parse_hocr /usr/local/bin
CMD ["/bin/bash"] CMD ["/bin/bash"]

View File

@ -4,7 +4,7 @@
""" """
ocr_pyflow.py ocr
Usage: For usage instructions run with option --help Usage: For usage instructions run with option --help
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de> Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
@ -148,15 +148,15 @@ class OCRWorkflow(WorkflowRunner):
# Task "hocr_to_teip5_job": create TEI P5 file from hocr files # Task "hocr_to_teip5_job": create TEI P5 file from hocr files
# Dependencies: tesseract_jobs # Dependencies: tesseract_jobs
### ###
hocr_to_teip5_jobs = [] hocr_to_tei_jobs = []
hocr_to_teip5_job_number = 0 hocr_to_tei_job_number = 0
for job in self.jobs: for job in self.jobs:
hocr_to_teip5_job_number += 1 hocr_to_tei_job_number += 1
cmd = 'parse_hocr "%s" "%s"' % ( cmd = 'hocrtotei "%s" "%s"' % (
os.path.join(job["output_dir"], "tmp", "tesseract"), os.path.join(job["output_dir"], "tmp", "tesseract"),
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml") os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
) )
hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=tesseract_jobs)) hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=tesseract_jobs))
### ###
# Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files # Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files
@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job["output_dir"], "tmp", "tesseract"), os.path.join(job["output_dir"], "tmp", "tesseract"),
os.path.join(job["output_dir"], "hocr_files") os.path.join(job["output_dir"], "hocr_files")
) )
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_teip5_jobs)) move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_tei_jobs))
### ###
# Task "pdf_merge_job": Merge PDF files # Task "pdf_merge_job": Merge PDF files
@ -211,7 +211,7 @@ class OCRWorkflow(WorkflowRunner):
cmd = 'rm -r "%s"' % ( cmd = 'rm -r "%s"' % (
os.path.join(job["output_dir"], "tmp") os.path.join(job["output_dir"], "tmp")
) )
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs)) cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs))
def analyze_jobs(inputDir, outputDir, level=1): def analyze_jobs(inputDir, outputDir, level=1):