mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 15:54:19 +00:00
Some renaming and cleanup.
This commit is contained in:
parent
8840217829
commit
26757eda03
@ -55,9 +55,7 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
||||
|
||||
RUN mkdir files_for_ocr files_from_ocr
|
||||
|
||||
COPY ocr_pyflow /usr/local/bin
|
||||
COPY parse_hocr /usr/local/bin
|
||||
COPY ocr /usr/local/bin
|
||||
COPY hocr2tei /usr/local/bin
|
||||
|
||||
CMD ["/bin/bash"]
|
@ -4,7 +4,7 @@
|
||||
|
||||
|
||||
"""
|
||||
ocr_pyflow.py
|
||||
ocr
|
||||
|
||||
Usage: For usage instructions run with option --help
|
||||
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
|
||||
@ -148,15 +148,15 @@ class OCRWorkflow(WorkflowRunner):
|
||||
# Task "hocr_to_tei_job": create TEI P5 file from hocr files
|
||||
# Dependencies: tesseract_jobs
|
||||
###
|
||||
hocr_to_teip5_jobs = []
|
||||
hocr_to_teip5_job_number = 0
|
||||
hocr_to_tei_jobs = []
|
||||
hocr_to_tei_job_number = 0
|
||||
for job in self.jobs:
|
||||
hocr_to_teip5_job_number += 1
|
||||
cmd = 'parse_hocr "%s" "%s"' % (
|
||||
hocr_to_tei_job_number += 1
|
||||
cmd = 'hocrtotei "%s" "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
|
||||
)
|
||||
hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=tesseract_jobs))
|
||||
hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=tesseract_jobs))
|
||||
|
||||
###
|
||||
# Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files
|
||||
@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
||||
os.path.join(job["output_dir"], "hocr_files")
|
||||
)
|
||||
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_teip5_jobs))
|
||||
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_tei_jobs))
|
||||
|
||||
###
|
||||
# Task "pdf_merge_job": Merge PDF files
|
||||
@ -211,7 +211,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
cmd = 'rm -r "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp")
|
||||
)
|
||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs))
|
||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs))
|
||||
|
||||
|
||||
def analyze_jobs(inputDir, outputDir, level=1):
|
Loading…
Reference in New Issue
Block a user