mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-27 06:34:18 +00:00
Some renaming and cleanup.
This commit is contained in:
parent
8840217829
commit
26757eda03
@ -55,9 +55,7 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et
|
|||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
||||||
|
|
||||||
RUN mkdir files_for_ocr files_from_ocr
|
COPY ocr /usr/local/bin
|
||||||
|
COPY hocr2tei /usr/local/bin
|
||||||
COPY ocr_pyflow /usr/local/bin
|
|
||||||
COPY parse_hocr /usr/local/bin
|
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
ocr_pyflow.py
|
ocr
|
||||||
|
|
||||||
Usage: For usage instructions run with option --help
|
Usage: For usage instructions run with option --help
|
||||||
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
|
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
|
||||||
@ -148,15 +148,15 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
# Task "hocr_to_teip5_job": create TEI P5 file from hocr files
|
# Task "hocr_to_teip5_job": create TEI P5 file from hocr files
|
||||||
# Dependencies: tesseract_jobs
|
# Dependencies: tesseract_jobs
|
||||||
###
|
###
|
||||||
hocr_to_teip5_jobs = []
|
hocr_to_tei_jobs = []
|
||||||
hocr_to_teip5_job_number = 0
|
hocr_to_tei_job_number = 0
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
hocr_to_teip5_job_number += 1
|
hocr_to_tei_job_number += 1
|
||||||
cmd = 'parse_hocr "%s" "%s"' % (
|
cmd = 'hocrtotei "%s" "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
||||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
|
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
|
||||||
)
|
)
|
||||||
hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=tesseract_jobs))
|
hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=tesseract_jobs))
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files
|
# Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files
|
||||||
@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
||||||
os.path.join(job["output_dir"], "hocr_files")
|
os.path.join(job["output_dir"], "hocr_files")
|
||||||
)
|
)
|
||||||
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_teip5_jobs))
|
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_tei_jobs))
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "pdf_merge_job": Merge PDF files
|
# Task "pdf_merge_job": Merge PDF files
|
||||||
@ -211,7 +211,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
cmd = 'rm -r "%s"' % (
|
cmd = 'rm -r "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp")
|
os.path.join(job["output_dir"], "tmp")
|
||||||
)
|
)
|
||||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs))
|
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs))
|
||||||
|
|
||||||
|
|
||||||
def analyze_jobs(inputDir, outputDir, level=1):
|
def analyze_jobs(inputDir, outputDir, level=1):
|
Loading…
Reference in New Issue
Block a user