mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 16:04:18 +00:00
Change tif split handling, sort files before merging
This commit is contained in:
parent
1fcb8bd318
commit
d25204d6a9
20
Dockerfile
20
Dockerfile
@ -11,7 +11,6 @@ RUN apt-get update && \
|
||||
ca-certificates \
|
||||
gnupg2 \
|
||||
imagemagick \
|
||||
libtiff-tools \
|
||||
pdftk \
|
||||
poppler-utils \
|
||||
python2.7 \
|
||||
@ -45,15 +44,16 @@ RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERS
|
||||
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
|
||||
wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends tesseract-ocr && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
||||
apt-get install -y --no-install-recommends \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-deu \
|
||||
tesseract-ocr-frk \
|
||||
tesseract-ocr-eng \
|
||||
tesseract-ocr-enm \
|
||||
tesseract-ocr-fra \
|
||||
tesseract-ocr-frm \
|
||||
tesseract-ocr-por \
|
||||
tesseract-ocr-spa
|
||||
|
||||
COPY ocr /usr/local/bin
|
||||
COPY hocrtotei /usr/local/bin
|
||||
|
48
ocr
48
ocr
@ -105,23 +105,13 @@ class OCRWorkflow(WorkflowRunner):
|
||||
split_job_number = 0
|
||||
for job in self.jobs:
|
||||
split_job_number += 1
|
||||
if job["basename"].endswith(".tif") or job["basename"].endswith(".tiff"):
|
||||
# TODO: Make the following command work
|
||||
'''
|
||||
cmd = 'convert "%s" "%s"' % (
|
||||
if job["filename"].endswith((".tif", ".tiff")):
|
||||
cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s"/page-%%d.tif' % (
|
||||
job["path"],
|
||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + "-%sd.tif" % ("%")))
|
||||
'''
|
||||
# WORKAROUND
|
||||
cmd = 'tiff2pdf -o "%s" "%s" && pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox && rm "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf"),
|
||||
job["path"],
|
||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf"),
|
||||
os.path.join(job["output_dir"], "tmp", "page"),
|
||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf")
|
||||
os.path.join(job["output_dir"], "tmp")
|
||||
)
|
||||
else:
|
||||
cmd = 'pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox' % (
|
||||
cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % (
|
||||
job["path"],
|
||||
os.path.join(job["output_dir"], "tmp", "page")
|
||||
)
|
||||
@ -141,9 +131,9 @@ class OCRWorkflow(WorkflowRunner):
|
||||
if not self.skipBinarization:
|
||||
for job in self.jobs:
|
||||
binarization_job_number += 1
|
||||
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*.tif' % (
|
||||
binarization_job_nCores,
|
||||
cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls "%s"/*.tif | sort -V)' % (
|
||||
os.path.join(job["output_dir"], "tmp"),
|
||||
binarization_job_nCores,
|
||||
os.path.join(job["output_dir"], "tmp")
|
||||
)
|
||||
binarization_jobs.append(self.addTask(label="binarization_job_-_%i" % (binarization_job_number), command=cmd, dependencies=split_jobs, nCores=binarization_job_nCores))
|
||||
@ -157,7 +147,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
post_binarization_job_number = 0
|
||||
if not self.skipBinarization:
|
||||
for job in self.jobs:
|
||||
for file in filter(lambda x: x.endswith(".bin.png") or x.endswith(".nrm.png"), os.listdir(os.path.join(job["output_dir"], "tmp"))):
|
||||
for file in filter(lambda x: x.endswith((".bin.png", ".nrm.png")), os.listdir(os.path.join(job["output_dir"], "tmp"))):
|
||||
post_binarization_job_number += 1
|
||||
cmd = 'mv "%s" "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp", file),
|
||||
@ -195,7 +185,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
ocr_jobs.append(self.addTask(label="ocr_job_-_%i" % (ocr_job_number), command=cmd, dependencies=post_binarization_jobs, nCores=ocr_job_nCores))
|
||||
|
||||
###
|
||||
# Task "hocr_to_teip_job": create TEI P5 file from hocr files
|
||||
# Task "hocr_to_tei_job": create TEI P5 file from hocr files
|
||||
# Dependencies: ocr_jobs
|
||||
###
|
||||
hocr_to_tei_jobs = []
|
||||
@ -204,7 +194,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
hocr_to_tei_job_number += 1
|
||||
cmd = 'hocrtotei "%s" "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp"),
|
||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
|
||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".xml")
|
||||
)
|
||||
hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=ocr_jobs))
|
||||
|
||||
@ -216,9 +206,9 @@ class OCRWorkflow(WorkflowRunner):
|
||||
pdf_merge_job_number = 0
|
||||
for job in self.jobs:
|
||||
pdf_merge_job_number += 1
|
||||
cmd = 'pdftk "%s"/*.pdf cat output "%s"' % (
|
||||
cmd = 'pdftk $(ls "%s"/*.pdf | sort -V) cat output "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp"),
|
||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".pdf")
|
||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".pdf")
|
||||
)
|
||||
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=ocr_jobs))
|
||||
|
||||
@ -230,9 +220,9 @@ class OCRWorkflow(WorkflowRunner):
|
||||
txt_merge_job_number = 0
|
||||
for job in self.jobs:
|
||||
txt_merge_job_number += 1
|
||||
cmd = 'cat "%s"/*.txt > "%s"' % (
|
||||
cmd = 'cat $(ls "%s"/*.txt | sort -V) > "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp"),
|
||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".txt")
|
||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".txt")
|
||||
)
|
||||
txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=ocr_jobs))
|
||||
|
||||
@ -272,21 +262,17 @@ class OCRWorkflow(WorkflowRunner):
|
||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs))
|
||||
|
||||
|
||||
def analyze_jobs(inputDir, outputDir, level=1):
|
||||
def analyze_jobs(inputDir, outputDir):
|
||||
jobs = []
|
||||
|
||||
if level > 2:
|
||||
return jobs
|
||||
|
||||
for file in os.listdir(inputDir):
|
||||
if os.path.isdir(os.path.join(inputDir, file)):
|
||||
jobs += analyze_jobs(
|
||||
os.path.join(inputDir, file),
|
||||
os.path.join(outputDir, file),
|
||||
level + 1
|
||||
os.path.join(outputDir, file)
|
||||
)
|
||||
elif file.endswith(".pdf") or file.endswith(".tif") or file.endswith(".tiff"):
|
||||
jobs.append({"basename": os.path.basename(file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0]), "path": os.path.join(inputDir, file)})
|
||||
elif file.endswith((".pdf", ".tif", ".tiff")):
|
||||
jobs.append({"filename": file, "output_dir": os.path.join(outputDir, file), "path": os.path.join(inputDir, file)})
|
||||
|
||||
return jobs
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user