mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-27 06:04:18 +00:00
Change tif split handling, sort files before merging
This commit is contained in:
parent
1fcb8bd318
commit
d25204d6a9
20
Dockerfile
20
Dockerfile
@ -11,7 +11,6 @@ RUN apt-get update && \
|
|||||||
ca-certificates \
|
ca-certificates \
|
||||||
gnupg2 \
|
gnupg2 \
|
||||||
imagemagick \
|
imagemagick \
|
||||||
libtiff-tools \
|
|
||||||
pdftk \
|
pdftk \
|
||||||
poppler-utils \
|
poppler-utils \
|
||||||
python2.7 \
|
python2.7 \
|
||||||
@ -45,15 +44,16 @@ RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERS
|
|||||||
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
|
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
|
||||||
wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
|
wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y --no-install-recommends tesseract-ocr && \
|
apt-get install -y --no-install-recommends \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-deu \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-frk \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-eng \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-enm \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-fra \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-frm \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
tesseract-ocr-por \
|
||||||
|
tesseract-ocr-spa
|
||||||
|
|
||||||
COPY ocr /usr/local/bin
|
COPY ocr /usr/local/bin
|
||||||
COPY hocrtotei /usr/local/bin
|
COPY hocrtotei /usr/local/bin
|
||||||
|
48
ocr
48
ocr
@ -105,23 +105,13 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
split_job_number = 0
|
split_job_number = 0
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
split_job_number += 1
|
split_job_number += 1
|
||||||
if job["basename"].endswith(".tif") or job["basename"].endswith(".tiff"):
|
if job["filename"].endswith((".tif", ".tiff")):
|
||||||
# TODO: Make the following command work
|
cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s"/page-%%d.tif' % (
|
||||||
'''
|
|
||||||
cmd = 'convert "%s" "%s"' % (
|
|
||||||
job["path"],
|
job["path"],
|
||||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + "-%sd.tif" % ("%")))
|
os.path.join(job["output_dir"], "tmp")
|
||||||
'''
|
|
||||||
# WORKAROUND
|
|
||||||
cmd = 'tiff2pdf -o "%s" "%s" && pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox && rm "%s"' % (
|
|
||||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf"),
|
|
||||||
job["path"],
|
|
||||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf"),
|
|
||||||
os.path.join(job["output_dir"], "tmp", "page"),
|
|
||||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf")
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
cmd = 'pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox' % (
|
cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % (
|
||||||
job["path"],
|
job["path"],
|
||||||
os.path.join(job["output_dir"], "tmp", "page")
|
os.path.join(job["output_dir"], "tmp", "page")
|
||||||
)
|
)
|
||||||
@ -141,9 +131,9 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
if not self.skipBinarization:
|
if not self.skipBinarization:
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
binarization_job_number += 1
|
binarization_job_number += 1
|
||||||
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*.tif' % (
|
cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls "%s"/*.tif | sort -V)' % (
|
||||||
binarization_job_nCores,
|
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
|
binarization_job_nCores,
|
||||||
os.path.join(job["output_dir"], "tmp")
|
os.path.join(job["output_dir"], "tmp")
|
||||||
)
|
)
|
||||||
binarization_jobs.append(self.addTask(label="binarization_job_-_%i" % (binarization_job_number), command=cmd, dependencies=split_jobs, nCores=binarization_job_nCores))
|
binarization_jobs.append(self.addTask(label="binarization_job_-_%i" % (binarization_job_number), command=cmd, dependencies=split_jobs, nCores=binarization_job_nCores))
|
||||||
@ -157,7 +147,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
post_binarization_job_number = 0
|
post_binarization_job_number = 0
|
||||||
if not self.skipBinarization:
|
if not self.skipBinarization:
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
for file in filter(lambda x: x.endswith(".bin.png") or x.endswith(".nrm.png"), os.listdir(os.path.join(job["output_dir"], "tmp"))):
|
for file in filter(lambda x: x.endswith((".bin.png", ".nrm.png")), os.listdir(os.path.join(job["output_dir"], "tmp"))):
|
||||||
post_binarization_job_number += 1
|
post_binarization_job_number += 1
|
||||||
cmd = 'mv "%s" "%s"' % (
|
cmd = 'mv "%s" "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp", file),
|
os.path.join(job["output_dir"], "tmp", file),
|
||||||
@ -195,7 +185,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
ocr_jobs.append(self.addTask(label="ocr_job_-_%i" % (ocr_job_number), command=cmd, dependencies=post_binarization_jobs, nCores=ocr_job_nCores))
|
ocr_jobs.append(self.addTask(label="ocr_job_-_%i" % (ocr_job_number), command=cmd, dependencies=post_binarization_jobs, nCores=ocr_job_nCores))
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "hocr_to_teip_job": create TEI P5 file from hocr files
|
# Task "hocr_to_tei_job": create TEI P5 file from hocr files
|
||||||
# Dependencies: ocr_jobs
|
# Dependencies: ocr_jobs
|
||||||
###
|
###
|
||||||
hocr_to_tei_jobs = []
|
hocr_to_tei_jobs = []
|
||||||
@ -204,7 +194,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
hocr_to_tei_job_number += 1
|
hocr_to_tei_job_number += 1
|
||||||
cmd = 'hocrtotei "%s" "%s"' % (
|
cmd = 'hocrtotei "%s" "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
|
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".xml")
|
||||||
)
|
)
|
||||||
hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=ocr_jobs))
|
hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=ocr_jobs))
|
||||||
|
|
||||||
@ -216,9 +206,9 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
pdf_merge_job_number = 0
|
pdf_merge_job_number = 0
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
pdf_merge_job_number += 1
|
pdf_merge_job_number += 1
|
||||||
cmd = 'pdftk "%s"/*.pdf cat output "%s"' % (
|
cmd = 'pdftk $(ls "%s"/*.pdf | sort -V) cat output "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".pdf")
|
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".pdf")
|
||||||
)
|
)
|
||||||
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=ocr_jobs))
|
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=ocr_jobs))
|
||||||
|
|
||||||
@ -230,9 +220,9 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
txt_merge_job_number = 0
|
txt_merge_job_number = 0
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
txt_merge_job_number += 1
|
txt_merge_job_number += 1
|
||||||
cmd = 'cat "%s"/*.txt > "%s"' % (
|
cmd = 'cat $(ls "%s"/*.txt | sort -V) > "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".txt")
|
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".txt")
|
||||||
)
|
)
|
||||||
txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=ocr_jobs))
|
txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=ocr_jobs))
|
||||||
|
|
||||||
@ -272,21 +262,17 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs))
|
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs))
|
||||||
|
|
||||||
|
|
||||||
def analyze_jobs(inputDir, outputDir, level=1):
|
def analyze_jobs(inputDir, outputDir):
|
||||||
jobs = []
|
jobs = []
|
||||||
|
|
||||||
if level > 2:
|
|
||||||
return jobs
|
|
||||||
|
|
||||||
for file in os.listdir(inputDir):
|
for file in os.listdir(inputDir):
|
||||||
if os.path.isdir(os.path.join(inputDir, file)):
|
if os.path.isdir(os.path.join(inputDir, file)):
|
||||||
jobs += analyze_jobs(
|
jobs += analyze_jobs(
|
||||||
os.path.join(inputDir, file),
|
os.path.join(inputDir, file),
|
||||||
os.path.join(outputDir, file),
|
os.path.join(outputDir, file)
|
||||||
level + 1
|
|
||||||
)
|
)
|
||||||
elif file.endswith(".pdf") or file.endswith(".tif") or file.endswith(".tiff"):
|
elif file.endswith((".pdf", ".tif", ".tiff")):
|
||||||
jobs.append({"basename": os.path.basename(file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0]), "path": os.path.join(inputDir, file)})
|
jobs.append({"filename": file, "output_dir": os.path.join(outputDir, file), "path": os.path.join(inputDir, file)})
|
||||||
|
|
||||||
return jobs
|
return jobs
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user