mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-27 05:04:18 +00:00
Correct order for output files.
This commit is contained in:
parent
937abb8c8d
commit
843151e547
20
Dockerfile
20
Dockerfile
@ -1,6 +1,6 @@
|
|||||||
FROM debian:stretch-slim
|
FROM debian:stretch-slim
|
||||||
|
|
||||||
MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de>
|
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ENV LANG=C.UTF-8
|
ENV LANG=C.UTF-8
|
||||||
@ -11,34 +11,31 @@ RUN apt-get update && \
|
|||||||
ca-certificates \
|
ca-certificates \
|
||||||
gnupg2 \
|
gnupg2 \
|
||||||
imagemagick \
|
imagemagick \
|
||||||
pdftk \
|
|
||||||
poppler-utils \
|
poppler-utils \
|
||||||
python2.7 \
|
python2.7 \
|
||||||
python3.5 \
|
python3.5 \
|
||||||
python-numpy \
|
python-numpy \
|
||||||
wget
|
wget
|
||||||
|
|
||||||
WORKDIR /root
|
|
||||||
|
|
||||||
# Install ocropy
|
# Install ocropy
|
||||||
ENV OCROPY_VERSION 1.3.3
|
ENV OCROPY_VERSION 1.3.3
|
||||||
RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \
|
RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \
|
||||||
tar -xzf v"$OCROPY_VERSION".tar.gz && \
|
tar -xzf v"$OCROPY_VERSION".tar.gz && \
|
||||||
rm v"$OCROPY_VERSION".tar.gz && \
|
|
||||||
cd ocropy-"$OCROPY_VERSION" && \
|
cd ocropy-"$OCROPY_VERSION" && \
|
||||||
apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \
|
apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \
|
||||||
wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \
|
wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \
|
||||||
python2.7 setup.py install && \
|
python2.7 setup.py install && \
|
||||||
cd ..
|
cd .. && \
|
||||||
|
rm -r v"$OCROPY_VERSION".tar.gz ocropy-"$OCROPY_VERSION"
|
||||||
|
|
||||||
# Install pyFlow
|
# Install pyFlow
|
||||||
ENV PYFLOW_VERSION 1.1.20
|
ENV PYFLOW_VERSION 1.1.20
|
||||||
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
|
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
|
||||||
tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
|
tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
|
||||||
rm pyflow-"$PYFLOW_VERSION".tar.gz && \
|
|
||||||
cd pyflow-"$PYFLOW_VERSION" && \
|
cd pyflow-"$PYFLOW_VERSION" && \
|
||||||
python2.7 setup.py build install && \
|
python2.7 setup.py build install && \
|
||||||
cd ..
|
cd .. && \
|
||||||
|
rm -r pyflow-"$PYFLOW_VERSION".tar.gz pyflow-"$PYFLOW_VERSION"
|
||||||
|
|
||||||
# Install Tesseract OCR and Data Files
|
# Install Tesseract OCR and Data Files
|
||||||
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
|
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
|
||||||
@ -52,11 +49,12 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et
|
|||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||||
|
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
||||||
|
|
||||||
COPY ocr /usr/local/bin
|
|
||||||
COPY hocrtotei /usr/local/bin
|
COPY hocrtotei /usr/local/bin
|
||||||
|
COPY ocr /usr/local/bin
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
ENTRYPOINT ["ocr"]
|
||||||
|
CMD ["--help"]
|
||||||
|
45
hocrtotei
45
hocrtotei
@ -7,22 +7,31 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
input_files = filter(lambda x: x.endswith(".hocr"), sorted(os.listdir(sys.argv[1])))
|
input_files = sorted(
|
||||||
|
filter(
|
||||||
|
lambda x: x.endswith(".hocr"),
|
||||||
|
os.listdir(sys.argv[1])
|
||||||
|
),
|
||||||
|
key=lambda x: int(re.search(r'\d+', x).group(0))
|
||||||
|
)
|
||||||
|
# "page-1.hocr" -> "1"
|
||||||
output_file = open(sys.argv[2], "w")
|
output_file = open(sys.argv[2], "w")
|
||||||
|
|
||||||
output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n' +
|
output_file.write(
|
||||||
'<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n' +
|
'<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||||
' <teiHeader>\n' +
|
+ '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
|
||||||
' <fileDesc>\n' +
|
+ ' <teiHeader>\n'
|
||||||
' <titleStmt/>\n' +
|
+ ' <fileDesc>\n'
|
||||||
' <publicationStmt/>\n' +
|
+ ' <titleStmt/>\n'
|
||||||
' <sourceDesc/>\n' +
|
+ ' <publicationStmt/>\n'
|
||||||
' </fileDesc>\n' +
|
+ ' <sourceDesc/>\n'
|
||||||
' <encodingDesc/>\n' +
|
+ ' </fileDesc>\n'
|
||||||
' <profileDesc/>\n' +
|
+ ' <encodingDesc/>\n'
|
||||||
' </teiHeader>\n' +
|
+ ' <profileDesc/>\n'
|
||||||
' <text>\n' +
|
+ ' </teiHeader>\n'
|
||||||
' <body>\n')
|
+ ' <text>\n'
|
||||||
|
+ ' <body>\n'
|
||||||
|
)
|
||||||
|
|
||||||
for input_file in input_files:
|
for input_file in input_files:
|
||||||
tree = ET.parse(os.path.join(sys.argv[1], input_file))
|
tree = ET.parse(os.path.join(sys.argv[1], input_file))
|
||||||
@ -40,7 +49,9 @@ for input_file in input_files:
|
|||||||
output_file.write('<lb/>\n')
|
output_file.write('<lb/>\n')
|
||||||
output_file.write(' </p>\n')
|
output_file.write(' </p>\n')
|
||||||
|
|
||||||
output_file.write(' </body>\n' +
|
output_file.write(
|
||||||
' </text>\n' +
|
' </body>\n'
|
||||||
'</TEI>')
|
+ ' </text>\n'
|
||||||
|
+ '</TEI>')
|
||||||
|
|
||||||
output_file.close()
|
output_file.close()
|
||||||
|
189
ocr
189
ocr
@ -19,32 +19,27 @@ from pyflow import WorkflowRunner
|
|||||||
|
|
||||||
''' TODO:
|
''' TODO:
|
||||||
' Implement --end-page: Last page to ocr
|
' Implement --end-page: Last page to ocr
|
||||||
' Implement --memMb: Total amount of memory (RAM) available for this workflow. Default: 2048 * nCores
|
' Implement --memMb: Total amount of memory (RAM) available for this workflow.
|
||||||
|
' Default: 2048 * nCores
|
||||||
' Implement --rotate: Rotate pages from input (90, 180, 270)
|
' Implement --rotate: Rotate pages from input (90, 180, 270)
|
||||||
' Implement --split-pages: Split pages in half after possible rotation
|
' Implement --split-pages: Split pages in half after possible rotation
|
||||||
' Implement --start-page: First page to ocr
|
' Implement --start-page: First page to ocr
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
def parse_arguments():
|
def parse_arguments():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
"Performs OCR of (historical) documents utilizing OCRopus for \
|
"Performs OCR of (historical) documents utilizing OCRopus for \
|
||||||
preprocessing and Tesseract OCR for OCR. Available outputs are HOCR, \
|
preprocessing and Tesseract OCR for OCR. Available outputs are HOCR, \
|
||||||
PDF, shrinked PDF, and simple DTAbf (TEI P5 compliant). Software \
|
PDF, shrinked PDF, and simple DTAbf (TEI P5 compliant). Software \
|
||||||
requirements: imagemagick, ocropus, pdftk, pdftoppm, poppler-utils, \
|
requirements: imagemagick, ocropus, pdftk, pdftoppm, poppler-utils, \
|
||||||
pyflow, python2.7, tesseract"
|
pyflow, python2.7, python3.5, tesseract"
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument("-i",
|
|
||||||
dest="inputDir",
|
|
||||||
help="Input directory.",
|
|
||||||
required=True)
|
|
||||||
parser.add_argument("-l",
|
parser.add_argument("-l",
|
||||||
dest='lang',
|
dest='lang',
|
||||||
help="Language for OCR",
|
help="Language for OCR",
|
||||||
required=True)
|
required=True)
|
||||||
parser.add_argument("-o",
|
|
||||||
dest="outputDir",
|
|
||||||
help="Output directory.",
|
|
||||||
required=True)
|
|
||||||
parser.add_argument("--skip-binarization",
|
parser.add_argument("--skip-binarization",
|
||||||
action='store_true',
|
action='store_true',
|
||||||
default=False,
|
default=False,
|
||||||
@ -67,14 +62,16 @@ def parse_arguments():
|
|||||||
|
|
||||||
|
|
||||||
class OCRWorkflow(WorkflowRunner):
|
class OCRWorkflow(WorkflowRunner):
|
||||||
def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores):
|
def __init__(self, args):
|
||||||
self.jobs = jobs
|
self.jobs = analyze_jobs()
|
||||||
self.skipBinarization = skipBinarization
|
self.skipBinarization = args.skipBinarization
|
||||||
self.keepIntermediates = keepIntermediates
|
self.keepIntermediates = args.keepIntermediates
|
||||||
self.lang = lang
|
self.lang = args.lang
|
||||||
self.nCores = nCores
|
self.nCores = args.nCores
|
||||||
self.defaultNCores = min(nCores, max(1, int(nCores / len(jobs))))
|
self.defaultNCores = min(
|
||||||
|
self.nCores,
|
||||||
|
max(1, int(self.nCores / len(self.jobs)))
|
||||||
|
)
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
###
|
###
|
||||||
@ -93,10 +90,17 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
)
|
)
|
||||||
if not self.skipBinarization:
|
if not self.skipBinarization:
|
||||||
cmd += ' "%s" "%s"' % (
|
cmd += ' "%s" "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp", "binarized_png"),
|
os.path.join(job["output_dir"], "tmp", "bin.png"),
|
||||||
os.path.join(job["output_dir"], "tmp", "normalized_png"),
|
os.path.join(job["output_dir"], "tmp", "nrm.png"),
|
||||||
|
)
|
||||||
|
create_output_directories_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
label="create_output_directories_job_-_%i" % (
|
||||||
|
create_output_directories_job_number
|
||||||
|
),
|
||||||
|
nCores=self.defaultNCores)
|
||||||
)
|
)
|
||||||
create_output_directories_jobs.append(self.addTask(label="create_output_directories_job_-_%i" % (create_output_directories_job_number), command=cmd, nCores=self.defaultNCores))
|
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "split_job": split input file into one tiff file per page
|
# Task "split_job": split input file into one tiff file per page
|
||||||
@ -116,7 +120,14 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
job["path"],
|
job["path"],
|
||||||
os.path.join(job["output_dir"], "tmp", "page")
|
os.path.join(job["output_dir"], "tmp", "page")
|
||||||
)
|
)
|
||||||
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=create_output_directories_jobs, nCores=self.defaultNCores))
|
split_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=create_output_directories_jobs,
|
||||||
|
label="split_job_-_%i" % (split_job_number),
|
||||||
|
nCores=self.defaultNCores
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "ocropus_nlbin_job": binarize tiff files from previous split
|
# Task "ocropus_nlbin_job": binarize tiff files from previous split
|
||||||
@ -132,12 +143,21 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
if not self.skipBinarization:
|
if not self.skipBinarization:
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
binarization_job_number += 1
|
binarization_job_number += 1
|
||||||
cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls "%s"/*.tif | sort -V)' % (
|
cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls --quoting-style=shell-escape -v "%s"/*.tif)' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
binarization_job_nCores,
|
binarization_job_nCores,
|
||||||
os.path.join(job["output_dir"], "tmp")
|
os.path.join(job["output_dir"], "tmp")
|
||||||
)
|
)
|
||||||
binarization_jobs.append(self.addTask(label="binarization_job_-_%i" % (binarization_job_number), command=cmd, dependencies=split_jobs, nCores=binarization_job_nCores))
|
binarization_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=split_jobs,
|
||||||
|
label="binarization_job_-_%i" % (
|
||||||
|
binarization_job_number
|
||||||
|
),
|
||||||
|
nCores=binarization_job_nCores
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "post_binarization_job": Normalize file names from binarization
|
# Task "post_binarization_job": Normalize file names from binarization
|
||||||
@ -152,9 +172,21 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
post_binarization_job_number += 1
|
post_binarization_job_number += 1
|
||||||
cmd = 'mv "%s" "%s"' % (
|
cmd = 'mv "%s" "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp", file),
|
os.path.join(job["output_dir"], "tmp", file),
|
||||||
os.path.join(job["output_dir"], "tmp", "page-%i.%s" % (int(file.split(".", 1)[0]), file.split(".", 1)[1])),
|
os.path.join(job["output_dir"], "tmp", "page-%i.%s" % (
|
||||||
|
int(file.split(".", 1)[0]),
|
||||||
|
file.split(".", 1)[1])
|
||||||
|
),
|
||||||
|
)
|
||||||
|
post_binarization_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=binarization_jobs,
|
||||||
|
label="post_binarization_job_-_%i" % (
|
||||||
|
post_binarization_job_number
|
||||||
|
),
|
||||||
|
nCores=self.defaultNCores
|
||||||
|
)
|
||||||
)
|
)
|
||||||
post_binarization_jobs.append(self.addTask(label="post_binarization_job_-_%i" % (post_binarization_job_number), command=cmd, dependencies=binarization_jobs, nCores=self.defaultNCores))
|
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "ocr_job": perform OCR
|
# Task "ocr_job": perform OCR
|
||||||
@ -165,8 +197,8 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
ocr_job_number = 0
|
ocr_job_number = 0
|
||||||
'''
|
'''
|
||||||
' Tesseract runs fastest with four cores. So we run it with either four
|
' Tesseract runs fastest with four cores. So we run it with either four
|
||||||
' or, if there are fewer than four cores available for this workflow, the
|
' or, if there are fewer than four cores available for this workflow,
|
||||||
' available core number.
|
' the available core number.
|
||||||
'''
|
'''
|
||||||
ocr_job_nCores = min(4, self.nCores)
|
ocr_job_nCores = min(4, self.nCores)
|
||||||
'''
|
'''
|
||||||
@ -183,7 +215,14 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(job["output_dir"], "tmp", file.rsplit(".", 1 if self.skipBinarization else 2)[0]),
|
os.path.join(job["output_dir"], "tmp", file.rsplit(".", 1 if self.skipBinarization else 2)[0]),
|
||||||
self.lang
|
self.lang
|
||||||
)
|
)
|
||||||
ocr_jobs.append(self.addTask(label="ocr_job_-_%i" % (ocr_job_number), command=cmd, dependencies=post_binarization_jobs, nCores=ocr_job_nCores))
|
ocr_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=post_binarization_jobs,
|
||||||
|
label="ocr_job_-_%i" % (ocr_job_number),
|
||||||
|
nCores=ocr_job_nCores
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "hocr_to_tei_job": create TEI P5 file from hocr files
|
# Task "hocr_to_tei_job": create TEI P5 file from hocr files
|
||||||
@ -197,7 +236,14 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".xml")
|
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".xml")
|
||||||
)
|
)
|
||||||
hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores))
|
hocr_to_tei_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=ocr_jobs,
|
||||||
|
label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number),
|
||||||
|
nCores=self.defaultNCores
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "pdf_merge_job": Merge PDF files
|
# Task "pdf_merge_job": Merge PDF files
|
||||||
@ -207,11 +253,18 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
pdf_merge_job_number = 0
|
pdf_merge_job_number = 0
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
pdf_merge_job_number += 1
|
pdf_merge_job_number += 1
|
||||||
cmd = 'pdftk $(ls "%s"/*.pdf | sort -V) cat output "%s"' % (
|
cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".pdf")
|
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".pdf")
|
||||||
)
|
)
|
||||||
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores))
|
pdf_merge_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=ocr_jobs,
|
||||||
|
label="pdf_merge_job_-_%i" % (pdf_merge_job_number),
|
||||||
|
nCores=self.defaultNCores
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "txt_merge_job": Merge .txt files
|
# Task "txt_merge_job": Merge .txt files
|
||||||
@ -221,11 +274,18 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
txt_merge_job_number = 0
|
txt_merge_job_number = 0
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
txt_merge_job_number += 1
|
txt_merge_job_number += 1
|
||||||
cmd = 'cat $(ls "%s"/*.txt | sort -V) > "%s"' % (
|
cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".txt")
|
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".txt")
|
||||||
)
|
)
|
||||||
txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores))
|
txt_merge_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=ocr_jobs,
|
||||||
|
label="txt_merge_job_-_%i" % (txt_merge_job_number),
|
||||||
|
nCores=self.defaultNCores
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "cleanup_job": remove temporary files
|
# Task "cleanup_job": remove temporary files
|
||||||
@ -236,35 +296,59 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
if self.keepIntermediates:
|
if self.keepIntermediates:
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
cleanup_job_counter += 1
|
cleanup_job_counter += 1
|
||||||
cmd = 'mv "%s"/*.hocr "%s" && mv "%s"/*.pdf "%s" && mv "%s"/*.tif "%s" && mv "%s"/*.txt "%s"' % (
|
cmd = 'mv "%s"/*.hocr "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], "tmp", "hocr"),
|
os.path.join(job["output_dir"], "tmp", "hocr"),
|
||||||
|
)
|
||||||
|
cmd += ' && mv "%s"/*.pdf "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], "tmp", "pdf"),
|
os.path.join(job["output_dir"], "tmp", "pdf"),
|
||||||
|
)
|
||||||
|
cmd += ' && mv "%s"/*.tif "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff"),
|
os.path.join(job["output_dir"], "tmp", "tiff"),
|
||||||
|
)
|
||||||
|
cmd += ' && mv "%s"/*.txt "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], "tmp", "txt")
|
os.path.join(job["output_dir"], "tmp", "txt"),
|
||||||
)
|
)
|
||||||
if not self.skipBinarization:
|
if not self.skipBinarization:
|
||||||
cmd += ' && mv "%s"/*.bin.png "%s" && mv "%s"/*.nrm.png "%s"' % (
|
cmd += ' && mv "%s"/*.bin.png "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
os.path.join(job["output_dir"], "tmp", "binarized_png"),
|
os.path.join(job["output_dir"], "tmp", "bin.png"),
|
||||||
os.path.join(job["output_dir"], "tmp"),
|
)
|
||||||
os.path.join(job["output_dir"], "tmp", "normalized_png"),
|
cmd += ' && mv "%s"/*.nrm.png "%s"' % (
|
||||||
|
os.path.join(job["output_dir"], "tmp"),
|
||||||
|
os.path.join(job["output_dir"], "tmp", "nrm.png"),
|
||||||
|
)
|
||||||
|
cleanup_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs,
|
||||||
|
label="cleanup_job_-_%i" % (cleanup_job_counter),
|
||||||
|
nCores=self.defaultNCores
|
||||||
|
)
|
||||||
)
|
)
|
||||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs, nCores=self.defaultNCores))
|
|
||||||
else:
|
else:
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
cleanup_job_counter += 1
|
cleanup_job_counter += 1
|
||||||
cmd = 'rm -r "%s"' % (
|
cmd = 'rm -r "%s"' % (
|
||||||
os.path.join(job["output_dir"], "tmp")
|
os.path.join(job["output_dir"], "tmp")
|
||||||
)
|
)
|
||||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs), nCores=self.defaultNCores)
|
cleanup_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
label="cleanup_job_-_%i" % (cleanup_job_counter),
|
||||||
|
command=cmd,
|
||||||
|
dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs,
|
||||||
|
nCores=self.defaultNCores
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def analyze_jobs(inputDir, outputDir):
|
def analyze_jobs():
|
||||||
|
inputDir = "/files_for_ocr"
|
||||||
jobs = []
|
jobs = []
|
||||||
|
outputDir = "/files_from_ocr"
|
||||||
|
|
||||||
for file in os.listdir(inputDir):
|
for file in os.listdir(inputDir):
|
||||||
if os.path.isdir(os.path.join(inputDir, file)):
|
if os.path.isdir(os.path.join(inputDir, file)):
|
||||||
@ -273,7 +357,13 @@ def analyze_jobs(inputDir, outputDir):
|
|||||||
os.path.join(outputDir, file)
|
os.path.join(outputDir, file)
|
||||||
)
|
)
|
||||||
elif file.endswith((".pdf", ".tif", ".tiff")):
|
elif file.endswith((".pdf", ".tif", ".tiff")):
|
||||||
jobs.append({"filename": file, "output_dir": os.path.join(outputDir, file), "path": os.path.join(inputDir, file)})
|
jobs.append(
|
||||||
|
{
|
||||||
|
"filename": file,
|
||||||
|
"output_dir": os.path.join(outputDir, file),
|
||||||
|
"path": os.path.join(inputDir, file)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return jobs
|
return jobs
|
||||||
|
|
||||||
@ -281,15 +371,10 @@ def analyze_jobs(inputDir, outputDir):
|
|||||||
def main():
|
def main():
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
|
|
||||||
wflow = OCRWorkflow(
|
wflow = OCRWorkflow(args)
|
||||||
analyze_jobs(args.inputDir, args.outputDir),
|
|
||||||
args.skipBinarization,
|
retval = wflow.run(dataDirRoot="/files_from_ocr", nCores=args.nCores)
|
||||||
args.keepIntermediates,
|
|
||||||
args.lang,
|
|
||||||
args.nCores
|
|
||||||
)
|
|
||||||
|
|
||||||
retval = wflow.run(nCores=args.nCores)
|
|
||||||
sys.exit(retval)
|
sys.exit(retval)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user