Add prototype

Patrick Jentsch
2019-06-03 14:57:09 +02:00
parent b8fa8f47ab
commit 86557443a2
43 changed files with 24638 additions and 53 deletions

vre_ocr_node/Dockerfile Normal file

@@ -0,0 +1,49 @@
FROM debian:stretch-slim

MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de>

ENV LANG=C.UTF-8

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        gnupg2 \
        pdftk \
        poppler-utils \
        python2.7 \
        python3 \
        wget \
        zip

WORKDIR /root

# Install pyFlow
ENV PYFLOW_VERSION 1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install && \
    cd ..

# Install Tesseract OCR and data files
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
    wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
    apt-get update && \
    apt-get install -y --no-install-recommends tesseract-ocr && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata

RUN mkdir files_for_ocr files_from_ocr

COPY hocrtotei /usr/local/bin
COPY ocr /usr/local/bin

CMD ["/bin/bash"]
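
A quick smoke test after building is to list the language models the image ships with (the image tag is the one used in the README; any local tag works):

```bash
docker run --rm gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_ocr_node tesseract --list-langs
```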

vre_ocr_node/README.md Normal file

@@ -0,0 +1,47 @@
# vre_ocr_node

This repository provides a Dockerfile for building a Docker image for automatic character recognition. It accepts PDF files as input and outputs PDF files, TEI-conformant XML files, and plain-text files.

## How it works

Input files pass through a processing routine implemented as a pipeline. The pipeline consists of three steps (a sketch of the underlying commands follows the list):

1. Each PDF file in the input directory is split into individual pages.
2. The files resulting from step 1 are processed by automatic text recognition (OCR).
3. The processed individual pages are merged back together.
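
Under the hood, each step maps onto a command-line tool (see the `ocr` script in this repository). A minimal sketch of one run for a single file, with illustrative paths (`input.pdf`, `tmp/`, `output/`) and the German model:

```bash
mkdir -p tmp/tiff_files tmp/tesseract output

# 1. Split the PDF into one 300 dpi TIFF per page.
pdftoppm input.pdf tmp/tiff_files/page -tiff -r 300 -tiffcompression lzw -cropbox

# 2. OCR every page image; the "hocr pdf txt" configs emit .hocr, .pdf and .txt per page.
for page in tmp/tiff_files/page-*.tif; do
    tesseract "$page" "tmp/tesseract/$(basename "$page" .tif)" -l deu hocr pdf txt
done

# 3. Merge the per-page outputs back into one file each.
pdftk tmp/tesseract/*.pdf cat output output/input.pdf
cat tmp/tesseract/*.txt > output/input.txt
hocrtotei tmp/tesseract output/input.xml
```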

## Building the Docker image

The GitLab registry provides an automatically built Docker image that always contains the latest changes. The image can also be built locally by running the following command in a terminal:
```bash
docker build -t gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_ocr_node .
```

## Usage

### Starting a Docker container
```bash
docker run \
--name <containername> \
-dit \
-v <datalocation>/files_for_ocr:/root/files_for_ocr \
-v <datalocation>/files_from_ocr:/root/files_from_ocr \
gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_ocr_node
```

### Copying data for text recognition into the input directory
```bash
cp <pdffile1> <pdffile2> ... <pdffilen> <datalocation>/files_for_ocr
```

### Starting text recognition
```bash
docker exec -it <containername> ocr -i /root/files_for_ocr -o /root/files_from_ocr -l <languagecode>
```
Valid values for `<languagecode>` are:

* deu (German)
* deu_frak (German Fraktur)
* eng (English)
* enm (Middle English)
* fra (French)
* frm (Middle French)
* por (Portuguese)
* spa (Spanish)

Once the process has finished, the processed data is available in the directory `<datalocation>/files_from_ocr`.
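
A complete round trip, assuming the hypothetical container name `my_ocr_container` and data directory `/data`:

```bash
docker run --name my_ocr_container -dit \
    -v /data/files_for_ocr:/root/files_for_ocr \
    -v /data/files_from_ocr:/root/files_from_ocr \
    gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_ocr_node
cp scan.pdf /data/files_for_ocr
docker exec -it my_ocr_container ocr -i /root/files_for_ocr -o /root/files_from_ocr -l deu
```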

vre_ocr_node/hocrtotei Executable file

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
# coding=utf-8

import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape
import os
import sys

# Usage: hocrtotei <input-dir> <output-file>
# Merges all .hocr files in <input-dir> (sorted by name) into a single TEI P5 document.
input_files = filter(lambda x: x.endswith(".hocr"), sorted(os.listdir(sys.argv[1])))
output_file = open(sys.argv[2], "w", encoding="utf-8")

# The hOCR input carries no document metadata, so the TEI header stays empty.
output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n' +
                  '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n' +
                  '  <teiHeader>\n' +
                  '    <fileDesc>\n' +
                  '      <titleStmt/>\n' +
                  '      <publicationStmt/>\n' +
                  '      <sourceDesc/>\n' +
                  '    </fileDesc>\n' +
                  '    <encodingDesc/>\n' +
                  '    <profileDesc/>\n' +
                  '  </teiHeader>\n' +
                  '  <text>\n' +
                  '    <body>\n')

for input_file in input_files:
    tree = ET.parse(os.path.join(sys.argv[1], input_file))
    # Page files are named like "page-1.hocr"; reuse the number for the page break.
    output_file.write('      <pb n="%s"/>\n' % (input_file.split(".")[0].split("-")[1]))
    for para in tree.findall(".//*[@class='ocr_par']"):
        output_file.write('      <p>\n')
        for line in para.findall(".//*[@class='ocr_line']"):
            first_word_in_line = True
            for word in line.findall(".//*[@class='ocrx_word']"):
                if word.text is not None:
                    # Indent the first word of each line; separate the rest with spaces.
                    output_file.write(("        " if first_word_in_line else " ") + escape(word.text.strip()))
                    first_word_in_line = False
            if not first_word_in_line:
                output_file.write('<lb/>\n')
        output_file.write('      </p>\n')

output_file.write('    </body>\n' +
                  '  </text>\n' +
                  '</TEI>')
output_file.close()
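
The `ocr` pipeline below invokes this script with a directory of per-page `.hocr` files and an output path; a standalone call looks like this (paths illustrative):

```bash
hocrtotei /path/to/hocr-files out.xml
```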

vre_ocr_node/ocr Executable file

@@ -0,0 +1,214 @@
#!/usr/bin/env python2
# coding=utf-8

"""
ocr

Usage:  For usage instructions run with option --help
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
"""

import argparse
import multiprocessing
import os
import sys
from pyflow import WorkflowRunner


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Performs OCR of documents utilizing Tesseract OCR. "
                    "Outputs are .pdf and .txt."
    )

    parser.add_argument("-i",
                        dest="inputDir",
                        help="Input directory.",
                        required=True)
    parser.add_argument("-l",
                        dest="lang",
                        help="Language for OCR.",
                        required=True)
    parser.add_argument("-o",
                        dest="outputDir",
                        help="Output directory.",
                        required=True)
    parser.add_argument("--keep-intermediates",
                        action="store_true",
                        default=False,
                        dest="keepIntermediates",
                        help="Keep intermediate files.",
                        required=False)
    parser.add_argument("--nCores",
                        default=multiprocessing.cpu_count(),
                        dest="nCores",
                        help="Total number of cores available.",
                        required=False,
                        type=int)

    return parser.parse_args()


class OCRWorkflow(WorkflowRunner):
    def __init__(self, jobs, keepIntermediates, lang, nCores):
        self.jobs = jobs
        self.keepIntermediates = keepIntermediates
        self.lang = lang
        self.nCores = nCores

    def workflow(self):
        ###
        # Task "mkdir_job": create output directories
        # Dependencies: None
        ###
        mkdir_jobs = []
        mkdir_job_number = 0
        for job in self.jobs:
            mkdir_job_number += 1
            cmd = 'mkdir -p "%s" "%s" "%s"' % (
                job["output_dir"],
                os.path.join(job["output_dir"], "tmp", "tesseract"),
                os.path.join(job["output_dir"], "tmp", "tiff_files")
            )
            mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))

        ###
        # Task "split_job": split input file into one .tif file per page
        # Dependencies: mkdir_jobs
        ###
        split_jobs = []
        split_job_number = 0
        for job in self.jobs:
            split_job_number += 1
            cmd = 'pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox' % (
                job["path"],
                os.path.join(job["output_dir"], "tmp", "tiff_files", "page")
            )
            split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))

        ###
        # Task "tesseract_job": perform OCR
        # Dependencies: split_jobs
        ###
        # The tiff_files directories are populated by split_jobs, so wait for
        # those tasks to complete before listing the directories; otherwise
        # the file lists below are empty.
        self.waitForTasks()
        tesseract_jobs = []
        tesseract_job_number = 0
        for job in self.jobs:
            for file in os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files")):
                tesseract_job_number += 1
                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
                    os.path.join(job["output_dir"], "tmp", "tiff_files", file),
                    os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 1)[0]),
                    self.lang
                )
                tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=split_jobs, nCores=min(4, self.nCores)))

        ###
        # Task "hocr_to_tei_job": create a TEI P5 file from the hocr files
        # Dependencies: tesseract_jobs
        ###
        hocr_to_tei_jobs = []
        hocr_to_tei_job_number = 0
        for job in self.jobs:
            hocr_to_tei_job_number += 1
            cmd = 'hocrtotei "%s" "%s"' % (
                os.path.join(job["output_dir"], "tmp", "tesseract"),
                os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml")
            )
            hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=tesseract_jobs))

        ###
        # Task "pdf_merge_job": merge .pdf files
        # Dependencies: tesseract_jobs
        ###
        pdf_merge_jobs = []
        pdf_merge_job_number = 0
        for job in self.jobs:
            pdf_merge_job_number += 1
            cmd = 'pdftk "%s"/*.pdf cat output "%s"' % (
                os.path.join(job["output_dir"], "tmp", "tesseract"),
                os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf")
            )
            pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs))

        ###
        # Task "txt_merge_job": merge .txt files
        # Dependencies: tesseract_jobs
        ###
        txt_merge_jobs = []
        txt_merge_job_number = 0
        for job in self.jobs:
            txt_merge_job_number += 1
            cmd = 'cat "%s"/*.txt > "%s"' % (
                os.path.join(job["output_dir"], "tmp", "tesseract"),
                os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".txt")
            )
            txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=tesseract_jobs))

        ###
        # Task "cleanup_job": remove temporary files
        # Dependencies: hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs
        ###
        cleanup_jobs = []
        cleanup_job_counter = 0
        if not self.keepIntermediates:
            for job in self.jobs:
                cleanup_job_counter += 1
                cmd = 'rm -r "%s"' % (
                    os.path.join(job["output_dir"], "tmp")
                )
                cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs))

        ###
        # Task "zip_job": compress output
        # Dependencies: hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs + cleanup_jobs
        # (cleanup_jobs is empty with --keep-intermediates, so depend on the
        # merge tasks explicitly as well.)
        ###
        zip_jobs = []
        zip_job_number = 0
        for job in self.jobs:
            zip_job_number += 1
            cmd = 'zip -jqr "%s" "%s"' % (
                job["output_dir"] + "_-_ocr",
                job["output_dir"]
            )
            zip_jobs.append(self.addTask(label="zip_job_-_%i" % (zip_job_number), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs + cleanup_jobs))


def analyze_jobs(inputDir, outputDir, level=1):
    """Collect one job per .pdf file, descending at most one directory level."""
    jobs = []

    if level > 2:
        return jobs

    for file in os.listdir(inputDir):
        if os.path.isdir(os.path.join(inputDir, file)):
            jobs += analyze_jobs(
                os.path.join(inputDir, file),
                os.path.join(outputDir, file),
                level + 1
            )
        elif file.endswith(".pdf"):
            jobs.append({"path": os.path.join(inputDir, file),
                         "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0])})

    return jobs


def main():
    args = parse_arguments()

    wflow = OCRWorkflow(
        analyze_jobs(args.inputDir, args.outputDir),
        args.keepIntermediates,
        args.lang,
        args.nCores
    )

    retval = wflow.run(nCores=args.nCores)
    sys.exit(retval)


if __name__ == "__main__":
    main()
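
Inside the container the script is on the `PATH` (see the Dockerfile's `COPY ocr /usr/local/bin`). A standalone invocation, with illustrative paths and the optional flags shown:

```bash
ocr -i /root/files_for_ocr -o /root/files_from_ocr -l deu --nCores 4 --keep-intermediates
```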