Add prototype

Patrick Jentsch
2019-06-03 14:57:09 +02:00
parent b8fa8f47ab
commit 86557443a2
43 changed files with 24638 additions and 53 deletions

vre_nlp_node/Dockerfile Normal file
@@ -0,0 +1,43 @@
FROM debian:stretch-slim
MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de>
ENV LANG=C.UTF-8
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
python2.7 \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
wget \
zip
WORKDIR /root
# Install pyFlow
ENV PYFLOW_VERSION 1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
rm pyflow-"$PYFLOW_VERSION".tar.gz && \
cd pyflow-"$PYFLOW_VERSION" && \
python2.7 setup.py build install && \
cd ..
# Install spaCy
RUN pip3 install wheel && pip3 install -U spacy && \
python3 -m spacy download de && \
python3 -m spacy download en && \
python3 -m spacy download es && \
python3 -m spacy download fr && \
python3 -m spacy download pt
RUN mkdir files_for_nlp files_from_nlp
COPY nlp /usr/local/bin
COPY spacy_nlp /usr/local/bin
CMD ["/bin/bash"]

vre_nlp_node/README.md Normal file
@@ -0,0 +1,38 @@
# vre_nlp_node
This repository provides a Dockerfile for building a Docker image for natural language processing (NLP). It takes text files as input and produces verticalized text (.vrt) files as output.
## Building the Docker image
The GitLab registry provides an automatically built Docker image that always contains the latest changes. The image can also be built locally by running the following command in a terminal.
```bash
docker build -t gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_nlp_node .
```
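To use the automatically built image from the registry instead of building it locally, it can be pulled directly (a sketch, assuming access to the GitLab registry):
```bash
docker pull gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_nlp_node
```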
## Usage
### Starting a Docker container
```bash
docker run \
--name <containername> \
-dit \
-v <datalocation>/files_for_nlp:/root/files_for_nlp \
-v <datalocation>/files_from_nlp:/root/files_from_nlp \
gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_nlp_node
```
### Copying data for NLP processing into the input directory
```bash
cp <textfile1> <textfile2> ... <textfilen> <datalocation>/files_for_nlp
```
### Starting the NLP run
```bash
docker exec -it <containername> nlp -i /root/files_for_nlp -o /root/files_from_nlp -l <languagecode>
```
Valid values for `<languagecode>` are:
* de (German)
* en (English)
* es (Spanish)
* fr (French)
* pt (Portuguese)

Once processing has finished, the processed data is available in the directory `<datalocation>/files_from_nlp`.
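Each input text file yields one `.vrt` file. A minimal sketch of what such a file looks like (the token lines are made up for illustration; the tab-separated columns are token, lemma, simple POS, POS tag, and named-entity type or `NULL`):
```xml
<?xml version="1.0" encoding="UTF-8"?>
<corpus>
<text id="example">
<s>
Das	der	DET	ART	NULL
ist	sein	AUX	VAFIN	NULL
ein	ein	DET	ART	NULL
Beispiel	Beispiel	NOUN	NN	NULL
.	.	PUNCT	$.	NULL
</s>
</text>
</corpus>
```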

vre_nlp_node/nlp Executable file
@@ -0,0 +1,131 @@
#!/usr/bin/env python2.7
# coding=utf-8
"""
nlp
Usage: For usage instructions run with option --help
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
"""
import argparse
import multiprocessing
import os
import sys
from pyflow import WorkflowRunner
def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Performs NLP of documents utilizing spaCy. "
                    "Output is .vrt."
    )
parser.add_argument("-i",
dest="inputDir",
help="Input directory.",
required=True)
parser.add_argument("-l",
dest='lang',
help="Language for NLP",
required=True)
parser.add_argument("-o",
dest="outputDir",
help="Output directory.",
required=True)
parser.add_argument("--nCores",
default=multiprocessing.cpu_count(),
dest="nCores",
help="Total number of cores available.",
required=False,
type=int)
return parser.parse_args()
class NLPWorkflow(WorkflowRunner):
def __init__(self, jobs, lang, nCores):
self.jobs = jobs
self.lang = lang
self.nCores = nCores
def workflow(self):
###
# Task "mkdir_job": create output directories
# Dependencies: None
###
mkdir_jobs = []
mkdir_job_number = 0
for job in self.jobs:
mkdir_job_number += 1
cmd = 'mkdir -p "%s"' % (
job["output_dir"]
)
mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
###
# Task "spacy_nlp_job": perform NLP
# Dependencies: mkdir_jobs
###
self.waitForTasks()
nlp_jobs = []
nlp_job_number = 0
for job in self.jobs:
nlp_job_number += 1
cmd = 'spacy_nlp -i "%s" -o "%s" -l "%s"' % (
job["path"],
os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".vrt"),
self.lang
)
nlp_jobs.append(self.addTask(label="nlp_job_-_%i" % (nlp_job_number), command=cmd, dependencies=mkdir_jobs))
###
# Task "zip_job": compress output
# Dependencies: nlp_jobs
###
zip_jobs = []
zip_job_number = 0
for job in self.jobs:
zip_job_number += 1
            cmd = 'zip -jqr "%s" "%s"' % (
                job["output_dir"] + "_-_nlp",
                job["output_dir"]
            )
zip_jobs.append(self.addTask(label="zip_job_-_%i" % (zip_job_number), command=cmd, dependencies=nlp_jobs))
def analyze_jobs(inputDir, outputDir, level=1):
    # Gather one NLP job per ".txt" file, descending at most one subdirectory level
    jobs = []
if level > 2:
return jobs
for file in os.listdir(inputDir):
if os.path.isdir(os.path.join(inputDir, file)):
jobs += analyze_jobs(
os.path.join(inputDir, file),
os.path.join(outputDir, file),
level + 1
)
elif file.endswith(".txt"):
jobs.append({"path": os.path.join(inputDir, file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0])})
return jobs
def main():
args = parse_arguments()
wflow = NLPWorkflow(
analyze_jobs(args.inputDir, args.outputDir),
args.lang,
args.nCores
)
retval = wflow.run(nCores=args.nCores)
sys.exit(retval)
if __name__ == "__main__":
main()

vre_nlp_node/spacy_nlp Executable file
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# coding=utf-8
import argparse
import os
import spacy
parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and "
                                             "save it in .vrt format")
parser.add_argument("-i",
dest="input",
help="Input file.",
required=True)
parser.add_argument("-l",
choices=["de", "en", "es", "fr", "pt"],
dest="lang",
help="Language for tagging",
required=True)
parser.add_argument("-o",
dest="output",
help="Output file.",
required=True)
args = parser.parse_args()
SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
"es": "es_core_news_sm", "fr": "fr_core_news_sm",
"pt": "pt_core_news_sm"}
# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])
# Read text from the input file
with open(args.input) as input_file:
text = input_file.read()
# Run spacy nlp over the text
doc = nlp(text)
# Write the annotated tokens to the output file in .vrt format
with open(args.output, "w") as output_file:
    output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="'
                      + os.path.basename(args.input).rsplit(".", 1)[0] + '">\n')
    for sent in doc.sents:
        output_file.write('<s>\n')
        for token in sent:
            # Skip whitespace tokens like "\n" or "\t"
            if token.text.isspace():
                continue
            # Write all information in .vrt style to the output file:
            # text, lemma, simple_pos, pos, ner
            output_file.write(token.text + "\t" + token.lemma_ + "\t"
                              + token.pos_ + "\t" + token.tag_ + "\t"
                              + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
        output_file.write('</s>\n')
    output_file.write('</text>\n</corpus>\n')
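
For reference, a hypothetical direct invocation of `spacy_nlp` inside a running container (file names are placeholders; the `nlp` wrapper script normally issues such calls, one per input file):
```bash
spacy_nlp -i /root/files_for_nlp/example.txt -o /root/files_from_nlp/example.vrt -l de
```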