Add prototype

Patrick Jentsch
2019-06-03 14:57:09 +02:00
parent b8fa8f47ab
commit 86557443a2
43 changed files with 24638 additions and 53 deletions

vre_nlp_node/Dockerfile Normal file
@@ -0,0 +1,43 @@
FROM debian:stretch-slim
MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de>
ENV LANG=C.UTF-8
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
python2.7 \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
wget \
zip
WORKDIR /root
# Install pyFlow
ENV PYFLOW_VERSION 1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
rm pyflow-"$PYFLOW_VERSION".tar.gz && \
cd pyflow-"$PYFLOW_VERSION" && \
python2.7 setup.py build install && \
cd ..
# Install spaCy
RUN pip3 install wheel && pip3 install -U spacy && \
python3 -m spacy download de && \
python3 -m spacy download en && \
python3 -m spacy download es && \
python3 -m spacy download fr && \
python3 -m spacy download pt
RUN mkdir files_for_nlp files_from_nlp
COPY nlp /usr/local/bin
COPY spacy_nlp /usr/local/bin
CMD ["/bin/bash"]

vre_nlp_node/README.md Normal file
@@ -0,0 +1,38 @@
# vre_nlp_node
This repository provides a Dockerfile for building a Docker image for natural language processing (NLP). It takes text files as input and produces verticalized text (.vrt) files as output.
## Building the Docker image
The GitLab registry provides an automatically built Docker image that always contains the latest changes. The image can also be built locally by running the following command in a terminal.
```bash
docker build -t gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_nlp_node .
```
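To use the automatically built image from the registry instead of building it locally, it can be pulled directly (a sketch, assuming access to the GitLab registry):
```bash
docker pull gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_nlp_node
```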
## Usage
### Starting a Docker container
```bash
docker run \
--name <containername> \
-dit \
-v <datalocation>/files_for_nlp:/root/files_for_nlp \
-v <datalocation>/files_from_nlp:/root/files_from_nlp \
gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_nlp_node
```
### Copying data for NLP processing into the input directory
```bash
cp <textfile1> <textfile2> ... <textfilen> <datalocation>/files_for_nlp
```
### Starting the NLP run
```bash
docker exec -it <containername> nlp -i /root/files_for_nlp -o /root/files_from_nlp -l <languagecode>
```
Valid values for `<languagecode>` are:
* de (German)
* en (English)
* es (Spanish)
* fr (French)
* pt (Portuguese)

Once processing has finished, the processed data is available in the directory `<datalocation>/files_from_nlp`.
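Each input text file yields one `.vrt` file. A minimal sketch of what such a file looks like (the token lines are made up for illustration; the tab-separated columns are token, lemma, simple POS, POS tag, and named-entity type or `NULL`):
```xml
<?xml version="1.0" encoding="UTF-8"?>
<corpus>
<text id="example">
<s>
Das	der	DET	ART	NULL
ist	sein	AUX	VAFIN	NULL
ein	ein	DET	ART	NULL
Beispiel	Beispiel	NOUN	NN	NULL
.	.	PUNCT	$.	NULL
</s>
</text>
</corpus>
```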

vre_nlp_node/nlp Executable file
@@ -0,0 +1,131 @@
#!/usr/bin/env python2.7
# coding=utf-8
"""
nlp
Usage: For usage instructions run with option --help
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
"""
import argparse
import multiprocessing
import os
import sys
from pyflow import WorkflowRunner
def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Performs NLP of documents utilizing spaCy. "
                    "Output is .vrt."
    )
parser.add_argument("-i",
dest="inputDir",
help="Input directory.",
required=True)
parser.add_argument("-l",
dest='lang',
help="Language for NLP",
required=True)
parser.add_argument("-o",
dest="outputDir",
help="Output directory.",
required=True)
parser.add_argument("--nCores",
default=multiprocessing.cpu_count(),
dest="nCores",
help="Total number of cores available.",
required=False,
type=int)
return parser.parse_args()
class NLPWorkflow(WorkflowRunner):
def __init__(self, jobs, lang, nCores):
self.jobs = jobs
self.lang = lang
self.nCores = nCores
def workflow(self):
###
# Task "mkdir_job": create output directories
# Dependencies: None
###
mkdir_jobs = []
mkdir_job_number = 0
for job in self.jobs:
mkdir_job_number += 1
cmd = 'mkdir -p "%s"' % (
job["output_dir"]
)
mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
###
# Task "spacy_nlp_job": perform NLP
# Dependencies: mkdir_jobs
###
self.waitForTasks()
nlp_jobs = []
nlp_job_number = 0
for job in self.jobs:
nlp_job_number += 1
cmd = 'spacy_nlp -i "%s" -o "%s" -l "%s"' % (
job["path"],
os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".vrt"),
self.lang
)
nlp_jobs.append(self.addTask(label="nlp_job_-_%i" % (nlp_job_number), command=cmd, dependencies=mkdir_jobs))
###
# Task "zip_job": compress output
# Dependencies: nlp_jobs
###
zip_jobs = []
zip_job_number = 0
for job in self.jobs:
zip_job_number += 1
            cmd = 'zip -jqr "%s" "%s"' % (
                job["output_dir"] + "_-_nlp",
                job["output_dir"]
            )
zip_jobs.append(self.addTask(label="zip_job_-_%i" % (zip_job_number), command=cmd, dependencies=nlp_jobs))
def analyze_jobs(inputDir, outputDir, level=1):
    # Gather one NLP job per ".txt" file, descending at most one subdirectory level
    jobs = []
if level > 2:
return jobs
for file in os.listdir(inputDir):
if os.path.isdir(os.path.join(inputDir, file)):
jobs += analyze_jobs(
os.path.join(inputDir, file),
os.path.join(outputDir, file),
level + 1
)
elif file.endswith(".txt"):
jobs.append({"path": os.path.join(inputDir, file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0])})
return jobs
def main():
args = parse_arguments()
wflow = NLPWorkflow(
analyze_jobs(args.inputDir, args.outputDir),
args.lang,
args.nCores
)
retval = wflow.run(nCores=args.nCores)
sys.exit(retval)
if __name__ == "__main__":
main()

vre_nlp_node/spacy_nlp Executable file
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# coding=utf-8
import argparse
import os
import spacy
parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and "
                                             "save it in .vrt format")
parser.add_argument("-i",
dest="input",
help="Input file.",
required=True)
parser.add_argument("-l",
choices=["de", "en", "es", "fr", "pt"],
dest="lang",
help="Language for tagging",
required=True)
parser.add_argument("-o",
dest="output",
help="Output file.",
required=True)
args = parser.parse_args()
SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
"es": "es_core_news_sm", "fr": "fr_core_news_sm",
"pt": "pt_core_news_sm"}
# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])
# Read text from the input file
with open(args.input) as input_file:
text = input_file.read()
# Run spacy nlp over the text
doc = nlp(text)
# Write the annotated tokens to the output file in .vrt format
with open(args.output, "w") as output_file:
    output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="'
                      + os.path.basename(args.input).rsplit(".", 1)[0] + '">\n')
    for sent in doc.sents:
        output_file.write('<s>\n')
        for token in sent:
            # Skip whitespace tokens like "\n" or "\t"
            if token.text.isspace():
                continue
            # Write all information in .vrt style to the output file:
            # text, lemma, simple_pos, pos, ner
            output_file.write(token.text + "\t" + token.lemma_ + "\t"
                              + token.pos_ + "\t" + token.tag_ + "\t"
                              + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
        output_file.write('</s>\n')
    output_file.write('</text>\n</corpus>\n')
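
For reference, a hypothetical direct invocation of `spacy_nlp` inside a running container (file names are placeholders; the `nlp` wrapper script normally issues such calls, one per input file):
```bash
spacy_nlp -i /root/files_for_nlp/example.txt -o /root/files_from_nlp/example.vrt -l de
```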