From 5b7bc2a84003ccb899a24fe7f23347f8b96f3ed2 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Mon, 20 May 2019 11:28:51 +0200
Subject: [PATCH] Update

---
 Dockerfile |  16 +++---
 nlp        | 154 ++++++++++++++++++++++++++++++-----------------------
 spacy_nlp  |  62 +++++++++++----------
 3 files changed, 128 insertions(+), 104 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index e198715..b4ef535 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,8 @@
 FROM debian:stretch-slim
 
-MAINTAINER Patrick Jentsch
+LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
 
+ENV DEBIAN_FRONTEND=noninteractive
 ENV LANG=C.UTF-8
 
 RUN apt-get update && \
@@ -9,22 +10,20 @@ RUN apt-get update && \
     build-essential \
     ca-certificates \
     python2.7 \
-    python3 \
+    python3.5 \
     python3-dev \
     python3-pip \
    python3-setuptools \
     wget
 
-WORKDIR /root
-
 # Install pyFlow
 ENV PYFLOW_VERSION 1.1.20
 RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
     tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
-    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
     cd pyflow-"$PYFLOW_VERSION" && \
     python2.7 setup.py build install && \
-    cd ..
+    cd .. && \
+    rm -r pyflow-"$PYFLOW_VERSION".tar.gz pyflow-"$PYFLOW_VERSION"
 
 # Install spaCy
 RUN pip3 install wheel && pip3 install -U spacy && \
@@ -34,9 +33,8 @@ RUN pip3 install wheel && pip3 install -U spacy && \
     python3 -m spacy download fr && \
     python3 -m spacy download pt
 
-RUN mkdir files_for_nlp files_from_nlp
-
 COPY nlp /usr/local/bin
 COPY spacy_nlp /usr/local/bin
 
-CMD ["/bin/bash"]
\ No newline at end of file
+ENTRYPOINT ["nlp"]
+CMD ["--help"]
diff --git a/nlp b/nlp
index 6e8996b..af92e18 100755
--- a/nlp
+++ b/nlp
@@ -18,84 +18,105 @@ from pyflow import WorkflowRunner
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        "Performs NLP of documents utilizing spaCy. \
-         Output is .vrt."
+        description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.'
     )
-    parser.add_argument("-i",
-                        dest="inputDir",
-                        help="Input directory.",
-                        required=True)
-    parser.add_argument("-l",
-                        dest='lang',
-                        help="Language for NLP",
-                        required=True)
-    parser.add_argument("-o",
-                        dest="outputDir",
-                        help="Output directory.",
-                        required=True)
-    parser.add_argument("--nCores",
-                        default=min(4, multiprocessing.cpu_count()),
-                        dest="nCores",
-                        help="Total number of cores available.",
-                        required=False,
-                        type=int)
+    parser.add_argument(
+        '-i',
+        dest='input_dir',
+        required=True
+    )
+    parser.add_argument(
+        '-l',
+        choices=['de', 'en', 'es', 'fr', 'pt'],
+        dest='lang',
+        required=True
+    )
+    parser.add_argument(
+        '-o',
+        dest='output_dir',
+        required=True
+    )
+    parser.add_argument(
+        '--nCores',
+        default=min(4, multiprocessing.cpu_count()),
+        dest='n_cores',
+        help='total number of cores available',
+        required=False,
+        type=int
+    )
     return parser.parse_args()
 
 
 class NLPWorkflow(WorkflowRunner):
-    def __init__(self, jobs, lang, nCores):
-        self.jobs = jobs
-        self.lang = lang
-        self.nCores = nCores
-
+    def __init__(self, args):
+        self.jobs = analyze_jobs(args.input_dir, args.output_dir)
+        self.lang = args.lang
+        self.n_cores = args.n_cores
 
     def workflow(self):
-        ###
-        # Task "mkdir_job": create output directories
-        # Dependencies: None
-        ###
-        mkdir_jobs = []
-        mkdir_job_number = 0
-        for job in self.jobs:
-            mkdir_job_number += 1
-            cmd = 'mkdir -p "%s"' % (
-                job["output_dir"]
-            )
-            mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
+        if len(self.jobs) == 0:
+            return
 
-        ###
-        # Task "spacy_nlp_job": perform NLP
-        # Dependencies: mkdir_jobs
-        ###
-        self.waitForTasks()
+        '''
+        ' ##################################################
+        ' # Create output directories                      #
+        ' ##################################################
+        '''
+        create_output_directories_jobs = []
+        for index, job in enumerate(self.jobs):
+            cmd = 'mkdir -p "%s"' % (job['output_dir'])
+            create_output_directories_jobs.append(
+                self.addTask(
+                    command=cmd,
+                    label='create_output_directories_job_-_%i' % (index)
+                )
+            )
+
+        '''
+        ' ##################################################
+        ' # Natural language processing                    #
+        ' ##################################################
+        '''
         nlp_jobs = []
-        nlp_job_number = 0
-        for job in self.jobs:
-            nlp_job_number += 1
-            cmd = 'spacy_nlp -i "%s" -o "%s" -l "%s"' % (
-                job["path"],
-                os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".vrt"),
-                self.lang
+        nlp_job_n_cores = min(
+            self.n_cores,
+            max(1, int(self.n_cores / len(self.jobs)))
+        )
+        for index, job in enumerate(self.jobs):
+            cmd = 'spacy_nlp -l "%s" "%s" "%s"' % (
+                self.lang,
+                job['path'],
+                os.path.join(job['output_dir'], job['name'] + '.vrt')
+            )
+            nlp_jobs.append(
+                self.addTask(
+                    command=cmd,
+                    dependencies='create_output_directories_job_-_%i' % (index),
+                    label='nlp_job_-_%i' % (index),
+                    nCores=nlp_job_n_cores
+                )
             )
-            nlp_jobs.append(self.addTask(label="nlp_job_-_%i" % (nlp_job_number), command=cmd, dependencies=mkdir_jobs, nCores=min(4, self.nCores)))
 
 
-def analyze_jobs(inputDir, outputDir, level=1):
+def analyze_jobs(input_dir, output_dir):
     jobs = []
-    if level > 2:
-        return jobs
-
-    for file in os.listdir(inputDir):
-        if os.path.isdir(os.path.join(inputDir, file)):
+    for file in os.listdir(input_dir):
+        if os.path.isdir(os.path.join(input_dir, file)):
             jobs += analyze_jobs(
-                os.path.join(inputDir, file),
-                os.path.join(outputDir, file),
-                level + 1
+                os.path.join(input_dir, file),
+                os.path.join(output_dir, file),
             )
+        elif file.endswith('.txt'):
+            jobs.append(
+                {
+                    'filename': file,
+                    'name': file.rsplit('.', 1)[0],
+                    'output_dir': os.path.join(output_dir, file),
+                    'path': os.path.join(input_dir, file)
+                }
+            )
-        elif file.endswith(".txt"):
-            jobs.append({"path": os.path.join(inputDir, file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0])})
 
     return jobs
 
@@ -103,15 +124,12 @@ def analyze_jobs(inputDir, outputDir, level=1):
 
 def main():
     args = parse_arguments()
 
-    wflow = NLPWorkflow(
-        analyze_jobs(args.inputDir, args.outputDir),
-        args.lang,
-        args.nCores
-    )
+    wflow = NLPWorkflow(args)
+
+    retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
 
-    retval = wflow.run(nCores=args.nCores)
     sys.exit(retval)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     main()
diff --git a/spacy_nlp b/spacy_nlp
index 6d895a5..e01bb05 100755
--- a/spacy_nlp
+++ b/spacy_nlp
@@ -1,48 +1,53 @@
 #!/usr/bin/env python3
 # coding=utf-8
-
 import argparse
 import os
 import spacy
 import textwrap
-
-parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and \
-    save it in .vrt format")
-parser.add_argument("-i",
-                    dest="input",
-                    help="Input file.",
-                    required=True)
-parser.add_argument("-l",
-                    choices=["de", "en", "es", "fr", "pt"],
-                    dest="lang",
-                    help="Language for tagging",
-                    required=True)
-parser.add_argument("-o",
-                    dest="output",
-                    help="Output file.",
-                    required=True)
+parser = argparse.ArgumentParser(
+    description='Tag a text file with spaCy and save it as a verticalized text file.'
+)
+parser.add_argument(
+    'i',
+    metavar='txt-sourcefile',
+)
+parser.add_argument(
+    '-l',
+    choices=['de', 'en', 'es', 'fr', 'pt'],
+    dest='lang',
+    required=True
+)
+parser.add_argument(
+    'o',
+    metavar='vrt-destfile',
+)
 args = parser.parse_args()
-
-SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
-                "es": "es_core_news_sm", "fr": "fr_core_news_sm",
-                "pt": "pt_core_news_sm"}
+SPACY_MODELS = {
+    'de': 'de_core_news_sm', 'en': 'en_core_web_sm', 'es': 'es_core_news_sm',
+    'fr': 'fr_core_news_sm', 'pt': 'pt_core_news_sm'
+}
 
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])
 
 # Read text from the input file and if necessary split it into parts with a
 # length of less than 1 million characters.
-with open(args.input) as input_file:
+with open(args.i) as input_file:
     text = input_file.read()
     texts = textwrap.wrap(text, 1000000, break_long_words=False)
     text = None
 
 # Create and open the output file
-output_file = open(args.output, "w+")
-output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text>\n')
+output_file = open(args.o, 'w+')
+
+output_file.write(
+    '<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % (
+        os.path.basename(args.i).rsplit(".", 1)[0]
+    )
+)
 for text in texts:
     # Run spacy nlp over the text (partial string if above 1 million chars)
     doc = nlp(text)
@@ -54,9 +59,12 @@ for text in texts:
             continue
         # Write all information in .vrt style to the output file
         # text, lemma, simple_pos, pos, ner
-        output_file.write(token.text + "\t" + token.lemma_ + "\t"
-                          + token.pos_ + "\t" + token.tag_ + "\t"
-                          + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
+        output_file.write(
+            token.text + '\t' + token.lemma_ + '\t'
+            + token.pos_ + '\t' + token.tag_ + '\t'
+            + (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n'
+        )
     output_file.write('</s>\n')
 output_file.write('</text>\n</corpus>')
+
 output_file.close()
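
Usage sketch: with the new ENTRYPOINT ["nlp"] and default CMD ["--help"], the container now runs the nlp wrapper directly and prints its usage text when invoked without arguments; any arguments given to docker run are passed straight to nlp. A minimal invocation, assuming the image is tagged "nlp" and data is mounted at /input and /output (the tag and mount paths are illustrative placeholders, not taken from the patch):

    # Build the image from the updated Dockerfile
    docker build -t nlp .

    # No arguments: the default CMD prints the wrapper's --help text
    docker run --rm nlp

    # Tag all .txt files under ./input as English; .vrt results land in ./output
    docker run --rm \
        -v "$PWD/input:/input" \
        -v "$PWD/output:/output" \
        nlp -i /input -l en -o /output --nCores 2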
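
Output format: per the comment in spacy_nlp ("text, lemma, simple_pos, pos, ner"), each token line in the generated .vrt file carries five tab-separated columns: token.text, token.lemma_, token.pos_, token.tag_ and token.ent_type_ (or NULL for tokens without an entity type). An illustrative line for the English model (values invented to make the column order concrete, not actual output):

    Bielefeld	Bielefeld	PROPN	NNP	GPE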