Update

2026-08-02 07:23:32 +00:00 · 2019-05-20 11:28:51 +02:00
parent ed26d24776
commit 5b7bc2a840
3 changed files with 128 additions and 104 deletions
@@ -1,7 +1,8 @@
 FROM debian:stretch-slim
-MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de>
+LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
 ENV DEBIAN_FRONTEND=noninteractive
 ENV LANG=C.UTF-8
 RUN apt-get update && \
@@ -9,22 +10,20 @@ RUN apt-get update && \
    build-essential \
    ca-certificates \
    python2.7 \
-    python3 \
+    python3.5 \
    python3-dev \
    python3-pip \
    python3-setuptools \
    wget
 WORKDIR /root
 # Install pyFlow
 ENV PYFLOW_VERSION 1.1.20
 RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install && \
-    cd ..
+    cd .. && \
    rm -r pyflow-"$PYFLOW_VERSION".tar.gz pyflow-"$PYFLOW_VERSION"
 # Install spaCy
 RUN pip3 install wheel && pip3 install -U spacy && \
@@ -34,9 +33,8 @@ RUN pip3 install wheel && pip3 install -U spacy && \
    python3 -m spacy download fr && \
    python3 -m spacy download pt
 RUN mkdir files_for_nlp files_from_nlp
 COPY nlp /usr/local/bin
 COPY spacy_nlp /usr/local/bin
-CMD ["/bin/bash"]
+ENTRYPOINT ["nlp"]
 CMD ["--help"]
@@ -18,84 +18,105 @@ from pyflow import WorkflowRunner
 def parse_arguments():
    parser = argparse.ArgumentParser(
-        "Performs NLP of documents utilizing spaCy. \
+        description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.'
        Output is .vrt."
    )
-    parser.add_argument("-i",
+    parser.add_argument(
-                        dest="inputDir",
+        '-i',
-                        help="Input directory.",
+        dest='input_dir',
-                        required=True)
+        required=True
-    parser.add_argument("-l",
+    )
    parser.add_argument(
        '-l',
        choices=['de', 'en', 'es', 'fr', 'pt'],
        dest='lang',
-                        help="Language for NLP",
+        required=True
-                        required=True)
+    )
-    parser.add_argument("-o",
+    parser.add_argument(
-                        dest="outputDir",
+        '-o',
-                        help="Output directory.",
+        dest='output_dir',
-                        required=True)
+        required=True
-    parser.add_argument("--nCores",
+    )
    parser.add_argument(
        '--nCores',
        default=min(4, multiprocessing.cpu_count()),
-                        dest="nCores",
+        dest='n_cores',
-                        help="Total number of cores available.",
+        help='total number of cores available',
        required=False,
-                        type=int)
+        type=int
    )
    return parser.parse_args()
 class NLPWorkflow(WorkflowRunner):
-    def __init__(self, jobs, lang, nCores):
+    def __init__(self, args):
-        self.jobs = jobs
+        self.jobs = analyze_jobs(args.input_dir, args.output_dir)
-        self.lang = lang
+        self.lang = args.lang
-        self.nCores = nCores
+        self.n_cores = args.n_cores
    def workflow(self):
-        ###
+        if len(self.jobs) == 0:
-        # Task "mkdir_job": create output directories
+            return
        # Dependencies: None
        ###
        mkdir_jobs = []
        mkdir_job_number = 0
        for job in self.jobs:
            mkdir_job_number += 1
            cmd = 'mkdir -p "%s"' % (
                job["output_dir"]
            )
            mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
-        ###
+        '''
-        # Task "spacy_nlp_job": perform NLP
+        ' ##################################################
-        # Dependencies: mkdir_jobs
+        ' # Create output directories                      #
-        ###
+        ' ##################################################
-        self.waitForTasks()
+        '''
        create_output_directories_jobs = []
        for index, job in enumerate(self.jobs):
            cmd = 'mkdir -p "%s"' % (job['output_dir'])
            create_output_directories_jobs.append(
                self.addTask(
                    command=cmd,
                    label='create_output_directories_job_-_%i' % (index)
                )
            )
        '''
        ' ##################################################
        ' # Natural language processing                    #
        ' ##################################################
        '''
        nlp_jobs = []
-        nlp_job_number = 0
+        nlp_job_n_cores = min(
-        for job in self.jobs:
+            self.n_cores,
-            nlp_job_number += 1
+            max(1, int(self.n_cores / len(self.jobs)))
-            cmd = 'spacy_nlp -i "%s" -o "%s" -l "%s"' % (
+        )
-                job["path"],
+        for index, job in enumerate(self.jobs):
-                os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".vrt"),
+            cmd = 'spacy_nlp -l "%s" "%s" "%s"' % (
-                self.lang
+                self.lang,
                job['path'],
                os.path.join(job['output_dir'], job['name'] + '.vrt')
            )
            nlp_jobs.append(
                self.addTask(
                    command=cmd,
                    dependencies='create_output_directories_job_-_%i' % (index),
                    label='nlp_job_-_%i' % (index),
                    nCores=nlp_job_n_cores
                )
            )
            nlp_jobs.append(self.addTask(label="nlp_job_-_%i" % (nlp_job_number), command=cmd, dependencies=mkdir_jobs, nCores=min(4, self.nCores)))
-def analyze_jobs(inputDir, outputDir, level=1):
+def analyze_jobs(input_dir, output_dir):
    jobs = []
-    if level > 2:
+    for file in os.listdir(input_dir):
-        return jobs
+        if os.path.isdir(os.path.join(input_dir, file)):
    for file in os.listdir(inputDir):
        if os.path.isdir(os.path.join(inputDir, file)):
            jobs += analyze_jobs(
-                os.path.join(inputDir, file),
+                os.path.join(input_dir, file),
-                os.path.join(outputDir, file),
+                os.path.join(output_dir, file),
-                level + 1
+            )
        elif file.endswith('.txt'):
            jobs.append(
                {
                    'filename': file,
                    'name': file.rsplit('.', 1)[0],
                    'output_dir': os.path.join(output_dir, file),
                    'path': os.path.join(input_dir, file)
                }
            )
        elif file.endswith(".txt"):
            jobs.append({"path": os.path.join(inputDir, file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0])})
    return jobs
@@ -103,15 +124,12 @@ def analyze_jobs(inputDir, outputDir, level=1):
 def main():
    args = parse_arguments()
-    wflow = NLPWorkflow(
+    wflow = NLPWorkflow(args)
-        analyze_jobs(args.inputDir, args.outputDir),
+
-        args.lang,
+    retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
        args.nCores
    )
    retval = wflow.run(nCores=args.nCores)
    sys.exit(retval)
-if __name__ == "__main__":
+if __name__ == '__main__':
    main()
@@ -1,48 +1,53 @@
 #!/usr/bin/env python3
 # coding=utf-8
 import argparse
 import os
 import spacy
 import textwrap
-
+parser = argparse.ArgumentParser(
-parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and \
+    description='Tag a text file with spaCy and save it as a verticalized text file.'
-                                              save it in .vrt format")
+)
-parser.add_argument("-i",
+parser.add_argument(
-                    dest="input",
+    'i',
-                    help="Input file.",
+    metavar='txt-sourcefile',
-                    required=True)
+)
-parser.add_argument("-l",
+parser.add_argument(
-                    choices=["de", "en", "es", "fr", "pt"],
+    '-l',
-                    dest="lang",
+    choices=['de', 'en', 'es', 'fr', 'pt'],
-                    help="Language for tagging",
+    dest='lang',
-                    required=True)
+    required=True
-parser.add_argument("-o",
+)
-                    dest="output",
+parser.add_argument(
-                    help="Output file.",
+    'o',
-                    required=True)
+    metavar='vrt-destfile',
 )
 args = parser.parse_args()
-
+SPACY_MODELS = {
-SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
+    'de': 'de_core_news_sm', 'en': 'en_core_web_sm', 'es': 'es_core_news_sm',
-                "es": "es_core_news_sm", "fr": "fr_core_news_sm",
+    'fr': 'fr_core_news_sm', 'pt': 'pt_core_news_sm'
-                "pt": "pt_core_news_sm"}
+}
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])
 # Read text from the input file and if necessary split it into parts with a
 # length of less than 1 million characters.
-with open(args.input) as input_file:
+with open(args.i) as input_file:
    text = input_file.read()
    texts = textwrap.wrap(text, 1000000, break_long_words=False)
    text = None
 # Create and open the output file
-output_file = open(args.output, "w+")
+output_file = open(args.o, 'w+')
-output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="' + os.path.basename(args.input).rsplit(".", 1)[0] + '">\n')
+
 output_file.write(
    '<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % (
        os.path.basename(args.i).rsplit(".", 1)[0]
    )
 )
 for text in texts:
    # Run spacy nlp over the text (partial string if above 1 million chars)
    doc = nlp(text)
@@ -54,9 +59,12 @@ for text in texts:
                continue
            # Write all information in .vrt style to the output file
            # text, lemma, simple_pos, pos, ner
-            output_file.write(token.text + "\t" + token.lemma_ + "\t"
+            output_file.write(
-                              + token.pos_ + "\t" + token.tag_ + "\t"
+                token.text + '\t' + token.lemma_ + '\t'
-                              + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
+                + token.pos_ + '\t' + token.tag_ + '\t'
                + (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n'
            )
        output_file.write('</s>\n')
 output_file.write('</text>\n</corpus>')
 output_file.close()