commit 2a0662bccc49544b3c1128cff82473a9179fd15a
Author: Patrick Jentsch
Date:   Wed Feb 6 16:58:17 2019 +0100

    Initial commit

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b9ab88d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,42 @@
+FROM debian:stretch
+
+MAINTAINER Patrick Jentsch
+
+ENV LANG=C.UTF-8
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        python2.7 \
+        python3 \
+        python3-dev \
+        python3-pip \
+        python3-setuptools \
+        wget
+
+WORKDIR /root
+
+# Install pyFlow
+ENV PYFLOW_VERSION 1.1.20
+RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
+    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
+    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
+    cd pyflow-"$PYFLOW_VERSION" && \
+    python2.7 setup.py build install && \
+    cd ..
+
+# Install spaCy and the models for all supported languages
+RUN pip3 install wheel && pip3 install -U spacy && \
+    python3 -m spacy download de && \
+    python3 -m spacy download en && \
+    python3 -m spacy download es && \
+    python3 -m spacy download fr && \
+    python3 -m spacy download pt
+
+RUN mkdir files_for_nlp files_from_nlp
+
+COPY nlp /usr/local/bin
+COPY spacy_nlp /usr/local/bin
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/nlp b/nlp
new file mode 100755
index 0000000..b7839da
--- /dev/null
+++ b/nlp
@@ -0,0 +1,117 @@
+#!/usr/bin/env python2.7
+# coding=utf-8
+
+"""
+nlp
+
+Usage: For usage instructions run with option --help
+Author: Patrick Jentsch
+"""
+
+
+import argparse
+import multiprocessing
+import os
+import sys
+from pyflow import WorkflowRunner
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Performs NLP of documents utilizing spaCy. "
+                    "Output is .vrt."
+    )
+
+    parser.add_argument("-i",
+                        dest="inputDir",
+                        help="Input directory.",
+                        required=True)
+    parser.add_argument("-l",
+                        dest="lang",
+                        help="Language for NLP.",
+                        required=True)
+    parser.add_argument("-o",
+                        dest="outputDir",
+                        help="Output directory.",
+                        required=True)
+    parser.add_argument("--nCores",
+                        default=multiprocessing.cpu_count(),
+                        dest="nCores",
+                        help="Total number of cores available.",
+                        required=False,
+                        type=int)
+    return parser.parse_args()
+
+
+class NLPWorkflow(WorkflowRunner):
+    def __init__(self, jobs, lang, nCores):
+        self.jobs = jobs
+        self.lang = lang
+        self.nCores = nCores
+
+    def workflow(self):
+        ###
+        # Task "mkdir_job": create output directories
+        # Dependencies: None
+        ###
+        mkdir_jobs = []
+        mkdir_job_number = 0
+        for job in self.jobs:
+            mkdir_job_number += 1
+            cmd = 'mkdir -p "%s"' % (job["output_dir"])
+            mkdir_jobs.append(
+                self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number),
+                             command=cmd)
+            )
+
+        ###
+        # Task "nlp_job": perform NLP
+        # Dependencies: mkdir_jobs
+        ###
+        self.waitForTasks()
+        nlp_jobs = []
+        nlp_job_number = 0
+        for job in self.jobs:
+            nlp_job_number += 1
+            cmd = 'spacy_nlp -i "%s" -o "%s" -l "%s"' % (
+                job["path"],
+                os.path.join(job["output_dir"],
+                             os.path.basename(job["path"]).rsplit(".", 1)[0]
+                             + ".vrt"),
+                self.lang
+            )
+            nlp_jobs.append(
+                self.addTask(label="nlp_job_-_%i" % (nlp_job_number),
+                             command=cmd,
+                             dependencies=mkdir_jobs)
+            )
+
+
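+# Collect NLP jobs: walk the input directory (at most two levels deep) and
+# create one job per .txt file, pairing the file's path with an output
+# directory named after the file.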
+def analyze_jobs(inputDir, outputDir, level=1):
+    jobs = []
+
+    # Do not descend more than two directory levels into the input directory
+    if level > 2:
+        return jobs
+
+    for file in os.listdir(inputDir):
+        if os.path.isdir(os.path.join(inputDir, file)):
+            jobs += analyze_jobs(
+                os.path.join(inputDir, file),
+                os.path.join(outputDir, file),
+                level + 1
+            )
+        elif file.endswith(".txt"):
+            jobs.append({
+                "path": os.path.join(inputDir, file),
+                "output_dir": os.path.join(outputDir,
+                                           file.rsplit(".", 1)[0])
+            })
+
+    return jobs
+
+
+def main():
+    args = parse_arguments()
+
+    wflow = NLPWorkflow(
+        analyze_jobs(args.inputDir, args.outputDir),
+        args.lang,
+        args.nCores
+    )
+
+    retval = wflow.run(nCores=args.nCores)
+    sys.exit(retval)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/spacy_nlp b/spacy_nlp
new file mode 100755
index 0000000..bd3921c
--- /dev/null
+++ b/spacy_nlp
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+
+import argparse
+import spacy
+
+
+parser = argparse.ArgumentParser(
+    description="Tag a .txt file with spaCy and save it in .vrt format."
+)
+parser.add_argument("-i",
+                    dest="input",
+                    help="Input file.",
+                    required=True)
+parser.add_argument("-l",
+                    choices=["de", "en", "es", "fr", "pt"],
+                    dest="lang",
+                    help="Language for tagging.",
+                    required=True)
+parser.add_argument("-o",
+                    dest="output",
+                    help="Output file.",
+                    required=True)
+args = parser.parse_args()
+
+
+SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
+                "es": "es_core_news_sm", "fr": "fr_core_news_sm",
+                "pt": "pt_core_news_sm"}
+
+
+# Load the spaCy model for the requested language
+nlp = spacy.load(SPACY_MODELS[args.lang])
+
+# Read text from the input file
+with open(args.input) as input_file:
+    text = input_file.read()
+
+# Run the spaCy NLP pipeline over the text
+doc = nlp(text)
+
+# Create and open the output file
+output_file = open(args.output, "w+")
+output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text>\n')
+for sent in doc.sents:
+    output_file.write('<s>\n')
+    for token in sent:
+        # Skip whitespace tokens like "\n" or "\t"
+        if token.text.isspace():
+            continue
+        # Write all information in .vrt style to the output file:
+        # text, lemma, simple_pos, pos, ner
+        output_file.write(token.text + "\t" + token.lemma_ + "\t"
+                          + token.pos_ + "\t" + token.tag_ + "\t"
+                          + (token.ent_type_ if token.ent_type_ != ""
+                             else "NULL") + "\n")
+    output_file.write('</s>\n')
+output_file.write('</text>\n</corpus>')
+output_file.close()
\ No newline at end of file
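
A minimal usage sketch, assuming the image is tagged "nlp" and input texts
live in a host directory "txt_in"; the tag, the host paths, and the choice of
English are illustrative assumptions, while files_for_nlp and files_from_nlp
are the directories the Dockerfile creates:

    # tag name and host paths below are assumptions, not part of the commit
    docker build -t nlp .
    docker run --rm \
        -v "$PWD/txt_in:/root/files_for_nlp" \
        -v "$PWD/vrt_out:/root/files_from_nlp" \
        nlp nlp -i files_for_nlp -l en -o files_from_nlp

For an input file files_for_nlp/doc.txt this would yield
files_from_nlp/doc/doc.vrt, with one token per line in the columns
text, lemma, simple_pos, pos, and ner.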