diff --git a/Dockerfile b/Dockerfile
index e198715..b4ef535 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,8 @@
FROM debian:stretch-slim
-MAINTAINER Patrick Jentsch
+LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
+ENV DEBIAN_FRONTEND=noninteractive
ENV LANG=C.UTF-8
RUN apt-get update && \
@@ -9,22 +10,20 @@ RUN apt-get update && \
build-essential \
ca-certificates \
python2.7 \
- python3 \
+ python3.5 \
python3-dev \
python3-pip \
python3-setuptools \
wget
-WORKDIR /root
-
# Install pyFlow
ENV PYFLOW_VERSION 1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
- rm pyflow-"$PYFLOW_VERSION".tar.gz && \
cd pyflow-"$PYFLOW_VERSION" && \
python2.7 setup.py build install && \
- cd ..
+ cd .. && \
+ rm -r pyflow-"$PYFLOW_VERSION".tar.gz pyflow-"$PYFLOW_VERSION"
# Install spaCy
RUN pip3 install wheel && pip3 install -U spacy && \
@@ -34,9 +33,8 @@ RUN pip3 install wheel && pip3 install -U spacy && \
python3 -m spacy download fr && \
python3 -m spacy download pt
-RUN mkdir files_for_nlp files_from_nlp
-
COPY nlp /usr/local/bin
COPY spacy_nlp /usr/local/bin
-CMD ["/bin/bash"]
\ No newline at end of file
+ENTRYPOINT ["nlp"]
+CMD ["--help"]
diff --git a/nlp b/nlp
index 6e8996b..af92e18 100755
--- a/nlp
+++ b/nlp
@@ -18,84 +18,105 @@ from pyflow import WorkflowRunner
def parse_arguments():
parser = argparse.ArgumentParser(
- "Performs NLP of documents utilizing spaCy. \
- Output is .vrt."
+ description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.'
)
- parser.add_argument("-i",
- dest="inputDir",
- help="Input directory.",
- required=True)
- parser.add_argument("-l",
- dest='lang',
- help="Language for NLP",
- required=True)
- parser.add_argument("-o",
- dest="outputDir",
- help="Output directory.",
- required=True)
- parser.add_argument("--nCores",
- default=min(4, multiprocessing.cpu_count()),
- dest="nCores",
- help="Total number of cores available.",
- required=False,
- type=int)
+ parser.add_argument(
+ '-i',
+ dest='input_dir',
+ required=True
+ )
+ parser.add_argument(
+ '-l',
+ choices=['de', 'en', 'es', 'fr', 'pt'],
+ dest='lang',
+ required=True
+ )
+ parser.add_argument(
+ '-o',
+ dest='output_dir',
+ required=True
+ )
+ parser.add_argument(
+ '--nCores',
+ default=min(4, multiprocessing.cpu_count()),
+ dest='n_cores',
+ help='total number of cores available',
+ required=False,
+ type=int
+ )
return parser.parse_args()
class NLPWorkflow(WorkflowRunner):
- def __init__(self, jobs, lang, nCores):
- self.jobs = jobs
- self.lang = lang
- self.nCores = nCores
-
+ def __init__(self, args):
+ self.jobs = analyze_jobs(args.input_dir, args.output_dir)
+ self.lang = args.lang
+ self.n_cores = args.n_cores
def workflow(self):
- ###
- # Task "mkdir_job": create output directories
- # Dependencies: None
- ###
- mkdir_jobs = []
- mkdir_job_number = 0
- for job in self.jobs:
- mkdir_job_number += 1
- cmd = 'mkdir -p "%s"' % (
- job["output_dir"]
- )
- mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
+ if len(self.jobs) == 0:
+ return
- ###
- # Task "spacy_nlp_job": perform NLP
- # Dependencies: mkdir_jobs
- ###
- self.waitForTasks()
+ '''
+ ' ##################################################
+ ' # Create output directories #
+ ' ##################################################
+ '''
+ create_output_directories_jobs = []
+ for index, job in enumerate(self.jobs):
+ cmd = 'mkdir -p "%s"' % (job['output_dir'])
+ create_output_directories_jobs.append(
+ self.addTask(
+ command=cmd,
+ label='create_output_directories_job_-_%i' % (index)
+ )
+ )
+
+ '''
+ ' ##################################################
+ ' # Natural language processing #
+ ' ##################################################
+ '''
nlp_jobs = []
- nlp_job_number = 0
- for job in self.jobs:
- nlp_job_number += 1
- cmd = 'spacy_nlp -i "%s" -o "%s" -l "%s"' % (
- job["path"],
- os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".vrt"),
- self.lang
+ nlp_job_n_cores = min(
+ self.n_cores,
+ max(1, int(self.n_cores / len(self.jobs)))
+ )
+ for index, job in enumerate(self.jobs):
+ cmd = 'spacy_nlp -l "%s" "%s" "%s"' % (
+ self.lang,
+ job['path'],
+ os.path.join(job['output_dir'], job['name'] + '.vrt')
+ )
+ nlp_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies='create_output_directories_job_-_%i' % (index),
+ label='nlp_job_-_%i' % (index),
+ nCores=nlp_job_n_cores
+ )
)
- nlp_jobs.append(self.addTask(label="nlp_job_-_%i" % (nlp_job_number), command=cmd, dependencies=mkdir_jobs, nCores=min(4, self.nCores)))
-def analyze_jobs(inputDir, outputDir, level=1):
+def analyze_jobs(input_dir, output_dir):
jobs = []
- if level > 2:
- return jobs
-
- for file in os.listdir(inputDir):
- if os.path.isdir(os.path.join(inputDir, file)):
+ for file in os.listdir(input_dir):
+ if os.path.isdir(os.path.join(input_dir, file)):
jobs += analyze_jobs(
- os.path.join(inputDir, file),
- os.path.join(outputDir, file),
- level + 1
+ os.path.join(input_dir, file),
+ os.path.join(output_dir, file),
+ )
+ elif file.endswith('.txt'):
+ jobs.append(
+ {
+ 'filename': file,
+ 'name': file.rsplit('.', 1)[0],
+ 'output_dir': os.path.join(output_dir, file),
+ 'path': os.path.join(input_dir, file)
+ }
)
- elif file.endswith(".txt"):
- jobs.append({"path": os.path.join(inputDir, file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0])})
return jobs
@@ -103,15 +124,12 @@ def analyze_jobs(inputDir, outputDir, level=1):
def main():
args = parse_arguments()
- wflow = NLPWorkflow(
- analyze_jobs(args.inputDir, args.outputDir),
- args.lang,
- args.nCores
- )
+ wflow = NLPWorkflow(args)
+
+ retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
- retval = wflow.run(nCores=args.nCores)
sys.exit(retval)
-if __name__ == "__main__":
+if __name__ == '__main__':
main()
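
Reviewer note on the nlp wrapper changes: the per-task core budget is now min(n_cores, max(1, int(n_cores / len(jobs)))) instead of a flat min(4, nCores), and each nlp_job depends only on its own create_output_directories_job rather than on all mkdir tasks. Worked example (numbers illustrative): with --nCores 4 and 8 input files each task requests 1 core, so pyFlow can schedule 4 of them concurrently; with 2 input files each task requests 2 cores. A matching call, with placeholder paths:

    nlp -i /input -l en -o /output --nCores 4
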
diff --git a/spacy_nlp b/spacy_nlp
index 6d895a5..e01bb05 100755
--- a/spacy_nlp
+++ b/spacy_nlp
@@ -1,48 +1,53 @@
#!/usr/bin/env python3
# coding=utf-8
-
import argparse
import os
import spacy
import textwrap
-
-parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and \
- save it in .vrt format")
-parser.add_argument("-i",
- dest="input",
- help="Input file.",
- required=True)
-parser.add_argument("-l",
- choices=["de", "en", "es", "fr", "pt"],
- dest="lang",
- help="Language for tagging",
- required=True)
-parser.add_argument("-o",
- dest="output",
- help="Output file.",
- required=True)
+parser = argparse.ArgumentParser(
+ description='Tag a text file with spaCy and save it as a verticalized text file.'
+)
+parser.add_argument(
+ 'i',
+ metavar='txt-sourcefile',
+)
+parser.add_argument(
+ '-l',
+ choices=['de', 'en', 'es', 'fr', 'pt'],
+ dest='lang',
+ required=True
+)
+parser.add_argument(
+ 'o',
+ metavar='vrt-destfile',
+)
args = parser.parse_args()
-
-SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
- "es": "es_core_news_sm", "fr": "fr_core_news_sm",
- "pt": "pt_core_news_sm"}
+SPACY_MODELS = {
+ 'de': 'de_core_news_sm', 'en': 'en_core_web_sm', 'es': 'es_core_news_sm',
+ 'fr': 'fr_core_news_sm', 'pt': 'pt_core_news_sm'
+}
# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])
# Read text from the input file and if necessary split it into parts with a
# length of less than 1 million characters.
-with open(args.input) as input_file:
+with open(args.i) as input_file:
text = input_file.read()
texts = textwrap.wrap(text, 1000000, break_long_words=False)
text = None
# Create and open the output file
-output_file = open(args.output, "w+")
-output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text>\n')
+output_file = open(args.o, 'w+')
+
+output_file.write(
+    '<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % (
+ os.path.basename(args.i).rsplit(".", 1)[0]
+ )
+)
for text in texts:
# Run spacy nlp over the text (partial string if above 1 million chars)
doc = nlp(text)
@@ -54,9 +59,12 @@ for text in texts:
continue
# Write all information in .vrt style to the output file
# text, lemma, simple_pos, pos, ner
- output_file.write(token.text + "\t" + token.lemma_ + "\t"
- + token.pos_ + "\t" + token.tag_ + "\t"
- + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
+ output_file.write(
+ token.text + '\t' + token.lemma_ + '\t'
+ + token.pos_ + '\t' + token.tag_ + '\t'
+ + (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n'
+ )
    output_file.write('</s>\n')
output_file.write('</text>\n</corpus>')
+
output_file.close()
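
Reviewer note on the spacy_nlp changes: input and output files are now positional arguments rather than -i/-o options, so a call looks like this (paths illustrative):

    spacy_nlp -l en corpus/doc.txt corpus/doc.vrt

For that call the script derives the <text> id from the input basename and emits verticalized text shaped roughly like the following, one token per line (text, lemma, simple POS, POS tag, NER type or NULL; the exact tags depend on the loaded model and are only a guess here):

    <?xml version="1.0" encoding="UTF-8"?>
    <corpus>
    <text id="doc">
    <s>
    Hello	hello	INTJ	UH	NULL
    world	world	NOUN	NN	NULL
    </s>
    </text>
    </corpus>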