From e061a7426dec836dc0463fd6fb5131d06708c315 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Fri, 3 Apr 2020 17:35:05 +0200 Subject: [PATCH] Update NLP Pipeline --- Dockerfile | 32 ++++----- nlp | 198 ++++++++++++++++++++++++---------------------------- spacy-nlp | 76 ++++++++++++++++++++ spacy_nlp | 83 ---------------------- wrapper/nlp | 40 +++++------ 5 files changed, 198 insertions(+), 231 deletions(-) create mode 100755 spacy-nlp delete mode 100755 spacy_nlp diff --git a/Dockerfile b/Dockerfile index 7602a94..800c24f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,10 @@ ENV LANG=C.UTF-8 # Install prerequisites RUN apt-get update \ && apt-get install -y --no-install-recommends \ + build-essential \ python2.7 \ python3.5 \ + python3-dev \ python3-pip \ zip \ && rm -rf /var/lib/apt/lists/* \ @@ -31,30 +33,24 @@ RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ "pyflow-${PYFLOW_VERSION}" \ "pyflow-${PYFLOW_VERSION}.tar.gz" -ENV SPACY_MODEL_DE=de_core_news_sm \ - SPACY_MODEL_EL=el_core_news_sm \ - SPACY_MODEL_EN=en_core_web_sm \ - SPACY_MODEL_ES=es_core_news_sm \ - SPACY_MODEL_FR=fr_core_news_sm \ - SPACY_MODEL_IT=it_core_news_sm \ - SPACY_MODEL_NL=nl_core_news_sm \ - SPACY_MODEL_PT=pt_core_news_sm \ - SPACY_VERSION=2.2.0 +ENV SPACY_VERSION=2.2.4 +ENV SPACY_MODELS_VERSION=2.2.5 + RUN pip3 install \ "spacy==${SPACY_VERSION}" \ - && python3 -m spacy download "${SPACY_MODEL_DE}-${SPACY_VERSION}" --direct \ - && python3 -m spacy download "${SPACY_MODEL_EL}-${SPACY_VERSION}" --direct \ - && python3 -m spacy download "${SPACY_MODEL_EN}-${SPACY_VERSION}" --direct \ - && python3 -m spacy download "${SPACY_MODEL_ES}-${SPACY_VERSION}" --direct \ - && python3 -m spacy download "${SPACY_MODEL_FR}-${SPACY_VERSION}" --direct \ - && python3 -m spacy download "${SPACY_MODEL_IT}-${SPACY_VERSION}" --direct \ - && python3 -m spacy download "${SPACY_MODEL_NL}-${SPACY_VERSION}" --direct \ - && python3 -m spacy download "${SPACY_MODEL_PT}-${SPACY_VERSION}" --direct + && python3 -m spacy download "de_core_news_sm-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "el_core_news_sm-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "en_core_web_sm-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "es_core_news_sm-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "fr_core_news_sm-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "it_core_news_sm-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "nl_core_news_sm-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "pt_core_news_sm-${SPACY_MODELS_VERSION}" --direct # Install NLP pipeline COPY nlp /usr/local/bin -COPY spacy_nlp /usr/local/bin +COPY spacy-nlp /usr/local/bin ENTRYPOINT ["nlp"] diff --git a/nlp b/nlp index 6458611..dc8c23c 100755 --- a/nlp +++ b/nlp @@ -9,147 +9,131 @@ Author: Patrick Jentsch """ -import argparse +from argparse import ArgumentParser +from pyflow import WorkflowRunner import multiprocessing import os import sys -from pyflow import WorkflowRunner -def parse_arguments(): - parser = argparse.ArgumentParser( - description=('Performs NLP of documents utilizing spaCy. 
The results ' - 'are served as verticalized text files.') - ) - parser.add_argument('-i', dest='input_dir', required=True) - parser.add_argument( - '-l', - choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'], - dest='lang', - required=True - ) - parser.add_argument('-o', dest='output_dir', required=True) - parser.add_argument('--nCores', +SPACY_MODELS = {'de': 'de_core_news_sm', + 'el': 'el_core_news_sm', + 'en': 'en_core_web_sm', + 'es': 'es_core_news_sm', + 'fr': 'fr_core_news_sm', + 'it': 'it_core_news_sm', + 'nl': 'nl_core_news_sm', + 'pt': 'pt_core_news_sm'} + + +def parse_args(): + parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.') + parser.add_argument('i') + parser.add_argument('o') + parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), + required=True) + parser.add_argument('--n-cores', default=min(4, multiprocessing.cpu_count()), - dest='n_cores', - help='total number of cores available', - required=False, - type=int) - parser.add_argument('--zip', - default='vrt-results', - dest='zip', - type=str, - help='''package result files in zip bundles takes a - string as a filename as an optional paramteer''', - required=False) - parser.add_argument('--check-encoding', - action='store_true', - default=False, - dest="check_encoding", - help='''if used the nlp process will know hat the - encoding of the input files is unkown and - thus != utf-8. The process will try to determine - the encoding of the input files and use this. - encoding.''' - ) + help='total number of cores available', type=int) + parser.add_argument('--check-encoding', action='store_true') + parser.add_argument('--log-dir') + parser.add_argument('--zip') return parser.parse_args() -class NLPWorkflow(WorkflowRunner): - def __init__(self, args): - self.jobs = analyze_jobs(args.input_dir, args.output_dir) - self.lang = args.lang - self.n_cores = args.n_cores - self.output_dir = args.output_dir - self.zip = args.zip - self.check_encoding = args.check_encoding +class NLPPipelineJob: + def __init__(self, file, output_dir): + self.file = file + self.name = os.path.basename(file).rsplit('.', 1)[0] + self.output_dir = output_dir + + +class NLPPipeline(WorkflowRunner): + def __init__(self, check_encoding, jobs, lang, n_cores, output_dir, zip): + self.check_encoding = check_encoding + self.jobs = jobs + self.lang = lang + self.n_cores = n_cores + self.output_dir = output_dir + self.zip = zip def workflow(self): - if len(self.jobs) == 0: + if not self.jobs: return ''' ' ################################################## - ' # Create output directories # + ' # mkdir_jobs # ' ################################################## ''' - create_output_directories_jobs = [] - for index, job in enumerate(self.jobs): - cmd = 'mkdir -p "{}"'.format(job['output_dir']) - create_output_directories_jobs.append( - self.addTask( - command=cmd, - label='create_output_directories_job_-_{}'.format(index) - ) - ) + mkdir_jobs = [] + for i, job in enumerate(self.jobs): + cmd = 'mkdir' + cmd += ' -p' + cmd += ' "{}"'.format(job.output_dir) + lbl = 'mkdir_job_-_{}'.format(i) + mkdir_jobs.append(self.addTask(command=cmd, label=lbl)) ''' ' ################################################## - ' # Natural language processing # + ' # spacy_nlp_jobs # ' ################################################## ''' - nlp_jobs = [] - nlp_job_n_cores = min( - self.n_cores, - max(1, int(self.n_cores / len(self.jobs))) - ) - for index, job in enumerate(self.jobs): - cmd = 'spacy_nlp -l "{}" "{}" "{}" {}'.format( - self.lang, - 
job['path'], - os.path.join(job['output_dir'], job['name'] + '.vrt'), - "--check-encoding" if self.check_encoding else "" - ) - nlp_jobs.append( - self.addTask( - command=cmd, - dependencies='create_output_directories_job_-_{}'.format( - index - ), - label='nlp_job_-_{}'.format(index), - nCores=nlp_job_n_cores - ) - ) + spacy_nlp_jobs = [] + n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) + for i, job in enumerate(self.jobs): + output_file = os.path.join(job.output_dir, + '{}.vrt'.format(job.name)) + cmd = 'spacy-nlp "{}" "{}"'.format(job.file, output_file) + cmd += ' -l "{}"'.format(self.lang) + cmd += ' --check-encoding' if self.check_encoding else '' + deps = 'mkdir_job_-_{}'.format(i) + lbl = 'spacy_nlp_job_-_{}'.format(i) + spacy_nlp_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl, nCores=n_cores)) - if zip: - vrt_zip_jobs = [] - vrt_zip_job_dependencies = nlp_jobs - cmd = 'cd "%s" && zip -m "%s"-nlp.zip */*.vrt -x "pyflow.data*" && cd -' % ( - self.output_dir, - self.zip - ) - vrt_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=vrt_zip_job_dependencies, - label='vrt_zip_job' - ) - ) + ''' + ' ################################################## + ' # zip_jobs # + ' ################################################## + ''' + zip_jobs = [] + if self.zip is not None: + cmd = 'cd "{}"'.format(self.output_dir) + cmd += ' && ' + cmd += 'zip' + cmd += ' -m' + cmd += ' -r' + cmd += ' "{}_-_vrt" .'.format(self.zip) + cmd += ' -x "pyflow.data*"' + cmd += ' -i "*.vrt"' + cmd += ' && ' + cmd += 'cd -' + deps = spacy_nlp_jobs + lbl = 'zip_job' + zip_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) -def analyze_jobs(input_dir, output_dir): +def collect_jobs(input_dir, output_dir): jobs = [] - for file in os.listdir(input_dir): if os.path.isdir(os.path.join(input_dir, file)): - jobs += analyze_jobs(os.path.join(input_dir, file), + jobs += collect_jobs(os.path.join(input_dir, file), os.path.join(output_dir, file)) elif file.endswith('.txt'): - jobs.append({'filename': file, - 'name': file.rsplit('.', 1)[0], - 'output_dir': os.path.join(output_dir, file), - 'path': os.path.join(input_dir, file)}) - + jobs.append(NLPPipelineJob(os.path.join(input_dir, file), + os.path.join(output_dir, file))) return jobs def main(): - args = parse_arguments() - - wflow = NLPWorkflow(args) - - retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores) - + args = parse_args() + jobs = collect_jobs(args.i, args.o) + nlp_pipeline = NLPPipeline(args.check_encoding, jobs, args.language, + args.n_cores, args.o, args.zip) + retval = nlp_pipeline.run(dataDirRoot=(args.log_dir or args.o), + nCores=args.n_cores) sys.exit(retval) diff --git a/spacy-nlp b/spacy-nlp new file mode 100755 index 0000000..a92d535 --- /dev/null +++ b/spacy-nlp @@ -0,0 +1,76 @@ +#!/usr/bin/env python3.5 +# coding=utf-8 + +from argparse import ArgumentParser +from xml.sax.saxutils import escape +import chardet +import spacy +import textwrap + + +SPACY_MODELS = {'de': 'de_core_news_sm', + 'el': 'el_core_news_sm', + 'en': 'en_core_web_sm', + 'es': 'es_core_news_sm', + 'fr': 'fr_core_news_sm', + 'it': 'it_core_news_sm', + 'nl': 'nl_core_news_sm', + 'pt': 'pt_core_news_sm'} + + +# Parse the given arguments +parser = ArgumentParser(description=('Tag a text file with spaCy and save it ' + 'as a verticalized text file.')) +parser.add_argument('i', metavar='txt-sourcefile') +parser.add_argument('o', metavar='vrt-destfile') +parser.add_argument('-l', '--language', 
choices=SPACY_MODELS.keys(), + required=True) +parser.add_argument('--check-encoding', action='store_true') +args = parser.parse_args() + + +# If requested: Check the encoding of the text contents from the input file +# Else: Use utf-8 +if args.check_encoding: + with open(args.i, "rb") as input_file: + bytes = input_file.read() + encoding = chardet.detect(bytes)['encoding'] +else: + encoding = 'utf-8' + + +# Load the text contents from the input file +with open(args.i, encoding=encoding) as input_file: + text = input_file.read() + # spaCys NLP is limited to strings with maximum 1 million characters at + # once. So we split it into suitable chunks. + text_chunks = textwrap.wrap(text, 1000000, break_long_words=False) + # the text variable potentially occupies a lot of system memory and is no + # longer needed... + del text + + +# Setup the spaCy toolkit by loading the chosen language model +nlp = spacy.load(SPACY_MODELS[args.language]) + + +# Create the output file in verticalized text format +# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html +output_file = open(args.o, 'w+') +output_file.write('\n\n\n') +for text_chunk in text_chunks: + doc = nlp(text_chunk) + for sent in doc.sents: + output_file.write('\n') + for token in sent: + # Skip whitespace tokens + if token.text.isspace(): + continue + output_file.write('{}'.format(escape(token.text)) + + '\t{}'.format(escape(token.lemma_)) + + '\t{}'.format(token.pos_) + + '\t{}'.format(token.tag_) + + '\t{}\n'.format(token.ent_type_ or 'NULL')) + output_file.write('\n') +output_file.write('\n') +output_file.close() diff --git a/spacy_nlp b/spacy_nlp deleted file mode 100755 index a8863ec..0000000 --- a/spacy_nlp +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3.5 -# coding=utf-8 - -from xml.sax.saxutils import escape -import argparse -import chardet -import spacy -import textwrap - -parser = argparse.ArgumentParser( - description=('Tag a text file with spaCy and save it as a verticalized ' - 'text file.') -) -parser.add_argument('i', metavar='txt-sourcefile') -parser.add_argument('-l', - choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'], - dest='lang', - required=True) -parser.add_argument('o', metavar='vrt-destfile') -parser.add_argument('--check-encoding', - default=False, - action='store_true', - dest='check_encoding' - ) -args = parser.parse_args() - -SPACY_MODELS = {'de': 'de_core_news_sm', - 'el': 'el_core_news_sm', - 'en': 'en_core_web_sm', - 'es': 'es_core_news_sm', - 'fr': 'fr_core_news_sm', - 'it': 'it_core_news_sm', - 'nl': 'nl_core_news_sm', - 'pt': 'pt_core_news_sm'} - -# Set the language model for spacy -nlp = spacy.load(SPACY_MODELS[args.lang]) - -# Try to determine the encoding of the text in the input file -if args.check_encoding: - with open(args.i, "rb") as input_file: - bytes = input_file.read() - encoding = chardet.detect(bytes)['encoding'] -else: - encoding = 'utf-8' -# Read text from the input file and if neccessary split it into parts with a -# length of less than 1 million characters. 
-with open(args.i, encoding=encoding) as input_file: - text = input_file.read() - texts = textwrap.wrap(text, 1000000, break_long_words=False) - text = None - -# Create and open the output file -output_file = open(args.o, 'w+') - -output_file.write('\n' - '\n' - '\n') -for text in texts: - # Run spacy nlp over the text (partial string if above 1 million chars) - doc = nlp(text) - for sent in doc.sents: - output_file.write('\n') - for token in sent: - # Skip whitespace tokens like "\n" or "\t" - if token.text.isspace(): - continue - # Write all information in .vrt style to the output file - # text, lemma, simple_pos, pos, ner - output_file.write( - '{}\t{}\t{}\t{}\t{}\n'.format( - escape(token.text), - escape(token.lemma_), - token.pos_, - token.tag_, - token.ent_type_ if token.ent_type_ != '' else 'NULL' - ) - ) - output_file.write('\n') -output_file.write('\n' - '') - -output_file.close() diff --git a/wrapper/nlp b/wrapper/nlp index 36d0e08..8946504 100755 --- a/wrapper/nlp +++ b/wrapper/nlp @@ -1,35 +1,29 @@ #!/usr/bin/env python3 # coding=utf-8 -import argparse +from argparse import ArgumentParser import os import subprocess -container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest' -container_input_dir = '/input' -container_output_dir = '/output' -uid = str(os.getuid()) -gid = str(os.getgid()) +CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest' +CONTAINER_INPUT_DIR = '/input' +CONTAINER_OUTPUT_DIR = '/output' +UID = str(os.getuid()) +GID = str(os.getgid()) -parser = argparse.ArgumentParser(add_help=False) -parser.add_argument('-i', - dest='input_dir', - required=False) -parser.add_argument('-o', - dest='output_dir', - required=False) +parser = ArgumentParser(add_help=False) +parser.add_argument('-i') +parser.add_argument('-o') args, remaining_args = parser.parse_known_args() -cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid] -if args.input_dir is not None: - host_input_dir = os.path.abspath(args.input_dir) - cmd += ['-v', host_input_dir + ':' + container_input_dir] - remaining_args += ['-i', container_input_dir] -if args.output_dir is not None: - host_output_dir = os.path.abspath(args.output_dir) - cmd += ['-v', host_output_dir + ':' + container_output_dir] - remaining_args += ['-o', container_output_dir] -cmd.append(container_image) +cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] +if args.o is not None: + cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)] + remaining_args.insert(0, CONTAINER_OUTPUT_DIR) +if args.i is not None: + cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)] + remaining_args.insert(0, CONTAINER_INPUT_DIR) +cmd.append(CONTAINER_IMAGE) cmd += remaining_args subprocess.run(cmd)
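
For reference, the sketch below illustrates the verticalized-text tagging step that the new spacy-nlp script performs: the input text is split into chunks below spaCy's 1,000,000 character limit, each chunk is tagged, and every token is written as one tab-separated line (text, lemma, simple POS tag, detailed POS tag, named-entity type or NULL), with sentences wrapped in CWB-style markers. The model name, the file names and the plain <s>/</s> markers are illustrative assumptions and are not taken verbatim from the patched script.

#!/usr/bin/env python3
# Minimal sketch of the spacy-nlp tagging step (assumptions noted above).
from xml.sax.saxutils import escape
import textwrap

import spacy


def tag_to_vrt(txt_file, vrt_file, model='de_core_news_sm'):
    # Load a small language model (assumed to be installed, as in the Dockerfile).
    nlp = spacy.load(model)
    with open(txt_file, encoding='utf-8') as f:
        text = f.read()
    # spaCy rejects inputs longer than nlp.max_length (1,000,000 characters by
    # default), so split the text into chunks below that limit.
    chunks = textwrap.wrap(text, 1000000, break_long_words=False)
    with open(vrt_file, 'w') as out:
        for chunk in chunks:
            doc = nlp(chunk)
            for sent in doc.sents:
                out.write('<s>\n')  # generic CWB-style sentence marker
                for token in sent:
                    # Skip pure whitespace tokens such as "\n" or "\t"
                    if token.text.isspace():
                        continue
                    out.write('{}\t{}\t{}\t{}\t{}\n'.format(
                        escape(token.text), escape(token.lemma_),
                        token.pos_, token.tag_,
                        token.ent_type_ or 'NULL'))
                out.write('</s>\n')


if __name__ == '__main__':
    # Hypothetical file names, purely for illustration.
    tag_to_vrt('example.txt', 'example.vrt')

Applied to a UTF-8 plain-text input this yields one .vrt file per document, which is the format the pipeline's zip job later bundles.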