Update to newer Version

Patrick Jentsch 2020-09-23 15:26:53 +02:00
parent 5bd0feda5c
commit 42583fea46
4 changed files with 151 additions and 105 deletions

.gitlab-ci.yml

@@ -9,36 +9,68 @@ variables:
 stages:
   - build
   - push
+  - clean
 
-before_script:
-  - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+.docker_setup:
+  before_script:
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+
+.reg_setup:
+  before_script:
+    - apk add --no-cache curl
+    - curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
+    - echo "$REG_SHA256  /usr/local/bin/reg" | sha256sum -c -
+    - chmod a+x /usr/local/bin/reg
+  variables:
+    REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
+    REG_VERSION: 0.16.1
 
-Build:
+build_image:
+  extends: .docker_setup
   script:
-    - docker build --pull -t $CI_REGISTRY_IMAGE:tmp .
-    - docker push $CI_REGISTRY_IMAGE:tmp
+    - docker build -t $INTERMEDIATE_IMAGE_TAG .
+    - docker push $INTERMEDIATE_IMAGE_TAG
   stage: build
   tags:
     - docker
+  variables:
+    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
 
-Push latest:
+push_master:
+  extends:
+    - .docker_setup
+    - .reg_setup
   only:
     - master
   script:
-    - docker pull $CI_REGISTRY_IMAGE:tmp
-    - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest
-    - docker push $CI_REGISTRY_IMAGE:latest
+    - docker pull $INTERMEDIATE_IMAGE_TAG
+    - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
+    - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
+    - docker push $IMAGE_TAG
   stage: push
   tags:
     - docker
+  variables:
+    IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
+    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
 
-Push tag:
+push_other:
+  extends:
+    - .docker_setup
+    - .reg_setup
+  except:
+    - master
   only:
+    - branches
     - tags
   script:
-    - docker pull $CI_REGISTRY_IMAGE:tmp
-    - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
-    - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
+    - docker pull $INTERMEDIATE_IMAGE_TAG
+    - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
+    - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
+    - docker push $IMAGE_TAG
  stage: push
   tags:
     - docker
+  variables:
+    IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
+    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
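
Note: the new .reg_setup template pins the reg binary to a known SHA-256 before marking it executable, and the push jobs then use reg to delete the intermediate $CI_COMMIT_SHA tag from the registry once it has been promoted. A minimal Python sketch of the same checksum verification step (illustration only, not part of the commit; path and digest taken from the job above):

import hashlib

def verify_sha256(path, expected):
    # Hash the file in chunks and compare against the expected hex digest
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected

assert verify_sha256(
    '/usr/local/bin/reg',
    'ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228'
)

Pinning both REG_VERSION and REG_SHA256 guards against a tampered or truncated download.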

Dockerfile

@ -1,52 +1,54 @@
FROM debian:10-slim FROM debian:10-slim
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de" LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
ENV LANG=C.UTF-8 ENV LANG=C.UTF-8
RUN apt-get update \ RUN apt-get update
## Install pyFlow ##
ENV PYFLOW_RELEASE=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
&& cd "pyflow-${PYFLOW_RELEASE}" \
&& apt-get install -y --no-install-recommends \ && apt-get install -y --no-install-recommends \
python2.7 \ python2.7 \
&& python2.7 setup.py build install \
&& cd .. \
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
## Install Pipeline ##
ENV SPACY_VERSION=2.3.2
ENV SPACY_MODELS_VERSION=2.3.0
RUN apt-get install -y --no-install-recommends \
python3.7 \ python3.7 \
python3-pip \ python3-pip \
zip \ zip \
&& pip3 install \ && pip3 install \
chardet chardet \
setuptools \
wheel \
ENV PYFLOW_VERSION=1.1.20 && pip3 install "spacy==${SPACY_VERSION}" \
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" . && python3 -m spacy download "de_core_news_lg-${SPACY_MODELS_VERSION}" --direct \
RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ && python3 -m spacy download "el_core_news_lg-${SPACY_MODELS_VERSION}" --direct \
&& cd "pyflow-${PYFLOW_VERSION}" \ && python3 -m spacy download "en_core_web_lg-${SPACY_MODELS_VERSION}" --direct \
&& python2.7 setup.py build install \ && python3 -m spacy download "es_core_news_lg-${SPACY_MODELS_VERSION}" --direct \
&& cd .. \ && python3 -m spacy download "fr_core_news_lg-${SPACY_MODELS_VERSION}" --direct \
&& rm -rf \ && python3 -m spacy download "it_core_news_lg-${SPACY_MODELS_VERSION}" --direct \
"pyflow-${PYFLOW_VERSION}" \ && python3 -m spacy download "nl_core_news_lg-${SPACY_MODELS_VERSION}" --direct \
"pyflow-${PYFLOW_VERSION}.tar.gz" && python3 -m spacy download "pt_core_news_lg-${SPACY_MODELS_VERSION}" --direct
ENV SPACY_VERSION=2.2.4
ENV SPACY_MODELS_VERSION=2.2.5
RUN pip3 install setuptools wheel && pip3 install "spacy==${SPACY_VERSION}" \
&& python3 -m spacy download "de_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "el_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "en_core_web_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "es_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "fr_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "it_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "nl_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "pt_core_news_sm-${SPACY_MODELS_VERSION}" --direct
RUN rm -rf /var/lib/apt/lists/*
COPY nlp /usr/local/bin COPY nlp /usr/local/bin
COPY spacy-nlp /usr/local/bin COPY spacy-nlp /usr/local/bin
## Cleanup ##
RUN rm -r /var/lib/apt/lists/*
ENTRYPOINT ["nlp"] ENTRYPOINT ["nlp"]
CMD ["--help"] CMD ["--help"]
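
Note: the image now pins spaCy 2.3.2 with the 2.3.0 large (lg) models and records both versions as ENV variables, which spacy-nlp reads back at runtime (see below). A quick sanity check might look like this (a sketch; assumes it runs inside the image built above):

import os
import spacy

# ENV values baked into the image by the Dockerfile above
assert spacy.__version__ == os.environ.get('SPACY_VERSION')

nlp = spacy.load('de_core_news_lg')
# nlp.meta['version'] holds the installed model version, e.g. '2.3.0'
assert nlp.meta['version'] == os.environ.get('SPACY_MODELS_VERSION')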

nlp

@@ -5,10 +5,10 @@
 nlp
 
 Usage: For usage instructions run with option --help
-Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
+Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
+         Stephan Porada <sporada@uni-bielefeld.de>
 """
 
 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
 import multiprocessing
@@ -34,19 +34,31 @@ def parse_args():
     parser.add_argument('-o', '--output-directory',
                         help='Output directory',
                         required=True)
-    parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
+    parser.add_argument('-l', '--language',
+                        choices=SPACY_MODELS.keys(),
                         required=True)
     parser.add_argument('--check-encoding', action='store_true')
     parser.add_argument('--log-dir')
     parser.add_argument('--n-cores',
                         default=min(4, multiprocessing.cpu_count()),
                         help='total number of cores available', type=int)
-    parser.add_argument('--zip',
-                        help='Zips everything into one archive.')
+    parser.add_argument('--zip', help='Zips everything into one archive.')
     return parser.parse_args()
 
 
 class NLPPipelineJob:
+    """An NLP pipeline job class
+
+    Each input file of the pipeline is represented as an NLP pipeline job,
+    which holds all necessary information for the pipeline to process it.
+
+    Arguments:
+    file -- Path to the file
+    output_dir -- Path to a directory, where job results are stored
+    intermediate_dir -- Path to a directory, where intermediate files are
+                        stored.
+    """
+
     def __init__(self, file, output_dir):
         self.file = file
         self.name = os.path.basename(file).rsplit('.', 1)[0]
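
Note: the new docstring documents an intermediate_dir argument that the constructor does not (yet) take; only file and output_dir are accepted. The job name is the base filename without its final extension, e.g. (hypothetical path, for illustration):

import os

file = '/data/in/novel.txt'                      # hypothetical input file
name = os.path.basename(file).rsplit('.', 1)[0]  # -> 'novel'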
@@ -54,13 +66,23 @@ class NLPPipelineJob:
 
 class NLPPipeline(WorkflowRunner):
-    def __init__(self, check_encoding, jobs, lang, n_cores, output_dir, zip):
-        self.check_encoding = check_encoding
-        self.jobs = jobs
+    def __init__(self, input_dir, lang, output_dir, check_encoding, n_cores,
+                 zip):
+        self.input_dir = input_dir
         self.lang = lang
+        self.output_dir = output_dir
+        self.check_encoding = check_encoding
         self.n_cores = n_cores
-        self.output_dir = output_dir
-        self.zip = zip
+        if zip is None:
+            self.zip = zip
+        else:
+            if zip.lower().endswith('.zip'):
+                # Remove .zip file extension if provided
+                self.zip = zip[:-4]
+                self.zip = self.zip if self.zip else 'output'
+            else:
+                self.zip = zip
+        self.jobs = collect_jobs(self.input_dir, self.output_dir)
 
     def workflow(self):
         if not self.jobs:
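
Note: the archive name is now normalized once in the constructor instead of inside workflow(): a trailing .zip is stripped case-insensitively, and an empty remainder falls back to 'output'. The same logic as a standalone sketch (hypothetical values):

def normalize_zip_name(zip_arg):
    # Mirrors the constructor logic above
    if zip_arg.lower().endswith('.zip'):
        zip_arg = zip_arg[:-4]
        zip_arg = zip_arg if zip_arg else 'output'
    return zip_arg

print(normalize_zip_name('results.ZIP'))  # -> 'results'
print(normalize_zip_name('.zip'))         # -> 'output'
print(normalize_zip_name('archive'))      # -> 'archive'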
@@ -71,25 +93,24 @@ class NLPPipeline(WorkflowRunner):
         ' # setup output directory                        #
         ' ##################################################
         '''
-        setup_output_directory_jobs = []
+        setup_output_directory_tasks = []
         for i, job in enumerate(self.jobs):
             cmd = 'mkdir'
             cmd += ' -p'
             cmd += ' "{}"'.format(job.output_dir)
             lbl = 'setup_output_directory_-_{}'.format(i)
-            setup_output_directory_jobs.append(self.addTask(command=cmd,
-                                                            label=lbl))
+            task = self.addTask(command=cmd, label=lbl)
+            setup_output_directory_tasks.append(task)
 
         '''
         ' ##################################################
         ' # nlp                                            #
         ' ##################################################
         '''
-        nlp_jobs = []
+        nlp_tasks = []
         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
         for i, job in enumerate(self.jobs):
-            output_file = os.path.join(job.output_dir,
-                                       '{}.vrt'.format(job.name))
+            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
             cmd = 'spacy-nlp'
             cmd += ' -i "{}"'.format(job.file)
             cmd += ' -l "{}"'.format(self.lang)
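
Note: each spacy-nlp task gets an even share of the core budget, clamped to at least 1 and at most the total; a single job may use every core. The formula as a standalone check (hypothetical job counts):

def cores_per_task(total_cores, n_jobs):
    # Same formula as in NLPPipeline.workflow() above
    return min(total_cores, max(1, int(total_cores / n_jobs)))

print(cores_per_task(4, 1))   # 4 -> one job may use all cores
print(cores_per_task(4, 3))   # 1 -> int(4 / 3) == 1
print(cores_per_task(4, 10))  # 1 -> clamped up to at least one core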
@@ -98,36 +119,29 @@ class NLPPipeline(WorkflowRunner):
                 cmd += ' --check-encoding'
             deps = 'setup_output_directory_-_{}'.format(i)
             lbl = 'nlp_-_{}'.format(i)
-            nlp_jobs.append(self.addTask(command=cmd,
-                                         dependencies=deps,
-                                         label=lbl,
-                                         nCores=n_cores))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+            nlp_tasks.append(task)
 
         '''
         ' ##################################################
         ' # zip creation                                   #
         ' ##################################################
         '''
-        zip_creation_jobs = []
+        zip_creation_tasks = []
         if self.zip is not None:
-            # Remove .zip file extension if provided
-            if self.zip.endswith('.zip'):
-                self.zip = self.zip[:-4]
-            self.zip = self.zip if self.zip else 'output'
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
-            cmd += ' "{}".zip .'.format(self.zip)
+            cmd += ' "{}.zip" .'.format(self.zip)
             cmd += ' -x "pyflow.data*"'
             cmd += ' -i "*.vrt"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = nlp_jobs
+            deps = nlp_tasks
             lbl = 'zip_creation'
-            zip_creation_jobs.append(self.addTask(command=cmd,
-                                                  dependencies=deps,
-                                                  label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
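
Note: for a hypothetical output directory /data/out and archive name results, the assembled command reads

cd "/data/out" && zip -r "results.zip" . -x "pyflow.data*" -i "*.vrt" && cd -

so only .vrt results are archived and pyFlow's bookkeeping data is excluded. The change from '"{}".zip .' to '"{}.zip" .' also moves the .zip suffix inside the quotes, which matters for archive names containing spaces.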
@@ -136,7 +150,7 @@ def collect_jobs(input_dir, output_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
             jobs += collect_jobs(os.path.join(input_dir, file),
                                  os.path.join(output_dir, file))
-        elif file.endswith('.txt'):
+        elif file.lower().endswith('.txt'):
             jobs.append(NLPPipelineJob(os.path.join(input_dir, file),
                                        os.path.join(output_dir, file)))
     return jobs
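
Note: input discovery still walks directories recursively, but the extension match is now case-insensitive, so REPORT.TXT is collected too (hypothetical filenames, for illustration):

for file in ('notes.txt', 'REPORT.TXT', 'image.png'):
    print(file, file.lower().endswith('.txt'))  # True, True, False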
@@ -144,9 +158,9 @@ def collect_jobs(input_dir, output_dir):
 def main():
     args = parse_args()
-    jobs = collect_jobs(args.input_directory, args.output_directory)
-    nlp_pipeline = NLPPipeline(args.check_encoding, jobs, args.language,
-                               args.n_cores, args.output_directory, args.zip)
+    nlp_pipeline = NLPPipeline(args.input_directory, args.language,
+                               args.output_directory, args.check_encoding,
+                               args.n_cores, args.zip)
     retval = nlp_pipeline.run(
         dataDirRoot=(args.log_dir or args.output_directory),
         nCores=args.n_cores

spacy-nlp

@@ -4,29 +4,28 @@
 from argparse import ArgumentParser
 from xml.sax.saxutils import escape
 import chardet
+import hashlib
+import os
 import spacy
 import textwrap
-import hashlib
 
-SPACY_MODELS = {'de': 'de_core_news_sm',
-                'el': 'el_core_news_sm',
-                'en': 'en_core_web_sm',
-                'es': 'es_core_news_sm',
-                'fr': 'fr_core_news_sm',
-                'it': 'it_core_news_sm',
-                'nl': 'nl_core_news_sm',
-                'pt': 'pt_core_news_sm'}
+SPACY_MODELS = {'de': 'de_core_news_lg',
+                'el': 'el_core_news_lg',
+                'en': 'en_core_web_lg',
+                'es': 'es_core_news_lg',
+                'fr': 'fr_core_news_lg',
+                'it': 'it_core_news_lg',
+                'nl': 'nl_core_news_lg',
+                'pt': 'pt_core_news_lg'}
+SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
+SPACY_VERSION = os.environ.get('SPACY_VERSION')
 
 # Parse the given arguments
 parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
                                      'as a verticalized text file.'))
 parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
 parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
-parser.add_argument('-l', '--language',
-                    choices=SPACY_MODELS.keys(),
-                    required=True)
+parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), required=True)  # noqa
 parser.add_argument('--check-encoding', action='store_true')
 args = parser.parse_args()
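
Note: SPACY_VERSION and SPACY_MODELS_VERSION now come from the environment rather than from spacy.__version__ and nlp.meta['version'], which couples the script to the ENV lines in the Dockerfile; run outside the image, os.environ.get() returns None and the header fields read "None". A stricter variant would fail fast (a sketch, not what the commit does):

import os

# Raises KeyError when the variables are unset instead of
# silently writing None into the VRT header
SPACY_VERSION = os.environ['SPACY_VERSION']
SPACY_MODELS_VERSION = os.environ['SPACY_MODELS_VERSION']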
@@ -43,10 +42,10 @@ else:
 
 # hashing in chunks to avoid full RAM with huge files.
 with open(args.input, 'rb') as input_file:
-    md5_hash = hashlib.md5()
-    for chunk in iter(lambda: input_file.read(128 * md5_hash.block_size), b''):
-        md5_hash.update(chunk)
-    md5_hash = md5_hash.hexdigest()
+    source_md5 = hashlib.md5()
+    for chunk in iter(lambda: input_file.read(128 * source_md5.block_size), b''):  # noqa
+        source_md5.update(chunk)
+    source_md5 = source_md5.hexdigest()
 
 # Load the text contents from the input file
 with open(args.input, encoding=encoding) as input_file:
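
Note: md5().block_size is 64 bytes, so the loop reads 8 KiB per iteration; the chunked digest is identical to hashing the whole file at once, just with bounded memory. A quick equivalence check (synthetic data, for illustration):

import hashlib

data = b'x' * 100_000  # stand-in for file contents

chunked = hashlib.md5()
for i in range(0, len(data), 8192):
    chunked.update(data[i:i + 8192])

assert chunked.hexdigest() == hashlib.md5(data).hexdigest()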
@@ -60,7 +59,8 @@ with open(args.input, encoding=encoding) as input_file:
 
 # Setup the spaCy toolkit by loading the chosen language model
-nlp = spacy.load(SPACY_MODELS[args.language])
+model = SPACY_MODELS[args.language]
+nlp = spacy.load(model)
 
 # Create the output file in verticalized text format
@@ -70,11 +70,9 @@ output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
 common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
               + '<corpus>\n'
               + '<text>\n'
-              + '<nlp name="spaCy"\n'
-              + '     version="{}"\n'.format(spacy.__version__)
-              + '     model="{}"\n'.format(SPACY_MODELS[args.language])
-              + '     model_version="{}"\n'.format(nlp.meta['version'])
-              + '     md5_hash_of_input="{}" />\n'.format(md5_hash))
+              + '<nlp name="spaCy:{}"\n'.format(SPACY_VERSION)
+              + '     model="{}:{}"\n'.format(model, SPACY_MODELS_VERSION)
+              + '     source-md5="{}" />\n'.format(source_md5))
 
 with open(output_file_original_filename, 'w+') as output_file_original, \
         open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
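
Note: with the versions pinned in the Dockerfile (spaCy 2.3.2, models 2.3.0), German input, and a hypothetical checksum, the renamed header attributes render as:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<corpus>
<text>
<nlp name="spaCy:2.3.2"
     model="de_core_news_lg:2.3.0"
     source-md5="d41d8cd98f00b204e9800998ecf8427e" />

The old version, model_version and md5_hash_of_input attributes are folded into name, model and source-md5.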