Fix version 1.0.0

This commit is contained in:
Patrick Jentsch 2021-02-25 11:26:11 +01:00
parent 2ced38504c
commit d620c29f27
5 changed files with 102 additions and 74 deletions

View File

@ -1,41 +1,43 @@
FROM debian:buster-slim FROM debian:buster-slim
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>" LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
ENV LANG=C.UTF-8 ENV LANG=C.UTF-8
RUN apt-get update
# Install pipeline dependencies # # Install pipeline dependencies #
## Install pyFlow ## ## Install pyFlow ##
ENV PYFLOW_RELEASE=1.1.20 ENV PYFLOW_RELEASE=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" . ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \ RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
&& cd "pyflow-${PYFLOW_RELEASE}" \ && cd "pyflow-${PYFLOW_RELEASE}" \
&& apt-get update \
&& apt-get install --no-install-recommends --yes \ && apt-get install --no-install-recommends --yes \
python2.7 \ python2.7 \
&& rm -r /var/lib/apt/lists/* \
&& python2.7 setup.py build install \ && python2.7 setup.py build install \
&& cd .. \ && cd .. \
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz" && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
## Install spaCy ## ## Install spaCy ##
ENV SPACY_VERSION=2.3.2 ENV SPACY_VERSION=3.0.3
ENV SPACY_MODELS_VERSION=2.3.0 RUN apt-get install --no-install-recommends --yes \
RUN apt-get update \
&& apt-get install --no-install-recommends --yes \
python3.7 \ python3.7 \
python3-pip \ python3-pip \
zip \ zip \
&& rm -r /var/lib/apt/lists/* \
&& pip3 install \ && pip3 install \
chardet \ chardet \
setuptools \ setuptools \
wheel \ wheel \
&& pip3 install "spacy==${SPACY_VERSION}" \ && pip3 install --upgrade pip \
&& pip3 install "spacy==${SPACY_VERSION}"
ENV SPACY_MODELS_VERSION=3.0.0
RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \
@ -43,12 +45,17 @@ RUN apt-get update \
&& python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct
## Install Pipeline ## ## Install Pipeline ##
COPY nlp spacy-nlp /usr/local/bin/ COPY nlp spacy-nlp /usr/local/bin/
RUN rm -r /var/lib/apt/lists/*
ENTRYPOINT ["nlp"] ENTRYPOINT ["nlp"]
CMD ["--help"] CMD ["--help"]

120
README.md
View File

@ -1,74 +1,88 @@
# Natural language processing # NLP - Natural Language Processing
This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io). This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
## Build image ## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- spaCy (3.0.3): https://github.com/explosion/spaCy/releases/tag/v3.0.3
- spaCy medium sized models (3.0.0):
- https://github.com/explosion/spacy-models/releases/tag/da_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/pt_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.0.0
- https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0
1. Clone this repository and navigate into it:
``` ## Use this image
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp
1. Create input and output directories for the pipeline.
``` bash
mkdir -p /<my_data_location>/input /<my_data_location>/output
``` ```
2. Build image: 2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
```
docker build -t sfb1288inf/nlp:latest .
```
Alternatively build from the GitLab repository without cloning: 3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
1. Build image:
```
docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
``` ```
# Option one: Use the wrapper script
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executable and add it to your ${PATH}
cd /<my_data_location>
nlp -i input -l <language_code> -o output <optional_pipeline_arguments>
## Download prebuilt image # Option two: Classic Docker style
The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
1. Download image:
```
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest
```
## Run
1. Create input and output directories for the NLP software:
```
mkdir -p /<mydatalocation>/files_for_nlp /<mydatalocation>/files_from_nlp
```
2. Place your text files inside the `/<mydatalocation>/files_for_nlp` directory. Files should all contain text of the same language.
3. Start the NLP process.
```
docker run \ docker run \
--rm \ --rm \
-it \ -it \
-u $(id -u $USER):$(id -g $USER) \ -u $(id -u $USER):$(id -g $USER) \
-v /<mydatalocation>/files_for_nlp:/input \ -v /<my_data_location>/input:/input \
-v /<mydatalocation>/files_from_nlp:/output \ -v /<my_data_location>/output:/output \
sfb1288inf/nlp:latest \ gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
-i /input \ -i /input \
-l <languagecode> \ -l <language_code> \
-o /output -o /output \
<optional_pipeline_arguments>
``` ```
The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part.
If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`. 4. Check your results in the `/<my_data_location>/output` directory.
```
4. Check your results in the `/<mydatalocation>/files_from_nlp` directory. ### Pipeline arguments
### NLP arguments `--check-encoding`
* If set, the pipeline tries to automatically determine the right encoding for
`-i path` your texts. Only use it if you are not sure that your input is provided in UTF-8.
* Sets the input directory using the specified path. * default = False
* required = True * required = False
`-o path`
* Sets the output directory using the specified path.
* required = True
`-l languagecode` `-l languagecode`
* Tells spaCy which language will be used. * Tells spaCy which language will be used.
* options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese) * options = da (Danish), de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese), ru (Russian), zh (Chinese)
* required = True * required = True
`--nCores corenumber`
* Sets the number of CPU cores being used during the NLP process.
* default = min(4, multiprocessing.cpu_count())
* required = False
``` bash
# Example with all arguments used
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v "$HOME"/ocr/input:/input \
-v "$HOME"/ocr/output:/output \
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
-i /input \
-l en \
-o /output \
--check-encoding \
    --nCores 8
```

17
nlp
View File

@ -1,13 +1,11 @@
#!/usr/bin/env python2.7 #!/usr/bin/env python2.7
# coding=utf-8 # coding=utf-8
""" """A NLP pipeline for text file processing."""
nlp
Usage: For usage instructions run with option --help __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de 'Stephan Porada <porada@posteo.de>'
Stephan Porada <sporada@uni-bielefeld.de> __version__ = '1.0.0'
"""
from argparse import ArgumentParser from argparse import ArgumentParser
from pyflow import WorkflowRunner from pyflow import WorkflowRunner
@ -16,14 +14,17 @@ import os
import sys import sys
SPACY_MODELS = {'de': 'de_core_news_md', SPACY_MODELS = {'da': 'da_core_news_md',
'de': 'de_core_news_md',
'el': 'el_core_news_md', 'el': 'el_core_news_md',
'en': 'en_core_web_md', 'en': 'en_core_web_md',
'es': 'es_core_news_md', 'es': 'es_core_news_md',
'fr': 'fr_core_news_md', 'fr': 'fr_core_news_md',
'it': 'it_core_news_md', 'it': 'it_core_news_md',
'nl': 'nl_core_news_md', 'nl': 'nl_core_news_md',
'pt': 'pt_core_news_md'} 'pt': 'pt_core_news_md',
'ru': 'ru_core_news_md',
'zh': 'zh_core_web_md'}
def parse_args(): def parse_args():

View File

@ -9,14 +9,20 @@ import os
import spacy import spacy
import textwrap import textwrap
SPACY_MODELS = {'de': 'de_core_news_md',
SPACY_MODELS = {'da': 'da_core_news_md',
'de': 'de_core_news_md',
'el': 'el_core_news_md', 'el': 'el_core_news_md',
'en': 'en_core_web_md', 'en': 'en_core_web_md',
'es': 'es_core_news_md', 'es': 'es_core_news_md',
'fr': 'fr_core_news_md', 'fr': 'fr_core_news_md',
'it': 'it_core_news_md', 'it': 'it_core_news_md',
'nl': 'nl_core_news_md', 'nl': 'nl_core_news_md',
'pt': 'pt_core_news_md'} 'pt': 'pt_core_news_md',
'ru': 'ru_core_news_md',
'zh': 'zh_core_web_md'}
SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION') SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
SPACY_VERSION = os.environ.get('SPACY_VERSION') SPACY_VERSION = os.environ.get('SPACY_VERSION')

View File

@ -5,7 +5,7 @@ from argparse import ArgumentParser
import os import os
import subprocess import subprocess
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest' CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
CONTAINER_INPUT_DIR = '/input' CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output' CONTAINER_OUTPUT_DIR = '/output'
UID = str(os.getuid()) UID = str(os.getuid())