diff --git a/Dockerfile b/Dockerfile index c001480..6c9f483 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,41 +1,43 @@ FROM debian:buster-slim -LABEL authors="Patrick Jentsch , Stephan Porada " +LABEL authors="Patrick Jentsch , Stephan Porada " ENV LANG=C.UTF-8 +RUN apt-get update + + # Install pipeline dependencies # ## Install pyFlow ## ENV PYFLOW_RELEASE=1.1.20 ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" . RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \ && cd "pyflow-${PYFLOW_RELEASE}" \ - && apt-get update \ && apt-get install --no-install-recommends --yes \ python2.7 \ - && rm -r /var/lib/apt/lists/* \ && python2.7 setup.py build install \ && cd .. \ && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz" ## Install spaCy ## -ENV SPACY_VERSION=2.3.2 -ENV SPACY_MODELS_VERSION=2.3.0 -RUN apt-get update \ - && apt-get install --no-install-recommends --yes \ +ENV SPACY_VERSION=3.0.3 +RUN apt-get install --no-install-recommends --yes \ python3.7 \ python3-pip \ zip \ - && rm -r /var/lib/apt/lists/* \ && pip3 install \ chardet \ setuptools \ wheel \ - && pip3 install "spacy==${SPACY_VERSION}" \ + && pip3 install --upgrade pip \ + && pip3 install "spacy==${SPACY_VERSION}" + +ENV SPACY_MODELS_VERSION=3.0.0 +RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \ @@ -43,12 +45,17 @@ RUN apt-get update \ && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \ && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \ - && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" 
--direct + && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \ + && python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct ## Install Pipeline ## COPY nlp spacy-nlp /usr/local/bin/ +RUN rm -r /var/lib/apt/lists/* + + ENTRYPOINT ["nlp"] CMD ["--help"] diff --git a/README.md b/README.md index 2d93378..8c522ed 100644 --- a/README.md +++ b/README.md @@ -1,74 +1,88 @@ -# Natural language processing +# NLP - Natural Language Processing -This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io). +This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided. -## Build image +## Software used in this pipeline implementation +- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian +- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 +- spaCy (3.0.3): https://github.com/explosion/spaCy/releases/tag/v3.0.3 +- spaCy medium sized models (3.0.0): + - https://github.com/explosion/spacy-models/releases/tag/da_core_news_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0 + - 
https://github.com/explosion/spacy-models/releases/tag/pt_core_news_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.0.0 + - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0 -1. Clone this repository and navigate into it: -``` -git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp + +## Use this image + +1. Create input and output directories for the pipeline. +``` bash +mkdir -p //input //output ``` -2. Build image: -``` -docker build -t sfb1288inf/nlp:latest . -``` +2. Place your text files inside `//input`. Files should all contain text of the same language. -Alternatively build from the GitLab repository without cloning: - -1. Build image: -``` -docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git +3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details. ``` +# Option one: Use the wrapper script +## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executeable and add it to your ${PATH} +cd / +nlp -i input -l -o output -## Download prebuilt image - -The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers. - -1. Download image: -``` -docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest -``` - -## Run - -1. Create input and output directories for the NLP software: -``` -mkdir -p //files_for_nlp //files_from_nlp -``` - -2. Place your text files inside the `//files_for_nlp` directory. Files should all contain text of the same language. - -3. Start the NLP process. 
-``` +# Option two: Classic Docker style docker run \ --rm \ -it \ -u $(id -u $USER):$(id -g $USER) \ - -v //files_for_nlp:/input \ - -v //files_from_nlp:/output \ - sfb1288inf/nlp:latest \ + -v //input:/input \ + -v //output:/output \ + gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \ -i /input \ - -l \ - -o /output + -l \ + -o /output + ``` -The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part. -If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`. +4. Check your results in the `//output` directory. ``` -4. Check your results in the `//files_from_nlp` directory. ### Pipeline arguments -### NLP arguments - -`-i path` -* Sets the input directory using the specified path. -* required = True - -`-o path` -* Sets the output directory using the specified path. -* required = True +`--check-encoding` +* If set, the pipeline tries to automatically determine the right encoding for +your texts. Only use it if you are not sure that your input is provided in UTF-8. +* default = False +* required = False `-l languagecode` * Tells spaCy which language will be used. -* options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese) +* options = da (Danish), de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese), ru (Russian), zh (Chinese) * required = True + +`--nCores corenumber` +* Sets the number of CPU cores being used during the NLP process. 
+* default = min(4, multiprocessing.cpu_count()) +* required = False + +``` bash +# Example with all arguments used +docker run \ + --rm \ + -it \ + -u $(id -u $USER):$(id -g $USER) \ + -v "$HOME"/nlp/input:/input \ + -v "$HOME"/nlp/output:/output \ + gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \ + -i /input \ + -l en \ + -o /output \ + --check-encoding \ + --nCores 8 +``` diff --git a/nlp b/nlp index 2bc58f8..ecc1203 100755 --- a/nlp +++ b/nlp @@ -1,13 +1,11 @@ #!/usr/bin/env python2.7 # coding=utf-8 -""" -nlp +"""A NLP pipeline for text file processing.""" -Usage: For usage instructions run with option --help -Authors: Patrick Jentsch -""" +__author__ = 'Patrick Jentsch ,' \ + 'Stephan Porada ' +__version__ = '1.0.0' from argparse import ArgumentParser from pyflow import WorkflowRunner @@ -16,14 +14,17 @@ import os import sys -SPACY_MODELS = {'de': 'de_core_news_md', +SPACY_MODELS = {'da': 'da_core_news_md', + 'de': 'de_core_news_md', 'el': 'el_core_news_md', 'en': 'en_core_web_md', 'es': 'es_core_news_md', 'fr': 'fr_core_news_md', 'it': 'it_core_news_md', 'nl': 'nl_core_news_md', - 'pt': 'pt_core_news_md'} + 'pt': 'pt_core_news_md', + 'ru': 'ru_core_news_md', + 'zh': 'zh_core_web_md'} def parse_args(): diff --git a/spacy-nlp b/spacy-nlp index de98e6b..3846bdf 100755 --- a/spacy-nlp +++ b/spacy-nlp @@ -9,14 +9,20 @@ import os import spacy import textwrap -SPACY_MODELS = {'de': 'de_core_news_md', + +SPACY_MODELS = {'da': 'da_core_news_md', + 'de': 'de_core_news_md', 'el': 'el_core_news_md', 'en': 'en_core_web_md', 'es': 'es_core_news_md', 'fr': 'fr_core_news_md', 'it': 'it_core_news_md', 'nl': 'nl_core_news_md', - 'pt': 'pt_core_news_md'} + 'pt': 'pt_core_news_md', + 'ru': 'ru_core_news_md', + 'zh': 'zh_core_web_md'} + + SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION') SPACY_VERSION = os.environ.get('SPACY_VERSION') diff --git a/wrapper/nlp b/wrapper/nlp index f1fcf1e..68a9b77 100755 --- a/wrapper/nlp +++ b/wrapper/nlp @@ -5,7 +5,7 @@ from 
argparse import ArgumentParser import os import subprocess -CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest' +CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0' CONTAINER_INPUT_DIR = '/input' CONTAINER_OUTPUT_DIR = '/output' UID = str(os.getuid())