mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2025-01-12 23:10:33 +00:00

Fix version 1.0.0
parent 2ced38504c
commit d620c29f27

27 Dockerfile
@@ -1,41 +1,43 @@
 FROM debian:buster-slim

-LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
+LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"

 ENV LANG=C.UTF-8

-RUN apt-get update

 # Install pipeline dependencies #

 ## Install pyFlow ##
 ENV PYFLOW_RELEASE=1.1.20
 ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
 RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
  && cd "pyflow-${PYFLOW_RELEASE}" \
+ && apt-get update \
  && apt-get install --no-install-recommends --yes \
       python2.7 \
+ && rm -r /var/lib/apt/lists/* \
  && python2.7 setup.py build install \
  && cd .. \
  && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"

 ## Install spaCy ##
-ENV SPACY_VERSION=2.3.2
-ENV SPACY_MODELS_VERSION=2.3.0
-RUN apt-get update \
- && apt-get install --no-install-recommends --yes \
+ENV SPACY_VERSION=3.0.3
+RUN apt-get install --no-install-recommends --yes \
      python3.7 \
      python3-pip \
      zip \
  && rm -r /var/lib/apt/lists/* \
  && pip3 install \
      chardet \
      setuptools \
      wheel \
- && pip3 install "spacy==${SPACY_VERSION}" \
+ && pip3 install --upgrade pip \
+ && pip3 install "spacy==${SPACY_VERSION}"

+ENV SPACY_MODELS_VERSION=3.0.0
+RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \
@@ -43,12 +45,17 @@ RUN apt-get update \
  && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct
+ && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+ && python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+ && python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct

 ## Install Pipeline ##
 COPY nlp spacy-nlp /usr/local/bin/

+RUN rm -r /var/lib/apt/lists/*

 ENTRYPOINT ["nlp"]
 CMD ["--help"]
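Editor's note: the versions pinned above can be sanity-checked inside the built image. A minimal sketch, assuming the image built successfully and using the registry tag and model names from the README and Dockerfile:

``` python
# Run inside the container, e.g.:
#   docker run --rm -it --entrypoint python3 gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0
# Sanity check: the pinned spaCy version and one pinned model load together.
import spacy

assert spacy.__version__ == '3.0.3'
nlp = spacy.load('en_core_web_md')  # installed via `spacy download ... --direct`
doc = nlp('The quick brown fox jumps over the lazy dog.')
print([(token.text, token.pos_, token.lemma_) for token in doc])
```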
120 README.md
@@ -1,74 +1,88 @@
-# Natural language processing
+# NLP - Natural Language Processing

-This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io).
+This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service, but you can also use it standalone; for that purpose a convenient wrapper script is provided.

-## Build image
+## Software used in this pipeline implementation
+
+- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
+- spaCy (3.0.3): https://github.com/explosion/spaCy/releases/tag/v3.0.3
+- spaCy medium sized models (3.0.0):
+  - https://github.com/explosion/spacy-models/releases/tag/da_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/pt_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0
+
+## Use this image

-1. Clone this repository and navigate into it:
-```
-git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp
-```
+1. Create input and output directories for the pipeline.
+``` bash
+mkdir -p /<my_data_location>/input /<my_data_location>/output
+```

-2. Build image:
-```
-docker build -t sfb1288inf/nlp:latest .
-```
+2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
-Alternatively build from the GitLab repository without cloning:
-1. Build image:
-```
-docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
-```
-
-## Download prebuilt image
-
-The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
-
-1. Download image:
-```
-docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest
-```
-
-## Run
-
-1. Create input and output directories for the NLP software:
-```
-mkdir -p /<mydatalocation>/files_for_nlp /<mydatalocation>/files_from_nlp
-```
-
-2. Place your text files inside the `/<mydatalocation>/files_for_nlp` directory. Files should all contain text of the same language.
-
-3. Start the NLP process.
-```
+3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
+``` bash
+# Option one: Use the wrapper script
+## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executable and add it to your ${PATH}.
+cd /<my_data_location>
+nlp -i input -l <language_code> -o output <optional_pipeline_arguments>
+
+# Option two: Classic Docker style
 docker run \
 --rm \
 -it \
 -u $(id -u $USER):$(id -g $USER) \
--v /<mydatalocation>/files_for_nlp:/input \
--v /<mydatalocation>/files_from_nlp:/output \
-sfb1288inf/nlp:latest \
+-v /<my_data_location>/input:/input \
+-v /<my_data_location>/output:/output \
+gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
 -i /input \
--l <languagecode> \
--o /output
+-l <language_code> \
+-o /output \
+<optional_pipeline_arguments>
 ```
-The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part.
-
-If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`.
-
-4. Check your results in the `/<mydatalocation>/files_from_nlp` directory.
+4. Check your results in the `/<my_data_location>/output` directory.

-### NLP arguments
+### Pipeline arguments

 `-i path`
 * Sets the input directory using the specified path.
 * required = True

 `-o path`
 * Sets the output directory using the specified path.
 * required = True

+`--check-encoding`
+* If set, the pipeline tries to automatically determine the right encoding for your texts. Only use it if you are not sure that your input is provided in UTF-8.
+* default = False
+* required = False

 `-l languagecode`
 * Tells spaCy which language will be used.
-* options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese)
+* options = da (Danish), de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese), ru (Russian), zh (Chinese)
 * required = True

 `--nCores corenumber`
 * Sets the number of CPU cores being used during the NLP process.
 * default = min(4, multiprocessing.cpu_count())
 * required = False

+``` bash
+# Example with all arguments used
+docker run \
+--rm \
+-it \
+-u $(id -u $USER):$(id -g $USER) \
+-v "$HOME"/nlp/input:/input \
+-v "$HOME"/nlp/output:/output \
+gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
+-i /input \
+-l en \
+-o /output \
+--check-encoding \
+--nCores 8
+```
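Editor's note: the new `--check-encoding` option lines up with the chardet package the Dockerfile installs. A hedged sketch of what such a check might look like; the helper name and the UTF-8 fallback are illustrative assumptions, not the pipeline's actual code:

``` python
# Illustrative only: guess a file's encoding with chardet, as
# --check-encoding presumably does, then decode it to text.
import chardet

def read_text_with_detected_encoding(path):  # hypothetical helper
    with open(path, 'rb') as f:
        raw = f.read()
    guess = chardet.detect(raw)  # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}
    return raw.decode(guess['encoding'] or 'utf-8')

print(read_text_with_detected_encoding('input/example.txt'))
```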
17 nlp
@@ -1,13 +1,11 @@
 #!/usr/bin/env python2.7
 # coding=utf-8

-"""
-nlp
-
-Usage: For usage instructions run with option --help
-Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
-         Stephan Porada <sporada@uni-bielefeld.de>
-"""
+"""A NLP pipeline for text file processing."""
+
+__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
+             'Stephan Porada <porada@posteo.de>'
+__version__ = '1.0.0'

 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
@@ -16,14 +14,17 @@ import os
 import sys


-SPACY_MODELS = {'de': 'de_core_news_md',
+SPACY_MODELS = {'da': 'da_core_news_md',
+                'de': 'de_core_news_md',
                 'el': 'el_core_news_md',
                 'en': 'en_core_web_md',
                 'es': 'es_core_news_md',
                 'fr': 'fr_core_news_md',
                 'it': 'it_core_news_md',
                 'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md'}
+                'pt': 'pt_core_news_md',
+                'ru': 'ru_core_news_md',
+                'zh': 'zh_core_web_md'}


 def parse_args():
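Editor's note: the SPACY_MODELS mapping doubles as the source of truth for the `-l` option's accepted values. A minimal sketch of how the two plausibly fit together; the parser wiring is an assumption for illustration, not the script's exact code:

``` python
# Sketch: restrict -l to the supported language codes and resolve the model.
from argparse import ArgumentParser

SPACY_MODELS = {'da': 'da_core_news_md', 'de': 'de_core_news_md',
                'el': 'el_core_news_md', 'en': 'en_core_web_md',
                'es': 'es_core_news_md', 'fr': 'fr_core_news_md',
                'it': 'it_core_news_md', 'nl': 'nl_core_news_md',
                'pt': 'pt_core_news_md', 'ru': 'ru_core_news_md',
                'zh': 'zh_core_web_md'}

parser = ArgumentParser(description='NLP pipeline for text file processing')
parser.add_argument('-l', choices=sorted(SPACY_MODELS.keys()), required=True)
args = parser.parse_args(['-l', 'en'])  # example invocation
print(SPACY_MODELS[args.l])             # -> en_core_web_md
```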
10 spacy-nlp
@@ -9,14 +9,20 @@ import os
 import spacy
 import textwrap

-SPACY_MODELS = {'de': 'de_core_news_md',
+SPACY_MODELS = {'da': 'da_core_news_md',
+                'de': 'de_core_news_md',
                 'el': 'el_core_news_md',
                 'en': 'en_core_web_md',
                 'es': 'es_core_news_md',
                 'fr': 'fr_core_news_md',
                 'it': 'it_core_news_md',
                 'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md'}
+                'pt': 'pt_core_news_md',
+                'ru': 'ru_core_news_md',
+                'zh': 'zh_core_web_md'}
+
+SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
+SPACY_VERSION = os.environ.get('SPACY_VERSION')
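Editor's note: the two environment lookups added here read the versions the Dockerfile exports via ENV. A sketch of one plausible use, stamping results with exact tool versions; the metadata dict is an assumption for illustration:

``` python
# Inside the container, SPACY_VERSION and SPACY_MODELS_VERSION come from
# the Dockerfile's ENV instructions ('3.0.3' and '3.0.0' respectively).
import os

metadata = {
    'spacy_version': os.environ.get('SPACY_VERSION'),
    'spacy_models_version': os.environ.get('SPACY_MODELS_VERSION'),
}
print(metadata)  # e.g. {'spacy_version': '3.0.3', 'spacy_models_version': '3.0.0'}
```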
wrapper/nlp
@@ -5,7 +5,7 @@ from argparse import ArgumentParser
 import os
 import subprocess

-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
 UID = str(os.getuid())
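Editor's note: pinning CONTAINER_IMAGE to the 1.0.0 tag means the wrapper always runs the release matching its own version. A hedged sketch of how a wrapper like this can assemble the docker command; the argument plumbing is illustrative, not the actual script at wrapper/nlp:

``` python
# Illustrative reconstruction, not the wrapper's exact code.
import os
import subprocess

CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
UID = str(os.getuid())
GID = str(os.getgid())

cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID),
       '-v', '{}:{}'.format(os.path.abspath('input'), CONTAINER_INPUT_DIR),
       '-v', '{}:{}'.format(os.path.abspath('output'), CONTAINER_OUTPUT_DIR),
       CONTAINER_IMAGE,
       '-i', CONTAINER_INPUT_DIR, '-l', 'en', '-o', CONTAINER_OUTPUT_DIR]
subprocess.call(cmd)
```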