Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git (synced 2025-01-13 05:30:34 +00:00)
Commit d620c29f27 "Fix version 1.0.0"
Parent: 2ced38504c
Dockerfile (27 changed lines)
@@ -1,41 +1,43 @@
 FROM debian:buster-slim


-LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
+LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"


 ENV LANG=C.UTF-8


+RUN apt-get update


 # Install pipeline dependencies #
 ## Install pyFlow ##
 ENV PYFLOW_RELEASE=1.1.20
 ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
 RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
     && cd "pyflow-${PYFLOW_RELEASE}" \
-    && apt-get update \
     && apt-get install --no-install-recommends --yes \
         python2.7 \
-    && rm -r /var/lib/apt/lists/* \
     && python2.7 setup.py build install \
     && cd .. \
     && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"


 ## Install spaCy ##
-ENV SPACY_VERSION=2.3.2
+ENV SPACY_VERSION=3.0.3
-ENV SPACY_MODELS_VERSION=2.3.0
-RUN apt-get update \
-    && apt-get install --no-install-recommends --yes \
+RUN apt-get install --no-install-recommends --yes \
         python3.7 \
         python3-pip \
         zip \
-    && rm -r /var/lib/apt/lists/* \
     && pip3 install \
         chardet \
         setuptools \
         wheel \
-    && pip3 install "spacy==${SPACY_VERSION}" \
+    && pip3 install --upgrade pip \
+    && pip3 install "spacy==${SPACY_VERSION}"

+ENV SPACY_MODELS_VERSION=3.0.0
+RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \
     && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \
     && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \
     && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \
@@ -43,12 +45,17 @@ RUN apt-get update \
     && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \
     && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \
     && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \
-    && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct
+    && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+    && python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+    && python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct


 ## Install Pipeline ##
 COPY nlp spacy-nlp /usr/local/bin/


+RUN rm -r /var/lib/apt/lists/*


 ENTRYPOINT ["nlp"]
 CMD ["--help"]
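The models are fetched with `--direct`, so each package name is pinned to `SPACY_MODELS_VERSION`; inside the image they are importable under their bare names, and the two `ENV` values stay visible to the scripts at runtime. A minimal sketch of loading one of the pinned models (illustrative only, assuming a container built from this Dockerfile; not part of the pipeline code):

``` python
import os

import spacy

# The image pins these via ENV; spacy-nlp reads them back the same way.
SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')  # '3.0.0'

# A model installed with "python3 -m spacy download <name>-<version> --direct"
# is a regular package, loadable by its bare name.
nlp = spacy.load('de_core_news_md')
doc = nlp('Das ist ein Test.')
print([(token.text, token.pos_) for token in doc])
```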
README.md (120 changed lines)
@@ -1,74 +1,88 @@
-# Natural language processing
+# NLP - Natural Language Processing

-This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io).
+This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service, but you can also use it standalone; for that purpose a convenient wrapper script is provided.

-## Build image
+## Software used in this pipeline implementation
+
+- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
+- spaCy (3.0.3): https://github.com/explosion/spaCy/releases/tag/v3.0.3
+- spaCy medium sized models (3.0.0):
+  - https://github.com/explosion/spacy-models/releases/tag/da_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/pt_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0

-1. Clone this repository and navigate into it:
-```
-git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp
-```
+## Use this image
+
+1. Create input and output directories for the pipeline.
+``` bash
+mkdir -p /<my_data_location>/input /<my_data_location>/output
+```

-2. Build image:
-```
-docker build -t sfb1288inf/nlp:latest .
-```
+2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language.

-Alternatively build from the GitLab repository without cloning:
-
-1. Build image:
-```
-docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
-```
-
-## Download prebuilt image
-
-The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
-
-1. Download image:
-```
-docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest
-```
-
-## Run
-
-1. Create input and output directories for the NLP software:
-```
-mkdir -p /<mydatalocation>/files_for_nlp /<mydatalocation>/files_from_nlp
-```
-
-2. Place your text files inside the `/<mydatalocation>/files_for_nlp` directory. Files should all contain text of the same language.
-
-3. Start the NLP process.
-```
+3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
+```
+# Option one: Use the wrapper script
+## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executable and add it to your ${PATH}
+cd /<my_data_location>
+nlp -i input -l <language_code> -o output <optional_pipeline_arguments>
+
+# Option two: Classic Docker style
 docker run \
     --rm \
     -it \
     -u $(id -u $USER):$(id -g $USER) \
-    -v /<mydatalocation>/files_for_nlp:/input \
-    -v /<mydatalocation>/files_from_nlp:/output \
-    sfb1288inf/nlp:latest \
+    -v /<my_data_location>/input:/input \
+    -v /<my_data_location>/output:/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
         -i /input \
-        -l <languagecode> \
-        -o /output
+        -l <language_code> \
+        -o /output \
+        <optional_pipeline_arguments>
 ```
-The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part.

-If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`.
-
-4. Check your results in the `/<mydatalocation>/files_from_nlp` directory.
+4. Check your results in the `/<my_data_location>/output` directory.

-### NLP arguments
+### Pipeline arguments

-`-i path`
-* Sets the input directory using the specified path.
-* required = True
-
-`-o path`
-* Sets the output directory using the specified path.
-* required = True
+`--check-encoding`
+* If set, the pipeline tries to automatically determine the right encoding for your texts. Only use it if you are not sure that your input is provided in UTF-8.
+* default = False
+* required = False

 `-l languagecode`
 * Tells spaCy which language will be used.
-* options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese)
+* options = da (Danish), de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese), ru (Russian), zh (Chinese)
 * required = True

+`--nCores corenumber`
+* Sets the number of CPU cores being used during the NLP process.
+* default = min(4, multiprocessing.cpu_count())
+* required = False
+
+``` bash
+# Example with all arguments used
+docker run \
+    --rm \
+    -it \
+    -u $(id -u $USER):$(id -g $USER) \
+    -v "$HOME"/nlp/input:/input \
+    -v "$HOME"/nlp/output:/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
+        -i /input \
+        -l en \
+        -o /output \
+        --check-encoding \
+        --nCores 8
+```
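The `--check-encoding` flag lines up with the `chardet` package installed in the Dockerfile; presumably the pipeline guesses each file's encoding before decoding it. A minimal sketch of that kind of check (an assumption about the mechanism, not the pipeline's actual code; `example.txt` is a placeholder):

``` python
import chardet

# Guess the encoding from the raw bytes before decoding, as a
# --check-encoding style option would do for each input file.
with open('example.txt', 'rb') as f:
    raw = f.read()
guess = chardet.detect(raw)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}
text = raw.decode(guess['encoding'] or 'utf-8')
print(guess['encoding'], guess['confidence'])
```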
nlp (17 changed lines)
@@ -1,13 +1,11 @@
 #!/usr/bin/env python2.7
 # coding=utf-8

-"""
-nlp
+"""An NLP pipeline for text file processing."""

-Usage: For usage instructions run with option --help
-Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
-         Stephan Porada <sporada@uni-bielefeld.de>
-"""
+__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
+             'Stephan Porada <porada@posteo.de>'
+__version__ = '1.0.0'

 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
@@ -16,14 +14,17 @@ import os
 import sys


-SPACY_MODELS = {'de': 'de_core_news_md',
+SPACY_MODELS = {'da': 'da_core_news_md',
+                'de': 'de_core_news_md',
                 'el': 'el_core_news_md',
                 'en': 'en_core_web_md',
                 'es': 'es_core_news_md',
                 'fr': 'fr_core_news_md',
                 'it': 'it_core_news_md',
                 'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md'}
+                'pt': 'pt_core_news_md',
+                'ru': 'ru_core_news_md',
+                'zh': 'zh_core_web_md'}


 def parse_args():
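With `da`, `ru` and `zh` added, deriving the `-l` choices from `SPACY_MODELS` keeps the CLI and the dict in sync; a sketch of that argparse pattern (the actual `parse_args()` body is not shown in this diff):

``` python
from argparse import ArgumentParser

SPACY_MODELS = {'da': 'da_core_news_md', 'de': 'de_core_news_md',
                'el': 'el_core_news_md', 'en': 'en_core_web_md',
                'es': 'es_core_news_md', 'fr': 'fr_core_news_md',
                'it': 'it_core_news_md', 'nl': 'nl_core_news_md',
                'pt': 'pt_core_news_md', 'ru': 'ru_core_news_md',
                'zh': 'zh_core_web_md'}

parser = ArgumentParser(description='NLP pipeline for text file processing.')
# Restrict -l to the languages a model actually exists for.
parser.add_argument('-l', choices=sorted(SPACY_MODELS.keys()), required=True)
args = parser.parse_args(['-l', 'da'])
print(SPACY_MODELS[args.l])  # -> da_core_news_md
```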
spacy-nlp (10 changed lines)
@@ -9,14 +9,20 @@ import os
 import spacy
 import textwrap

-SPACY_MODELS = {'de': 'de_core_news_md',
+SPACY_MODELS = {'da': 'da_core_news_md',
+                'de': 'de_core_news_md',
                 'el': 'el_core_news_md',
                 'en': 'en_core_web_md',
                 'es': 'es_core_news_md',
                 'fr': 'fr_core_news_md',
                 'it': 'it_core_news_md',
                 'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md'}
+                'pt': 'pt_core_news_md',
+                'ru': 'ru_core_news_md',
+                'zh': 'zh_core_web_md'}


 SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
 SPACY_VERSION = os.environ.get('SPACY_VERSION')
wrapper/nlp

@@ -5,7 +5,7 @@ from argparse import ArgumentParser
 import os
 import subprocess

-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
 UID = str(os.getuid())
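Pinning `CONTAINER_IMAGE` to `1.0.0` ties the wrapper to the release matching its own `__version__`. The rest of the script is truncated here; the sketch below only illustrates how such a wrapper typically assembles the `docker run` call from those constants (the `input`/`output` directory names and the group id handling are assumptions):

``` python
import os
import subprocess
import sys

CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
UID = str(os.getuid())
GID = str(os.getgid())  # assumed; only UID appears in the diff

# Mount the host directories into the container and forward the
# remaining pipeline arguments unchanged.
cmd = ['docker', 'run', '--rm', '-it', '-u', UID + ':' + GID,
       '-v', os.path.abspath('input') + ':' + CONTAINER_INPUT_DIR,
       '-v', os.path.abspath('output') + ':' + CONTAINER_OUTPUT_DIR,
       CONTAINER_IMAGE,
       '-i', CONTAINER_INPUT_DIR,
       '-o', CONTAINER_OUTPUT_DIR]
cmd += sys.argv[1:]
subprocess.call(cmd)
```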