mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git

Fix version 1.0.0

Dockerfile (27 changed lines)

@@ -1,41 +1,43 @@
 FROM debian:buster-slim
 
 
-LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
+LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
 
 
 ENV LANG=C.UTF-8
 
 
+RUN apt-get update
+
+
 # Install pipeline dependencies #
 ## Install pyFlow ##
 ENV PYFLOW_RELEASE=1.1.20
 ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
 RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
  && cd "pyflow-${PYFLOW_RELEASE}" \
- && apt-get update \
  && apt-get install --no-install-recommends --yes \
       python2.7 \
- && rm -r /var/lib/apt/lists/* \
  && python2.7 setup.py build install \
  && cd .. \
  && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
 
 
 ## Install spaCy ##
-ENV SPACY_VERSION=2.3.2
-ENV SPACY_MODELS_VERSION=2.3.0
-RUN apt-get update \
- && apt-get install --no-install-recommends --yes \
+ENV SPACY_VERSION=3.0.3
+RUN apt-get install --no-install-recommends --yes \
       python3.7 \
       python3-pip \
       zip \
- && rm -r /var/lib/apt/lists/* \
  && pip3 install \
       chardet \
       setuptools \
       wheel \
- && pip3 install "spacy==${SPACY_VERSION}" \
+ && pip3 install --upgrade pip \
+ && pip3 install "spacy==${SPACY_VERSION}"
+
+ENV SPACY_MODELS_VERSION=3.0.0
+RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \
@@ -43,12 +45,17 @@ RUN apt-get update \
  && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct
+ && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+ && python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+ && python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct
 
 
 ## Install Pipeline ##
 COPY nlp spacy-nlp /usr/local/bin/
 
 
+RUN rm -r /var/lib/apt/lists/*
+
+
 ENTRYPOINT ["nlp"]
 CMD ["--help"]
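
Because the version pins are set with ENV rather than ARG, SPACY_VERSION and SPACY_MODELS_VERSION stay visible inside every container started from this image; the spacy-nlp script below reads them back via os.environ.get. A minimal sketch of a runtime consistency check, assuming it is executed inside the built image (de_core_news_md is one of the pinned models from the download list above):

```python
import os

import spacy

# Set via ENV in the Dockerfile, so visible to any process in the container.
SPACY_VERSION = os.environ.get('SPACY_VERSION')                # '3.0.3'
SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')  # '3.0.0'

# The installed library should match the pinned version exactly.
assert spacy.__version__ == SPACY_VERSION

# Models installed with "spacy download --direct" are ordinary packages,
# loadable by name; their metadata carries the model version.
nlp = spacy.load('de_core_news_md')
assert nlp.meta['version'] == SPACY_MODELS_VERSION
```
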

README.md (120 changed lines)

@@ -1,74 +1,88 @@
# Natural language processing
# NLP - Natural Language Processing

This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io).
This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided.

## Build image
## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- spaCy (3.0.3): https://github.com/explosion/spaCy/releases/tag/v3.0.3
- spaCy medium sized models (3.0.0):
  - https://github.com/explosion/spacy-models/releases/tag/da_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/pt_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0

1. Clone this repository and navigate into it:
```
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp

## Use this image

1. Create input and output directories for the pipeline.
``` bash
mkdir -p /<my_data_location>/input /<my_data_location>/output
```

2. Build image:
```
docker build -t sfb1288inf/nlp:latest .
```
2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language.

Alternatively build from the GitLab repository without cloning:

1. Build image:
```
docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
```
# Option one: Use the wrapper script
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executable and add it to your ${PATH}
cd /<my_data_location>
nlp -i input -l <language_code> -o output <optional_pipeline_arguments>

## Download prebuilt image

The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.

1. Download image:
```
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest
```

## Run

1. Create input and output directories for the NLP software:
```
mkdir -p /<mydatalocation>/files_for_nlp /<mydatalocation>/files_from_nlp
```

2. Place your text files inside the `/<mydatalocation>/files_for_nlp` directory. Files should all contain text of the same language.

3. Start the NLP process.
```
# Option two: Classic Docker style
docker run \
    --rm \
    -it \
    -u $(id -u $USER):$(id -g $USER) \
    -v /<mydatalocation>/files_for_nlp:/input \
    -v /<mydatalocation>/files_from_nlp:/output \
    sfb1288inf/nlp:latest \
    -v /<my_data_location>/input:/input \
    -v /<my_data_location>/output:/output \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
        -i /input \
        -l <languagecode> \
        -o /output
        -l <language_code> \
        -o /output \
        <optional_pipeline_arguments>
```
The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part.

If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`.
4. Check your results in the `/<my_data_location>/output` directory.
```

4. Check your results in the `/<mydatalocation>/files_from_nlp` directory.
### Pipeline arguments

### NLP arguments

`-i path`
* Sets the input directory using the specified path.
* required = True

`-o path`
* Sets the output directory using the specified path.
* required = True
`--check-encoding`
* If set, the pipeline tries to automatically determine the right encoding for
your texts. Only use it if you are not sure that your input is provided in UTF-8.
* default = False
* required = False

`-l languagecode`
* Tells spaCy which language will be used.
* options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese)
* options = da (Danish), de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese), ru (Russian), zh (Chinese)
* required = True

`--nCores corenumber`
* Sets the number of CPU cores being used during the NLP process.
* default = min(4, multiprocessing.cpu_count())
* required = False

``` bash
# Example with all arguments used
docker run \
    --rm \
    -it \
    -u $(id -u $USER):$(id -g $USER) \
    -v "$HOME"/ocr/input:/input \
    -v "$HOME"/ocr/output:/output \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
        -i /input \
        -l en \
        -o /output \
        --check-encoding \
        --nCores 8
```
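
The new `--check-encoding` option is presumably why the Dockerfile above installs chardet. A rough sketch of the idea behind such a check; the read_text helper below is illustrative, not the pipeline's actual code:

```python
import chardet

def read_text(path, check_encoding=False):
    """Read a text file, optionally guessing its encoding first."""
    with open(path, 'rb') as file:
        raw = file.read()
    if check_encoding:
        # chardet.detect returns e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73}
        encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    else:
        # Without the flag, the input is expected to be UTF-8 already.
        encoding = 'utf-8'
    return raw.decode(encoding)
```
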

nlp (17 changed lines)

@@ -1,13 +1,11 @@
 #!/usr/bin/env python2.7
 # coding=utf-8
 
-"""
-nlp
+"""A NLP pipeline for text file processing."""
 
-Usage:  For usage instructions run with option --help
-Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de
-         Stephan Porada <sporada@uni-bielefeld.de>
-"""
+__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
+             'Stephan Porada <porada@posteo.de>'
+__version__ = '1.0.0'
 
 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
@@ -16,14 +14,17 @@ import os
 import sys
 
 
-SPACY_MODELS = {'de': 'de_core_news_md',
+SPACY_MODELS = {'da': 'da_core_news_md',
+                'de': 'de_core_news_md',
                 'el': 'el_core_news_md',
                 'en': 'en_core_web_md',
                 'es': 'es_core_news_md',
                 'fr': 'fr_core_news_md',
                 'it': 'it_core_news_md',
                 'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md'}
+                'pt': 'pt_core_news_md',
+                'ru': 'ru_core_news_md',
+                'zh': 'zh_core_web_md'}
 
 
 def parse_args():
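
The nlp script builds on pyFlow's WorkflowRunner, which is what makes the pipeline "heavily parallelized": each input file can become an independent task, and the `--nCores` argument bounds how many run at once. A condensed sketch under that assumption; the workflow class and the spacy-nlp invocation here are illustrative, not the script's real structure, and are kept Python 2.7 compatible to match the shebang:

```python
import sys

from pyflow import WorkflowRunner

class NLPPipeline(WorkflowRunner):
    """Hypothetical workflow: one spacy-nlp task per input file."""

    def __init__(self, input_files, language):
        self.input_files = input_files
        self.language = language

    def workflow(self):
        for i, input_file in enumerate(self.input_files):
            # The tasks are independent, so pyFlow may run them in
            # parallel, bounded by the nCores value passed to run().
            command = 'spacy-nlp -l {} "{}"'.format(self.language, input_file)
            self.addTask('nlp_{}'.format(i), command=command)

pipeline = NLPPipeline(['a.txt', 'b.txt'], 'de')
sys.exit(pipeline.run(mode='local', nCores=4))
```
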

spacy-nlp (10 changed lines)

@@ -9,14 +9,20 @@ import os
 import spacy
 import textwrap
 
-SPACY_MODELS = {'de': 'de_core_news_md',
+
+SPACY_MODELS = {'da': 'da_core_news_md',
+                'de': 'de_core_news_md',
                 'el': 'el_core_news_md',
                 'en': 'en_core_web_md',
                 'es': 'es_core_news_md',
                 'fr': 'fr_core_news_md',
                 'it': 'it_core_news_md',
                 'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md'}
+                'pt': 'pt_core_news_md',
+                'ru': 'ru_core_news_md',
+                'zh': 'zh_core_web_md'}
 
+
+SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
 SPACY_VERSION = os.environ.get('SPACY_VERSION')
 
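
Inside the container, the model matching the requested language code is already installed, so the core step of spacy-nlp boils down to loading that model and iterating over the annotated document. A minimal sketch; the annotate helper is an illustration, not the script's real structure:

```python
import spacy

# Excerpt of the mapping above; the full table covers all eleven languages.
SPACY_MODELS = {'de': 'de_core_news_md', 'en': 'en_core_web_md'}

def annotate(text, language):
    nlp = spacy.load(SPACY_MODELS[language])
    doc = nlp(text)
    # Per-token annotations of this kind are what the pipeline exports.
    return [(token.text, token.lemma_, token.pos_) for token in doc]

print(annotate('Das ist ein Beispiel.', 'de'))
```
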
wrapper/nlp (2 changed lines)

@@ -5,7 +5,7 @@ from argparse import ArgumentParser
 import os
 import subprocess
 
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
 UID = str(os.getuid())
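
The constants above are everything the wrapper needs to reproduce the "classic Docker style" command from the README. A condensed sketch of that translation, assuming Docker is available on the host; run_pipeline is a hypothetical helper, and the real script handles more arguments:

```python
import os
import subprocess

CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
UID = str(os.getuid())
GID = str(os.getgid())

def run_pipeline(input_dir, output_dir, language):
    # Mirrors: docker run --rm -it -u $(id -u):$(id -g) -v ... IMAGE -i ... -l ... -o ...
    cmd = ['docker', 'run', '--rm', '-it', '-u', UID + ':' + GID,
           '-v', os.path.abspath(input_dir) + ':' + CONTAINER_INPUT_DIR,
           '-v', os.path.abspath(output_dir) + ':' + CONTAINER_OUTPUT_DIR,
           CONTAINER_IMAGE,
           '-i', CONTAINER_INPUT_DIR,
           '-l', language,
           '-o', CONTAINER_OUTPUT_DIR]
    return subprocess.call(cmd)

# Equivalent to: nlp -i input -l en -o output
run_pipeline('input', 'output', 'en')
```
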