From 6c8b32fad4cc9c6dca18e4183f1104b9981a8b9a Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Mon, 20 May 2019 12:08:13 +0200 Subject: [PATCH] Update --- Dockerfile | 1 + README.md | 86 ++++++++++++++++++++++++++++++++++++++---------------- nlp | 2 +- spacy_nlp | 7 +++-- 4 files changed, 67 insertions(+), 29 deletions(-) diff --git a/Dockerfile b/Dockerfile index b4ef535..7ba70b2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,6 +31,7 @@ RUN pip3 install wheel && pip3 install -U spacy && \ python3 -m spacy download en && \ python3 -m spacy download es && \ python3 -m spacy download fr && \ + python3 -m spacy download it && \ python3 -m spacy download pt COPY nlp /usr/local/bin diff --git a/README.md b/README.md index 856ae64..c11f4de 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,73 @@ # Natural language processing -This repository provides all code that is needed to build a container image for natural language processing utilising [spaCy](https://spacy.io). -In case you don't want to build the image by yourself, there is also a prebuild image that can be used in the [registry](https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/container_registry). +This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io). -## Build the image +## Build image -```console -user@machine:~$ cd -user@machine:~$ docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp . +1. Clone this repository and navigate into it: +``` +git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp ``` -## Starting a container - -```console -user@machine:~$ docker run \ - --name nlp-container \ - -dit \ - -v :/root/files_for_nlp \ - -v :/root/files_from_nlp \ - gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp +2. Build image: +``` +docker build -t sfb1288inf/nlp:latest . 
``` -## Start a natural language processing run +Alternatively build from the GitLab repository without cloning: -```console -user@machine:~$ docker exec -it nlp-container \ - nlp -i files_for_nlp -o files_from_nlp -l <languagecode> +1. Build image: +``` +docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git ``` -Where <languagecode> needs to be one of the following: +## Download prebuilt image -* de (German) -* en (English) -* es (Spanish) -* fr (French) -* pt (Portuguese) \ No newline at end of file +The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers. + +1. Download image: +``` +docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest +``` + +## Run + +1. Create input and output directories for the NLP software: +``` +mkdir -p /<mydatalocation>/files_for_nlp /<mydatalocation>/files_from_nlp +``` + +2. Place your text files inside the `/<mydatalocation>/files_for_nlp` directory. Files should all contain text of the same language. + +3. Start the NLP process. +``` +docker run \ + --rm \ + -it \ + -v /<mydatalocation>/files_for_nlp:/files_for_nlp \ + -v /<mydatalocation>/files_from_nlp:/files_from_nlp \ + sfb1288inf/nlp:latest \ + -i /files_for_nlp \ + -o /files_from_nlp \ + -l <languagecode> +``` +The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part. + +If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`. + +4. Check your results in the `/<mydatalocation>/files_from_nlp` directory. + +### NLP arguments + +`-i path` +* Sets the input directory using the specified path. +* required = True + +`-o path` +* Sets the output directory using the specified path. +* required = True + +`-l languagecode` +* Tells spaCy which language will be used. 
+* options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese) +* required = True diff --git a/nlp b/nlp index af92e18..203c922 100755 --- a/nlp +++ b/nlp @@ -28,7 +28,7 @@ def parse_arguments(): ) parser.add_argument( '-l', - choices=['de', 'en', 'es', 'fr', 'pt'], + choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'], dest='lang', required=True ) diff --git a/spacy_nlp b/spacy_nlp index e01bb05..abf0fe9 100755 --- a/spacy_nlp +++ b/spacy_nlp @@ -15,7 +15,7 @@ parser.add_argument( ) parser.add_argument( '-l', - choices=['de', 'en', 'es', 'fr', 'pt'], + choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'], dest='lang', required=True ) @@ -26,8 +26,9 @@ parser.add_argument( args = parser.parse_args() SPACY_MODELS = { - 'de': 'de_core_news_sm', 'en': 'en_core_web_sm', 'es': 'es_core_news_sm', - 'fr': 'fr_core_news_sm', 'pt': 'pt_core_news_sm' + 'de': 'de_core_news_sm', 'el': 'el_core_news_sm', 'en': 'en_core_web_sm', + 'es': 'es_core_news_sm', 'fr': 'fr_core_news_sm', 'it': 'it_core_news_sm', + 'nl': 'nl_core_news_sm', 'pt': 'pt_core_news_sm' } # Set the language model for spacy