mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-21 23:55:38 +00:00 
			
		
		
		
	Compare commits
	
		
			44 Commits
		
	
	
		
			1.0
			...
			d620c29f27
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | d620c29f27 | ||
|  | 2ced38504c | ||
|  | f02c0953bf | ||
|  | 5329446277 | ||
|  | 15e373db58 | ||
|  | 8afdfb13b2 | ||
|  | 1ed42f68ad | ||
|  | 42583fea46 | ||
|  | 5bd0feda5c | ||
|  | 5980a995e5 | ||
|  | fe7ab93513 | ||
|  | 91708308bc | ||
|  | 887e814020 | ||
|  | 3fc6ebff4c | ||
|  | bef51b7d81 | ||
|  | 68e86338d4 | ||
|  | 30d127f3af | ||
|  | e061a7426d | ||
|  | 41910afb79 | ||
|  | 5d2fee029e | ||
|  | 6e87e0decd | ||
|  | 79043f3dd7 | ||
|  | 1a3e4a0a02 | ||
|  | 504861ae07 | ||
|  | 88d03d4360 | ||
|  | 6769be049a | ||
|  | ec2cf1dcff | ||
|  | e4ef4835e5 | ||
|  | 5f20f9be40 | ||
|  | b0a402b3ac | ||
|  | 543a1ba29a | ||
|  | d5a2d38c17 | ||
|  | 4af9d9c899 | ||
|  | de8160a5b6 | ||
|  | d564ed0464 | ||
|  | abf6c430c3 | ||
|  | 19426a4c78 | ||
|  | a32184db5c | ||
|  | a16b010bdc | ||
|  | af293d6141 | ||
|  | 43717de313 | ||
|  | 48fb20ae6b | ||
|  | 2f57b1a0af | ||
|  | e68d5c39ee | 
| @@ -1,42 +1,71 @@ | ||||
| image: docker:latest | ||||
| image: docker:19.03.13 | ||||
|  | ||||
| variables: | ||||
|   DOCKER_TLS_CERTDIR: "/certs" | ||||
|  | ||||
| services: | ||||
|   - docker:dind | ||||
|   - docker:19.03.13-dind | ||||
|  | ||||
| stages: | ||||
|   - build | ||||
|   - push | ||||
|  | ||||
| before_script: | ||||
|   - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||
| .reg_setup: | ||||
|   before_script: | ||||
|     - apk add --no-cache curl | ||||
|     - curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg | ||||
|     - echo "$REG_SHA256  /usr/local/bin/reg" | sha256sum -c - | ||||
|     - chmod a+x /usr/local/bin/reg | ||||
|   variables: | ||||
|     REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228 | ||||
|     REG_VERSION: 0.16.1 | ||||
|  | ||||
| Build: | ||||
| build_image: | ||||
|   script: | ||||
|     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||
|     - docker build --pull -t $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA . | ||||
|     - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|     - docker build -t $INTERMEDIATE_IMAGE_TAG . | ||||
|     - docker push $INTERMEDIATE_IMAGE_TAG | ||||
|   stage: build | ||||
|   tags: | ||||
|   - docker | ||||
|     - docker | ||||
|   variables: | ||||
|     INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|  | ||||
| Push latest: | ||||
| push_master: | ||||
|   extends: | ||||
|     - .reg_setup | ||||
|   only: | ||||
|     - master | ||||
|   script: | ||||
|     - docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|     - docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA $CI_REGISTRY_IMAGE:latest | ||||
|     - docker push $CI_REGISTRY_IMAGE:latest | ||||
|     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||
|     - docker pull $INTERMEDIATE_IMAGE_TAG | ||||
|     - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG | ||||
|     - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG | ||||
|     - docker push $IMAGE_TAG | ||||
|   stage: push | ||||
|   tags: | ||||
|   - docker | ||||
|     - docker | ||||
|   variables: | ||||
|     IMAGE_TAG: $CI_REGISTRY_IMAGE:latest | ||||
|     INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|  | ||||
| Push tag: | ||||
| push_other: | ||||
|   extends: | ||||
|     - .reg_setup | ||||
|   except: | ||||
|     - master | ||||
|   only: | ||||
|     - branches | ||||
|     - tags | ||||
|   script: | ||||
|     - docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|     - docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME | ||||
|     - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME | ||||
|     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||
|     - docker pull $INTERMEDIATE_IMAGE_TAG | ||||
|     - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG | ||||
|     - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG | ||||
|     - docker push $IMAGE_TAG | ||||
|   stage: push | ||||
|   tags: | ||||
|   - docker | ||||
|     - docker | ||||
|   variables: | ||||
|     IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME | ||||
|     INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|   | ||||
							
								
								
									
										84
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										84
									
								
								Dockerfile
									
									
									
									
									
								
							| @@ -1,43 +1,61 @@ | ||||
| FROM debian:9-slim | ||||
| FROM debian:buster-slim | ||||
|  | ||||
|  | ||||
| LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>" | ||||
|  | ||||
| LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de" | ||||
|  | ||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||
| ENV LANG=C.UTF-8 | ||||
|  | ||||
| RUN apt-get update && \ | ||||
|     apt-get install -y --no-install-recommends \ | ||||
|     build-essential \ | ||||
|     ca-certificates \ | ||||
|     python2.7 \ | ||||
|     python3.5 \ | ||||
|     python3-dev \ | ||||
|     python3-pip \ | ||||
|     python3-setuptools \ | ||||
|     wget | ||||
|  | ||||
| # Install pyFlow | ||||
| ENV PYFLOW_VERSION 1.1.20 | ||||
| RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \ | ||||
|     tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \ | ||||
|     cd pyflow-"$PYFLOW_VERSION" && \ | ||||
|     python2.7 setup.py build install && \ | ||||
|     cd .. && \ | ||||
|     rm -r pyflow-"$PYFLOW_VERSION" pyflow-"$PYFLOW_VERSION".tar.gz | ||||
| RUN apt-get update | ||||
|  | ||||
| # Install spaCy | ||||
| RUN pip3 install wheel && pip3 install -U spacy && \ | ||||
|     python3.5 -m spacy download de && \ | ||||
|     python3.5 -m spacy download en && \ | ||||
|     python3.5 -m spacy download es && \ | ||||
|     python3.5 -m spacy download fr && \ | ||||
|     python3.5 -m spacy download it && \ | ||||
|     python3.5 -m spacy download pt | ||||
|  | ||||
| COPY nlp /usr/local/bin | ||||
| COPY spacy_nlp /usr/local/bin | ||||
| # Install pipeline dependencies # | ||||
| ## Install pyFlow ## | ||||
| ENV PYFLOW_RELEASE=1.1.20 | ||||
| ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" . | ||||
| RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \ | ||||
|  && cd "pyflow-${PYFLOW_RELEASE}" \ | ||||
|  && apt-get install --no-install-recommends --yes \ | ||||
|       python2.7 \ | ||||
|  && python2.7 setup.py build install \ | ||||
|  && cd .. \ | ||||
|  && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz" | ||||
|  | ||||
|  | ||||
| ## Install spaCy ## | ||||
| ENV SPACY_VERSION=3.0.3 | ||||
| RUN apt-get install --no-install-recommends --yes \ | ||||
|       python3.7 \ | ||||
|       python3-pip \ | ||||
|       zip \ | ||||
|  && pip3 install \ | ||||
|       chardet \ | ||||
|       setuptools \ | ||||
|       wheel \ | ||||
|  && pip3 install --upgrade pip \ | ||||
|  && pip3 install "spacy==${SPACY_VERSION}" | ||||
|  | ||||
| ENV SPACY_MODELS_VERSION=3.0.0 | ||||
| RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "es_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct | ||||
|  | ||||
|  | ||||
| ## Install Pipeline ## | ||||
| COPY nlp spacy-nlp /usr/local/bin/ | ||||
|  | ||||
|  | ||||
| RUN rm -r /var/lib/apt/lists/* | ||||
|  | ||||
| RUN mkdir /input /output && \ | ||||
|     chmod a+rw /input /output | ||||
|  | ||||
| ENTRYPOINT ["nlp"] | ||||
| CMD ["--help"] | ||||
|   | ||||
							
								
								
									
										120
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										120
									
								
								README.md
									
									
									
									
									
								
							| @@ -1,74 +1,88 @@ | ||||
| # Natural language processing | ||||
| # NLP - Natural Language Processing | ||||
|  | ||||
| This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io). | ||||
| This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided. | ||||
|  | ||||
| ## Build image | ||||
| ## Software used in this pipeline implementation | ||||
| - Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian | ||||
| - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 | ||||
| - spaCy (3.0.3): https://github.com/explosion/spaCy/releases/tag/v3.0.3 | ||||
| - spaCy medium sized models (3.0.0): | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/da_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/pt_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0 | ||||
|  | ||||
| 1. Clone this repository and navigate into it: | ||||
| ``` | ||||
| git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp | ||||
|  | ||||
| ## Use this image | ||||
|  | ||||
| 1. Create input and output directories for the pipeline. | ||||
| ``` bash | ||||
| mkdir -p /<my_data_location>/input /<my_data_location>/output | ||||
| ``` | ||||
|  | ||||
| 2. Build image: | ||||
| ``` | ||||
| docker build -t sfb1288inf/nlp:latest . | ||||
| ``` | ||||
| 2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language. | ||||
|  | ||||
| Alternatively build from the GitLab repository without cloning: | ||||
|  | ||||
| 1. Build image: | ||||
| ``` | ||||
| docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git | ||||
| 3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details. | ||||
| ``` | ||||
| # Option one: Use the wrapper script | ||||
| ## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executable and add it to your ${PATH} | ||||
| cd /<my_data_location> | ||||
| nlp -i input -l <language_code> -o output <optional_pipeline_arguments> | ||||
|  | ||||
| ## Download prebuilt image | ||||
|  | ||||
| The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers. | ||||
|  | ||||
| 1. Download image: | ||||
| ``` | ||||
| docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest | ||||
| ``` | ||||
|  | ||||
| ## Run | ||||
|  | ||||
| 1. Create input and output directories for the NLP software: | ||||
| ``` | ||||
| mkdir -p /<mydatalocation>/files_for_nlp /<mydatalocation>/files_from_nlp | ||||
| ``` | ||||
|  | ||||
| 2. Place your text files inside the `/<mydatalocation>/files_for_nlp` directory. Files should all contain text of the same language. | ||||
|  | ||||
| 3. Start the NLP process. | ||||
| ``` | ||||
| # Option two: Classic Docker style | ||||
| docker run \ | ||||
|     --rm \ | ||||
|     -it \ | ||||
|     -u $(id -u $USER):$(id -g $USER) \ | ||||
|     -v /<mydatalocation>/files_for_nlp:/input \ | ||||
|     -v /<mydatalocation>/files_from_nlp:/output \ | ||||
|     sfb1288inf/nlp:latest \ | ||||
|     -v /<my_data_location>/input:/input \ | ||||
|     -v /<my_data_location>/output:/output \ | ||||
|     gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \ | ||||
|         -i /input \ | ||||
|         -l <languagecode> \ | ||||
|         -o /output | ||||
|         -l <language_code> \ | ||||
|         -o /output \ | ||||
|         <optional_pipeline_arguments> | ||||
| ``` | ||||
| The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part. | ||||
|  | ||||
| If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`. | ||||
| 4. Check your results in the `/<my_data_location>/output` directory. | ||||
| ``` | ||||
|  | ||||
| 4. Check your results in the `/<mydatalocation>/files_from_nlp` directory. | ||||
| ### Pipeline arguments | ||||
|  | ||||
| ### NLP arguments | ||||
|  | ||||
| `-i path` | ||||
| * Sets the input directory using the specified path. | ||||
| * required = True | ||||
|  | ||||
| `-o path` | ||||
| * Sets the output directory using the specified path. | ||||
| * required = True | ||||
| `--check-encoding` | ||||
| * If set, the pipeline tries to automatically determine the right encoding for | ||||
| your texts. Only use it if you are not sure that your input is provided in UTF-8. | ||||
| * default = False | ||||
| * required = False | ||||
|  | ||||
| `-l languagecode` | ||||
| * Tells spaCy which language will be used. | ||||
| * options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese) | ||||
| * options = da (Danish), de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese), ru (Russian), zh (Chinese) | ||||
| * required = True | ||||
|  | ||||
| `--n-cores corenumber` | ||||
| * Sets the number of CPU cores being used during the NLP process. | ||||
| * default = min(4, multiprocessing.cpu_count()) | ||||
| * required = False | ||||
|  | ||||
| ``` bash | ||||
| # Example with all arguments used | ||||
| docker run \ | ||||
|     --rm \ | ||||
|     -it \ | ||||
|     -u $(id -u $USER):$(id -g $USER) \ | ||||
|     -v "$HOME"/nlp/input:/input \ | ||||
|     -v "$HOME"/nlp/output:/output \ | ||||
|     gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \ | ||||
|         -i /input \ | ||||
|         -l en \ | ||||
|         -o /output \ | ||||
|         --check-encoding \ | ||||
|         --n-cores 8 | ||||
| ``` | ||||
|   | ||||
							
								
								
									
										226
									
								
								nlp
									
									
									
									
									
								
							
							
						
						
									
										226
									
								
								nlp
									
									
									
									
									
								
							| @@ -1,133 +1,171 @@ | ||||
| #!/usr/bin/env python2.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| """ | ||||
| nlp | ||||
| """An NLP pipeline for text file processing.""" | ||||
|  | ||||
| Usage:  For usage instructions run with option --help | ||||
| Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de> | ||||
| """ | ||||
| __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \ | ||||
|              'Stephan Porada <porada@posteo.de>' | ||||
| __version__ = '1.0.0' | ||||
|  | ||||
|  | ||||
| import argparse | ||||
| from argparse import ArgumentParser | ||||
| from pyflow import WorkflowRunner | ||||
| import multiprocessing | ||||
| import os | ||||
| import sys | ||||
| from pyflow import WorkflowRunner | ||||
|  | ||||
|  | ||||
| def parse_arguments(): | ||||
|     parser = argparse.ArgumentParser( | ||||
|         description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.' | ||||
|     ) | ||||
| SPACY_MODELS = {'da': 'da_core_news_md', | ||||
|                 'de': 'de_core_news_md', | ||||
|                 'el': 'el_core_news_md', | ||||
|                 'en': 'en_core_web_md', | ||||
|                 'es': 'es_core_news_md', | ||||
|                 'fr': 'fr_core_news_md', | ||||
|                 'it': 'it_core_news_md', | ||||
|                 'nl': 'nl_core_news_md', | ||||
|                 'pt': 'pt_core_news_md', | ||||
|                 'ru': 'ru_core_news_md', | ||||
|                 'zh': 'zh_core_web_md'} | ||||
|  | ||||
|     parser.add_argument( | ||||
|         '-i', | ||||
|         dest='input_dir', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-l', | ||||
|         choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'], | ||||
|         dest='lang', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-o', | ||||
|         dest='output_dir', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--nCores', | ||||
|         default=min(4, multiprocessing.cpu_count()), | ||||
|         dest='n_cores', | ||||
|         help='total number of cores available', | ||||
|         required=False, | ||||
|         type=int | ||||
|     ) | ||||
|  | ||||
| def parse_args(): | ||||
|     parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.') | ||||
|     parser.add_argument('-i', '--input-directory', | ||||
|                         help='Input directory (only txt files get processed)', | ||||
|                         required=True) | ||||
|     parser.add_argument('-o', '--output-directory', | ||||
|                         help='Output directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('-l', '--language', | ||||
|                         choices=SPACY_MODELS.keys(), | ||||
|                         required=True) | ||||
|     parser.add_argument('--check-encoding', action='store_true') | ||||
|     parser.add_argument('--log-dir') | ||||
|     parser.add_argument('--n-cores', | ||||
|                         default=min(4, multiprocessing.cpu_count()), | ||||
|                         help='total number of cores available', type=int) | ||||
|     parser.add_argument('--zip', help='Zips everything into one archive.') | ||||
|     return parser.parse_args() | ||||
|  | ||||
|  | ||||
| class NLPWorkflow(WorkflowRunner): | ||||
|     def __init__(self, args): | ||||
|         self.jobs = analyze_jobs(args.input_dir, args.output_dir) | ||||
|         self.lang = args.lang | ||||
|         self.n_cores = args.n_cores | ||||
| class NLPPipelineJob: | ||||
|     """An NLP pipeline job class | ||||
|  | ||||
|     Each input file of the pipeline is represented as an NLP pipeline job, | ||||
|     which holds all necessary information for the pipeline to process it. | ||||
|  | ||||
|     Arguments: | ||||
|     file -- Path to the file | ||||
|     output_dir -- Path to a directory, where job results are stored | ||||
|     intermediate_dir -- Path to a directory, where intermediate files are | ||||
|                         stored. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, file, output_dir): | ||||
|         self.file = file | ||||
|         self.name = os.path.basename(file).rsplit('.', 1)[0] | ||||
|         self.output_dir = output_dir | ||||
|  | ||||
|  | ||||
| class NLPPipeline(WorkflowRunner): | ||||
|     def __init__(self, input_dir, lang, output_dir, check_encoding, n_cores, zip): | ||||
|         self.input_dir = input_dir | ||||
|         self.lang = lang | ||||
|         self.output_dir = output_dir | ||||
|         self.check_encoding = check_encoding | ||||
|         self.n_cores = n_cores | ||||
|         self.output_dir = output_dir | ||||
|         if zip is None: | ||||
|             self.zip = zip | ||||
|         else: | ||||
|             if zip.lower().endswith('.zip'): | ||||
|                 # Remove .zip file extension if provided | ||||
|                 self.zip = zip[:-4] | ||||
|                 self.zip = self.zip if self.zip else 'output' | ||||
|             else: | ||||
|                 self.zip = zip | ||||
|         self.jobs = collect_jobs(self.input_dir, self.output_dir) | ||||
|  | ||||
|     def workflow(self): | ||||
|         if len(self.jobs) == 0: | ||||
|         if not self.jobs: | ||||
|             return | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # Create output directories                      # | ||||
|         ' # setup output directory                         # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         create_output_directories_jobs = [] | ||||
|         for index, job in enumerate(self.jobs): | ||||
|             cmd = 'mkdir -p "%s"' % (job['output_dir']) | ||||
|             create_output_directories_jobs.append( | ||||
|                 self.addTask( | ||||
|                     command=cmd, | ||||
|                     label='create_output_directories_job_-_%i' % (index) | ||||
|                 ) | ||||
|             ) | ||||
|         setup_output_directory_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             cmd = 'mkdir' | ||||
|             cmd += ' -p' | ||||
|             cmd += ' "{}"'.format(job.output_dir) | ||||
|             lbl = 'setup_output_directory_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, label=lbl) | ||||
|             setup_output_directory_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # Natural language processing                    # | ||||
|         ' # nlp                                 # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         nlp_jobs = [] | ||||
|         nlp_job_n_cores = min( | ||||
|             self.n_cores, | ||||
|             max(1, int(self.n_cores / len(self.jobs))) | ||||
|         ) | ||||
|         for index, job in enumerate(self.jobs): | ||||
|             cmd = 'spacy_nlp -l "%s" "%s" "%s"' % ( | ||||
|                 self.lang, | ||||
|                 job['path'], | ||||
|                 os.path.join(job['output_dir'], job['name'] + '.vrt') | ||||
|             ) | ||||
|             nlp_jobs.append( | ||||
|                 self.addTask( | ||||
|                     command=cmd, | ||||
|                     dependencies='create_output_directories_job_-_%i' % (index), | ||||
|                     label='nlp_job_-_%i' % (index), | ||||
|                     nCores=nlp_job_n_cores | ||||
|                 ) | ||||
|             ) | ||||
|         nlp_tasks = [] | ||||
|         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa | ||||
|             cmd = 'spacy-nlp' | ||||
|             cmd += ' -i "{}"'.format(job.file) | ||||
|             cmd += ' -l "{}"'.format(self.lang) | ||||
|             cmd += ' -o "{}"'.format(output_file) | ||||
|             if self.check_encoding: | ||||
|                 cmd += ' --check-encoding' | ||||
|             deps = 'setup_output_directory_-_{}'.format(i) | ||||
|             lbl = 'nlp_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa | ||||
|             nlp_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # zip creation                                   # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         zip_creation_tasks = [] | ||||
|         if self.zip is not None: | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
|             cmd += ' -r' | ||||
|             cmd += ' "{}.zip" .'.format(self.zip) | ||||
|             cmd += ' -x "pyflow.data*"' | ||||
|             cmd += ' -i "*.vrt"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = nlp_tasks | ||||
|             lbl = 'zip_creation' | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) | ||||
|             zip_creation_tasks.append(task) | ||||
|  | ||||
|  | ||||
| def analyze_jobs(input_dir, output_dir): | ||||
| def collect_jobs(input_dir, output_dir): | ||||
|     jobs = [] | ||||
|  | ||||
|     for file in os.listdir(input_dir): | ||||
|         if os.path.isdir(os.path.join(input_dir, file)): | ||||
|             jobs += analyze_jobs( | ||||
|                 os.path.join(input_dir, file), | ||||
|                 os.path.join(output_dir, file), | ||||
|             ) | ||||
|         elif file.endswith('.txt'): | ||||
|             jobs.append( | ||||
|                 { | ||||
|                     'filename': file, | ||||
|                     'name': file.rsplit('.', 1)[0], | ||||
|                     'output_dir': os.path.join(output_dir, file), | ||||
|                     'path': os.path.join(input_dir, file) | ||||
|                 } | ||||
|             ) | ||||
|  | ||||
|             jobs += collect_jobs(os.path.join(input_dir, file), | ||||
|                                  os.path.join(output_dir, file)) | ||||
|         elif file.lower().endswith('.txt'): | ||||
|             jobs.append(NLPPipelineJob(os.path.join(input_dir, file), | ||||
|                                        os.path.join(output_dir, file))) | ||||
|     return jobs | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|     args = parse_arguments() | ||||
|  | ||||
|     wflow = NLPWorkflow(args) | ||||
|  | ||||
|     retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores) | ||||
|  | ||||
|     args = parse_args() | ||||
|     nlp_pipeline = NLPPipeline(args.input_directory, args.language, | ||||
|                                args.output_directory, args.check_encoding, | ||||
|                                args.n_cores, args.zip) | ||||
|     retval = nlp_pipeline.run( | ||||
|         dataDirRoot=(args.log_dir or args.output_directory), | ||||
|         nCores=args.n_cores | ||||
|     ) | ||||
|     sys.exit(retval) | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										119
									
								
								spacy-nlp
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										119
									
								
								spacy-nlp
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,119 @@ | ||||
| #!/usr/bin/env python3.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| from argparse import ArgumentParser | ||||
| from xml.sax.saxutils import escape | ||||
| import chardet | ||||
| import hashlib | ||||
| import os | ||||
| import spacy | ||||
| import textwrap | ||||
|  | ||||
|  | ||||
| SPACY_MODELS = {'da': 'da_core_news_md', | ||||
|                 'de': 'de_core_news_md', | ||||
|                 'el': 'el_core_news_md', | ||||
|                 'en': 'en_core_web_md', | ||||
|                 'es': 'es_core_news_md', | ||||
|                 'fr': 'fr_core_news_md', | ||||
|                 'it': 'it_core_news_md', | ||||
|                 'nl': 'nl_core_news_md', | ||||
|                 'pt': 'pt_core_news_md', | ||||
|                 'ru': 'ru_core_news_md', | ||||
|                 'zh': 'zh_core_web_md'} | ||||
|  | ||||
|  | ||||
| SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION') | ||||
| SPACY_VERSION = os.environ.get('SPACY_VERSION') | ||||
|  | ||||
| # Parse the given arguments | ||||
| parser = ArgumentParser(description=('Tag a text file with spaCy and save it ' | ||||
|                                      'as a verticalized text file.')) | ||||
| parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True) | ||||
| parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True) | ||||
| parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), required=True)  # noqa | ||||
| parser.add_argument('--check-encoding', action='store_true') | ||||
| args = parser.parse_args() | ||||
|  | ||||
|  | ||||
| # If requested: Check the encoding of the text contents from the input file | ||||
| # Else: Use utf-8 | ||||
| if args.check_encoding: | ||||
|     with open(args.input, "rb") as input_file: | ||||
|         bytes = input_file.read() | ||||
|         encoding = chardet.detect(bytes)['encoding'] | ||||
| else: | ||||
|     encoding = 'utf-8' | ||||
|  | ||||
|  | ||||
| # hashing in chunks to avoid full RAM with huge files. | ||||
| with open(args.input, 'rb') as input_file: | ||||
|     source_md5 = hashlib.md5() | ||||
|     for chunk in iter(lambda: input_file.read(128 * source_md5.block_size), b''): | ||||
|         source_md5.update(chunk) | ||||
|     source_md5 = source_md5.hexdigest() | ||||
|  | ||||
| # Load the text contents from the input file | ||||
| with open(args.input, encoding=encoding) as input_file: | ||||
|     text = input_file.read() | ||||
|     # spaCy's NLP is limited to strings with a maximum of 1 million | ||||
|     # characters at once. So we split the text into suitable chunks. | ||||
|     text_chunks = textwrap.wrap(text, 1000000, break_long_words=False) | ||||
|     # the text variable potentially occupies a lot of system memory and is no | ||||
|     # longer needed... | ||||
|     del text | ||||
|  | ||||
|  | ||||
| # Setup the spaCy toolkit by loading the chosen language model | ||||
| model = SPACY_MODELS[args.language] | ||||
| nlp = spacy.load(model) | ||||
|  | ||||
|  | ||||
| # Create the output file in verticalized text format | ||||
| # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html | ||||
| output_file_original_filename = args.output | ||||
| output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt') | ||||
| common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n' | ||||
|               + '<corpus>\n' | ||||
|               + '<text>\n' | ||||
|               + '<nlp name="spaCy:{}"\n'.format(SPACY_VERSION) | ||||
|               + '     model="{}:{}"\n'.format(model, SPACY_MODELS_VERSION) | ||||
|               + '     source-md5="{}" />\n'.format(source_md5)) | ||||
|  | ||||
| with open(output_file_original_filename, 'w+') as output_file_original, \ | ||||
|      open(output_file_stand_off_filename, 'w+') as output_file_stand_off: | ||||
|  | ||||
|     output_file_original.write(common_xml) | ||||
|     output_file_stand_off.write(common_xml) | ||||
|     text_offset = 0 | ||||
|     for text_chunk in text_chunks: | ||||
|         doc = nlp(text_chunk) | ||||
|         for sent in doc.sents: | ||||
|             output_file_original.write('<s>\n') | ||||
|             output_file_stand_off.write('<s>\n') | ||||
|             space_flag = False | ||||
|             # Skip whitespace tokens | ||||
|             sent_no_space = [token for token in sent | ||||
|                              if not token.text.isspace()] | ||||
|             # No space variant for cwb original .vrt file input. | ||||
|             for token in sent_no_space: | ||||
|                 output_file_original.write('{}'.format(escape(token.text)) | ||||
|                                            + '\t{}'.format(escape(token.lemma_)) | ||||
|                                            + '\t{}'.format(token.pos_) | ||||
|                                            + '\t{}'.format(token.tag_) | ||||
|                                            + '\t{}\n'.format(token.ent_type_ or 'NULL')) | ||||
|             # Stand off variant with spaces. | ||||
|             for token in sent: | ||||
|                 token_start = token.idx + text_offset | ||||
|                 token_end = token.idx + len(token.text) + text_offset | ||||
|                 output_file_stand_off.write('{}:{}'.format(token_start, | ||||
|                                                            token_end) | ||||
|                                             + '\t{}'.format(escape(token.lemma_)) | ||||
|                                             + '\t{}'.format(token.pos_) | ||||
|                                             + '\t{}'.format(token.tag_) | ||||
|                                             + '\t{}\n'.format(token.ent_type_ or 'NULL')) | ||||
|             output_file_original.write('</s>\n') | ||||
|             output_file_stand_off.write('</s>\n') | ||||
|         text_offset = token_end + 1 | ||||
|     output_file_original.write('</text>\n</corpus>') | ||||
|     output_file_stand_off.write('</text>\n</corpus>') | ||||
							
								
								
									
										71
									
								
								spacy_nlp
									
									
									
									
									
								
							
							
						
						
									
										71
									
								
								spacy_nlp
									
									
									
									
									
								
							| @@ -1,71 +0,0 @@ | ||||
| #!/usr/bin/env python3.5 | ||||
| # coding=utf-8 | ||||
|  | ||||
| import argparse | ||||
| import os | ||||
| import spacy | ||||
| import textwrap | ||||
|  | ||||
| parser = argparse.ArgumentParser( | ||||
|     description='Tag a text file with spaCy and save it as a verticalized text file.' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     'i', | ||||
|     metavar='txt-sourcefile', | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-l', | ||||
|     choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'], | ||||
|     dest='lang', | ||||
|     required=True | ||||
| ) | ||||
| parser.add_argument( | ||||
|     'o', | ||||
|     metavar='vrt-destfile', | ||||
| ) | ||||
| args = parser.parse_args() | ||||
|  | ||||
| SPACY_MODELS = { | ||||
|     'de': 'de_core_news_sm', 'el': 'el_core_news_sm', 'en': 'en_core_web_sm', | ||||
|     'es': 'es_core_news_sm', 'fr': 'fr_core_news_sm', 'it': 'it_core_news_sm', | ||||
|     'nl': 'nl_core_news_sm', 'pt': 'pt_core_news_sm' | ||||
| } | ||||
|  | ||||
| # Set the language model for spacy | ||||
| nlp = spacy.load(SPACY_MODELS[args.lang]) | ||||
|  | ||||
| # Read text from the input file and if neccessary split it into parts with a | ||||
| # length of less than 1 million characters. | ||||
| with open(args.i) as input_file: | ||||
|     text = input_file.read() | ||||
|     texts = textwrap.wrap(text, 1000000, break_long_words=False) | ||||
|     text = None | ||||
|  | ||||
| # Create and open the output file | ||||
| output_file = open(args.o, 'w+') | ||||
|  | ||||
| output_file.write( | ||||
|     '<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % ( | ||||
|         os.path.basename(args.i).rsplit(".", 1)[0] | ||||
|     ) | ||||
| ) | ||||
| for text in texts: | ||||
|     # Run spacy nlp over the text (partial string if above 1 million chars) | ||||
|     doc = nlp(text) | ||||
|     for sent in doc.sents: | ||||
|         output_file.write('<s>\n') | ||||
|         for token in sent: | ||||
|             # Skip whitespace tokens like "\n" or "\t" | ||||
|             if token.text.isspace(): | ||||
|                 continue | ||||
|             # Write all information in .vrt style to the output file | ||||
|             # text, lemma, simple_pos, pos, ner | ||||
|             output_file.write( | ||||
|                 token.text + '\t' + token.lemma_ + '\t' | ||||
|                 + token.pos_ + '\t' + token.tag_ + '\t' | ||||
|                 + (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n' | ||||
|             ) | ||||
|         output_file.write('</s>\n') | ||||
| output_file.write('</text>\n</corpus>') | ||||
|  | ||||
| output_file.close() | ||||
							
								
								
									
										48
									
								
								wrapper/nlp
									
									
									
									
									
								
							
							
						
						
									
										48
									
								
								wrapper/nlp
									
									
									
									
									
								
							| @@ -1,39 +1,33 @@ | ||||
| #!/usr/bin/env python3 | ||||
| # coding=utf-8 | ||||
| # Wrapper that runs the nlp container image through "docker run", bind-mounting | ||||
| # the given host input/output directories and forwarding all other arguments. | ||||
| # NOTE(review): this listing appears to interleave the removed and the added | ||||
| # lines of one diff hunk (header "-1,39 +1,33") without +/- markers, so it is | ||||
| # not runnable as shown; the old/new attributions below are inferred - confirm | ||||
| # against the repository. | ||||
|  | ||||
| import argparse | ||||
| from argparse import ArgumentParser | ||||
| import os | ||||
| import subprocess | ||||
|  | ||||
| # Image reference, in-container mount points and the invoking user's uid/gid | ||||
| # (lowercase spelling; presumably the removed side of the hunk) | ||||
| container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest' | ||||
| container_input_dir = '/input' | ||||
| container_output_dir = '/output' | ||||
| uid = str(os.getuid()) | ||||
| gid = str(os.getgid()) | ||||
| # Same settings as UPPER_CASE constants, with the image tag set to 1.0.0 | ||||
| # (presumably the added side of the hunk) | ||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0' | ||||
| CONTAINER_INPUT_DIR = '/input' | ||||
| CONTAINER_OUTPUT_DIR = '/output' | ||||
| UID = str(os.getuid()) | ||||
| GID = str(os.getgid()) | ||||
|  | ||||
| # Only -i/-o are parsed here; with add_help=False and parse_known_args() every | ||||
| # other argument is left untouched in remaining_args. | ||||
| parser = argparse.ArgumentParser(add_help=False) | ||||
| parser.add_argument( | ||||
|     '-i', | ||||
|     dest='input_dir', | ||||
|     required=False | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-o', | ||||
|     dest='output_dir', | ||||
|     required=False | ||||
| ) | ||||
| parser = ArgumentParser(add_help=False) | ||||
| parser.add_argument('-i', '--input-directory') | ||||
| parser.add_argument('-o', '--output-directory') | ||||
| args, remaining_args = parser.parse_known_args() | ||||
|  | ||||
| # Build the docker command: remove the container on exit, run as the caller's | ||||
| # uid:gid, bind-mount the absolute host directories and hand the container | ||||
| # side paths to the image via -i/-o. | ||||
| # This variant appends -i/-o behind the remaining arguments. | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid] | ||||
| if args.input_dir is not None: | ||||
|     host_input_dir = os.path.abspath(args.input_dir) | ||||
|     cmd += ['-v', host_input_dir + ':' + container_input_dir] | ||||
|     remaining_args += ['-i', container_input_dir] | ||||
| if args.output_dir is not None: | ||||
|     host_output_dir = os.path.abspath(args.output_dir) | ||||
|     cmd += ['-v', host_output_dir + ':' + container_output_dir] | ||||
|     remaining_args += ['-o', container_output_dir] | ||||
| cmd.append(container_image) | ||||
| # This variant inserts at index 0 instead, so the forwarded arguments start | ||||
| # with "-i <dir> -o <dir>" followed by the remaining arguments. | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] | ||||
| if args.output_directory is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory), | ||||
|                                  CONTAINER_OUTPUT_DIR)] | ||||
|     remaining_args.insert(0, CONTAINER_OUTPUT_DIR) | ||||
|     remaining_args.insert(0, '-o') | ||||
| if args.input_directory is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory), | ||||
|                                  CONTAINER_INPUT_DIR)] | ||||
|     remaining_args.insert(0, CONTAINER_INPUT_DIR) | ||||
|     remaining_args.insert(0, '-i') | ||||
| cmd.append(CONTAINER_IMAGE) | ||||
| cmd += remaining_args | ||||
|  | ||||
| # Run the container; the result of subprocess.run() is not inspected here. | ||||
| subprocess.run(cmd) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user