Mark required arguments in scripts as required

Bump spaCy version, bugfixes, codestyle
optimizations
2025-07-09 15:23:18 +00:00 · 2022-02-03 10:40:25 +01:00 · 2022-01-27 16:50:22 +01:00 · 2021-08-11 16:47:29 +02:00 · 2021-08-10 14:43:55 +02:00 · 2021-08-06 16:50:22 +02:00
11 changed files with 1005 additions and 329 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -1,8 +1,5 @@
 image: docker:19.03.13
 variables:
  DOCKER_TLS_CERTDIR: "/certs"
 services:
  - docker:19.03.13-dind
@ -10,6 +7,10 @@ stages:
  - build
  - push
 variables:
  DOCKER_TLS_CERTDIR: "/certs"
  INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
 .reg_setup:
  before_script:
    - apk add --no-cache curl
@ -28,8 +29,6 @@ build_image:
  stage: build
  tags:
    - docker
  variables:
    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
 push_master:
  extends:
@ -47,7 +46,6 @@ push_master:
    - docker
  variables:
    IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
 push_other:
  extends:
@ -68,4 +66,3 @@ push_other:
    - docker
  variables:
    IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
--- a/54
+++ b/54
@ -7,28 +7,36 @@ LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <por
 ENV LANG=C.UTF-8
-RUN apt-get update
+RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      procps \
      python3.7 \
      python3-pip \
      wget \
 && python3 -m pip install \
      chardet \
      setuptools \
      wheel
-
+# Install the NLP pipeline and its dependencies #
 # Install pipeline dependencies #
 ## Install pyFlow ##
-ENV PYFLOW_RELEASE=1.1.20
+ENV PYFLOW_VERSION=1.1.20
-ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
+RUN wget --no-check-certificate --quiet \
-RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
+      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
- && cd "pyflow-${PYFLOW_RELEASE}" \
+ && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && cd "pyflow-${PYFLOW_VERSION}" \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
 && python2.7 setup.py build install \
- && cd .. \
+ && cd - > /dev/null \
- && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
+ && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
 ## Install spaCy ##
-ENV SPACY_VERSION=3.0.3
+ENV SPACY_VERSION=3.2.1
 RUN apt-get install --no-install-recommends --yes \
      python3.7 \
      python3-pip \
      zip \
 && pip3 install \
      chardet \
      setuptools \
@ -36,22 +44,20 @@ RUN apt-get install --no-install-recommends --yes \
 && pip3 install --upgrade pip \
 && pip3 install "spacy==${SPACY_VERSION}"
-ENV SPACY_MODELS_VERSION=3.0.0
+
-RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,pl_core_news_md,zh_core_web_md"
- && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+ENV SPACY_MODELS_VERSION=3.2.0
- && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done
- && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \
+
- && python3 -m spacy download "es_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+
- && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+COPY packages .
- && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+RUN cd stand-off-data-py \
- && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+ && python3 -m pip install . \
- && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \
+ && cd -
 && python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \
 && python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct
 ## Install Pipeline ##
-COPY nlp spacy-nlp /usr/local/bin/
+COPY nlp spacy-nlp vrt-creator /usr/local/bin/
 RUN rm -r /var/lib/apt/lists/*
--- a/21
+++ b/21
@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/README.md
+++ b/README.md
@ -1,88 +1,41 @@
 # NLP - Natural Language Processing
-This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
+This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service, but you can also use it standalone; a convenient wrapper script is provided for that purpose. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
 ## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+
 - Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
  - Software from Debian Buster's free repositories
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- spaCy (3.0.3): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
+- spaCy (3.2.1): https://github.com/explosion/spaCy/releases/tag/v3.2.1
- spaCy medium sized models (3.0.0):
+- spaCy medium sized models (3.2.0):
-  - https://github.com/explosion/spacy-models/releases/tag/da_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.2.0
  - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/pt_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.0.0
  - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0
-## Use this image
+## Installation
-1. Create input and output directories for the pipeline.
+1. Install Docker and Python 3.
-``` bash
+2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git`
-mkdir -p /<my_data_location>/input /<my_data_location>/output
+3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0 nlp`
-```
+4. Add the wrapper script (`wrapper/nlp` relative to this README file) to your `${PATH}`.
 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,output}`.
 2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
-3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
+## Use the Pipeline
-```
+
-# Option one: Use the wrapper script
+1. Place your plain text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
-## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executeable and add it to your ${PATH}
+2. Clear your `/<my_data_location>/output` directory.
 3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details.
 ```bash
 cd /<my_data_location>
-nlp -i input -l <language_code> -o output <optional_pipeline_arguments>
+nlp \
-
+  --input-dir input \
-# Option two: Classic Docker style
+  --output-dir output \
-docker run \
+  -m <model_code> <optional_pipeline_arguments>
    --rm \
    -it \
    -u $(id -u $USER):$(id -g $USER) \
    -v /<my_data_location>/input:/input \
    -v /<my_data_location>/output:/output \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
        -i /input \
        -l <language_code>
        -o /output \
        <optional_pipeline_arguments>
 ```
 4. Check your results in the `/<my_data_location>/output` directory.
 ```
 ### Pipeline arguments
 `--check-encoding`
 * If set, the pipeline tries to automatically determine the right encoding for
 your texts. Only use it if you are not sure that your input is provided in UTF-8.
 * default = False
 * required = False
 `-l languagecode`
 * Tells spaCy which language will be used.
 * options = da (Danish), de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese), ru (Russian), zh (Chinese)
 * required = True
 `--nCores corenumber`
 * Sets the number of CPU cores being used during the NLP process.
 * default = min(4, multiprocessing.cpu_count())
 * required = False
 ``` bash
 # Example with all arguments used
 docker run \
    --rm \
    -it \
    -u $(id -u $USER):$(id -g $USER) \
    -v "$HOME"/ocr/input:/input \
    -v "$HOME"/ocr/output:/output \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
        -i /input \
        -l en \
        -o /output \
        --check-encoding \
        --nCores 8 \
 ```
--- a/351
+++ b/351
@ -1,169 +1,286 @@
 #!/usr/bin/env python2.7
 # coding=utf-8
-"""A NLP pipeline for text file processing."""
+''' A NLP pipeline for text file processing. '''
-
+__version__ = '0.1.0'
 __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
             'Stephan Porada <porada@posteo.de>'
 __version__ = '1.0.0'
 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
-import multiprocessing
+import json
 import os
 import sys
-SPACY_MODELS = {'da': 'da_core_news_md',
+SPACY_MODELS = {
-                'de': 'de_core_news_md',
+    'de': 'de_core_news_md',
-                'el': 'el_core_news_md',
+    'en': 'en_core_web_md',
-                'en': 'en_core_web_md',
+    'it': 'it_core_news_md',
-                'es': 'es_core_news_md',
+    'nl': 'nl_core_news_md',
-                'fr': 'fr_core_news_md',
+    'pl': 'pl_core_news_md',
-                'it': 'it_core_news_md',
+    'zh': 'zh_core_web_md'
-                'nl': 'nl_core_news_md',
+}
                'pt': 'pt_core_news_md',
                'ru': 'ru_core_news_md',
                'zh': 'zh_core_web_md'}
-def parse_args():
+class PipelineJob:
-    parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.')
+    '''
-    parser.add_argument('-i', '--input-directory',
+    NLP pipeline job class.
                        help='Input directory (only txt files get processed)',
                        required=True)
    parser.add_argument('-o', '--output-directory',
                        help='Output directory',
                        required=True)
    parser.add_argument('-l', '--language',
                        choices=SPACY_MODELS.keys(),
                        required=True)
    parser.add_argument('--check-encoding', action='store_true')
    parser.add_argument('--log-dir')
    parser.add_argument('--n-cores',
                        default=min(4, multiprocessing.cpu_count()),
                        help='total number of cores available', type=int)
    parser.add_argument('--zip', help='Zips everything into one archive.')
    return parser.parse_args()
-
+    Each plain text input file of the pipeline is represented as an NLP
-class NLPPipelineJob:
+    pipeline job, which holds all necessary information for the pipeline to
-    """An NLP pipeline job class
+    process it.
    Each input file of the pipeline is represented as an NLP pipeline job,
    which holds all necessary information for the pipeline to process it.
    Arguments:
    file -- Path to the file
-    output_dir -- Path to a directory, where job results a stored
+    output_dir -- Path to a directory, where job results are stored
-    intermediate_dir -- Path to a directory, where intermediate files are
+    '''
                        stored.
    """
    def __init__(self, file, output_dir):
        self.file = file
-        self.name = os.path.basename(file).rsplit('.', 1)[0]
+        self.name = os.path.basename(file)[:-4]
        self.output_dir = output_dir
-class NLPPipeline(WorkflowRunner):
+class NLPWorkflow(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, check_encoding, n_cores, zip):
+    def __init__(self, job, model, check_encoding=False, id_prefix=''):
        self.job = job
        self.model = model
        self.check_encoding = check_encoding
        self.id_prefix = id_prefix
    def workflow(self):
        '''
        Register the single 'spacy' task of this workflow.

        The task runs the spacy-nlp command on the job's input file and
        writes "<job name>.json" into the job's output directory.
        '''
        n_cores = 1
        # Never request more than 1024 MB for the task
        mem_mb = min(1024, self.getMemMb())
        json_file = os.path.join(
            self.job.output_dir,
            '{}.json'.format(self.job.name)
        )
        # Collect the command line piece by piece and join it with spaces
        cmd_parts = [
            'spacy-nlp',
            '--input-file "{}"'.format(self.job.file),
            '--output-file "{}"'.format(json_file),
            '-m "{}"'.format(self.model)
        ]
        if self.check_encoding:
            cmd_parts.append('--check-encoding')
        cmd_parts.append('--id-prefix "{}"'.format(self.id_prefix))
        self.addTask(
            'spacy',
            command=' '.join(cmd_parts),
            memMb=mem_mb,
            nCores=n_cores
        )
 class CreateVrtWorkflow(WorkflowRunner):
    '''
    Workflow that runs the vrt-creator command for one pipeline job.

    Arguments:
    job -- Job object providing the attributes file, name and output_dir
    '''
    def __init__(self, job):
        self.job = job
    def workflow(self):
        '''
        Register the single 'vrt_creator' task of this workflow.

        The task reads "<job name>.json" from the job's output directory
        together with the job's text file and writes "<job name>.vrt" next
        to the JSON file.
        '''
        n_cores = 1
        # Never request more than 256 MB for the task
        mem_mb = min(256, self.getMemMb())
        stand_off_data_file = os.path.join(
            self.job.output_dir,
            '{}.json'.format(self.job.name)
        )
        vrt_file = os.path.join(
            self.job.output_dir,
            '{}.vrt'.format(self.job.name)
        )
        # Collect the command line piece by piece and join it with spaces
        cmd = ' '.join([
            'vrt-creator',
            '--stand-off-data-file "{}"'.format(stand_off_data_file),
            '--text-file "{}"'.format(self.job.file),
            '--output-file "{}"'.format(vrt_file)
        ])
        self.addTask(
            'vrt_creator',
            command=cmd,
            memMb=mem_mb,
            nCores=n_cores
        )
 class MainWorkflow(WorkflowRunner):
    def __init__(
        self,
        input_dir,
        model,
        output_dir,
        check_encoding=False,
        id_prefix=''
    ):
        self.input_dir = input_dir
-        self.lang = lang
+        self.model = model
        self.output_dir = output_dir
        self.check_encoding = check_encoding
-        self.n_cores = n_cores
+        self.id_prefix = id_prefix
-        self.output_dir = output_dir
+        self.jobs = []
-        if zip is None:
+
-            self.zip = zip
+    def collect_jobs(self):
-        else:
+        self.jobs = []
-            if zip.lower().endswith('.zip'):
+        for file in os.listdir(self.input_dir):
-                # Remove .zip file extension if provided
+            if os.path.isdir(os.path.join(self.input_dir, file)):
-                self.zip = zip[:-4]
+                continue
-                self.zip = self.zip if self.zip else 'output'
+            if not file.lower().endswith('.txt'):
-            else:
+                continue
-                self.zip = zip
+            job = PipelineJob(
-        self.jobs = collect_jobs(self.input_dir, self.output_dir)
+                os.path.join(self.input_dir, file),
                os.path.join(self.output_dir, file)
            )
            self.jobs.append(job)
    def workflow(self):
        if not self.jobs:
            return
-        '''
+        # Create output and temporary directories
-        ' ##################################################
+        for job in self.jobs:
-        ' # setup output directory                         #
+            os.mkdir(job.output_dir)
        ' ##################################################
        '''
        setup_output_directory_tasks = []
        for i, job in enumerate(self.jobs):
            cmd = 'mkdir'
            cmd += ' -p'
            cmd += ' "{}"'.format(job.output_dir)
            lbl = 'setup_output_directory_-_{}'.format(i)
            task = self.addTask(command=cmd, label=lbl)
            setup_output_directory_tasks.append(task)
        '''
        ' ##################################################
-        ' # nlp                                 #
+        ' # nlp                                            #
        ' ##################################################
        '''
        nlp_tasks = []
        n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
        for i, job in enumerate(self.jobs):
-            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
+            task = self.addWorkflowTask(
-            cmd = 'spacy-nlp'
+                'nlp_-_{}'.format(i),
-            cmd += ' -i "{}"'.format(job.file)
+                NLPWorkflow(
-            cmd += ' -l "{}"'.format(self.lang)
+                    job,
-            cmd += ' -o "{}"'.format(output_file)
+                    self.model,
-            if self.check_encoding:
+                    check_encoding=self.check_encoding,
-                cmd += ' --check-encoding'
+                    id_prefix=self.id_prefix
-            deps = 'setup_output_directory_-_{}'.format(i)
+                )
-            lbl = 'nlp_-_{}'.format(i)
+            )
            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
            nlp_tasks.append(task)
        '''
        ' ##################################################
-        ' # zip creation                                   #
+        ' # create vrt                                     #
        ' ##################################################
        '''
-        zip_creation_tasks = []
+        create_vrt_tasks = []
-        if self.zip is not None:
+        for i, job in enumerate(self.jobs):
-            cmd = 'cd "{}"'.format(self.output_dir)
+            task = self.addWorkflowTask(
-            cmd += ' && '
+                'create_vrt_-_{}'.format(i),
-            cmd += 'zip'
+                CreateVrtWorkflow(job),
-            cmd += ' -r'
+                dependencies='nlp_-_{}'.format(i)
-            cmd += ' "{}.zip" .'.format(self.zip)
+            )
-            cmd += ' -x "pyflow.data*"'
+            create_vrt_tasks.append(task)
-            cmd += ' -i "*.vrt"'
+
-            cmd += ' && '
+        self.waitForTasks()
-            cmd += 'cd -'
+        outputs = []
-            deps = nlp_tasks
+        for job in self.jobs:
-            lbl = 'zip_creation'
+            # Track output files
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            relative_output_dir = os.path.relpath(
-            zip_creation_tasks.append(task)
+                job.output_dir,
                start=self.output_dir
            )
            outputs.append(
                {
                    'description': 'JSON stand off data',
                    'file': os.path.join(
                        relative_output_dir,
                        '{}.json'.format(job.name)
                    ),
                    'mimetype': 'application/json'
                }
            )
            outputs.append(
                {
                    'description': 'CWB vrt file',
                    'file': os.path.join(
                        relative_output_dir,
                        '{}.vrt'.format(job.name)
                    ),
                    'mimetype': 'application/vrt+xml'
                }
            )
        with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
            json.dump(outputs, f, indent=4)
-def collect_jobs(input_dir, output_dir):
+def parse_args():
-    jobs = []
+    parser = ArgumentParser(
-    for file in os.listdir(input_dir):
+        description='NLP pipeline for plain text file processing'
-        if os.path.isdir(os.path.join(input_dir, file)):
+    )
-            jobs += collect_jobs(os.path.join(input_dir, file),
+    parser.add_argument(
-                                 os.path.join(output_dir, file))
+        '-i', '--input-dir',
-        elif file.lower().endswith('.txt'):
+        help='Input directory',
-            jobs.append(NLPPipelineJob(os.path.join(input_dir, file),
+        required=True
-                                       os.path.join(output_dir, file)))
+    )
-    return jobs
+    parser.add_argument(
        '-o', '--output-dir',
        help='Output directory',
        required=True
    )
    parser.add_argument(
        '-m', '--model',
        choices=SPACY_MODELS.keys(),
        help='The model to be used',
        required=True
    )
    parser.add_argument(
        '--check-encoding',
        action='store_true',
        help='Check encoding of the input file, UTF-8 is used instead'
    )
    parser.add_argument(
        '--id-prefix',
        default='',
        help='A prefix for all the ids within the stand off annotations'
    )
    parser.add_argument(
        '--log-dir',
        help='Logging directory (Default: --output-dir)'
    )
    parser.add_argument(
        '--mem-mb',
        help='Amount of system memory to be used '
             '(Default: min(--n-cores * 1024, available system memory))',
        type=int
    )
    parser.add_argument(
        '--n-cores',
        default=1,
        help='Number of CPU threads to be used',
        type=int
    )
    parser.add_argument(
        '-v', '--version',
        action='version',
        help='Returns the current version of the NLP pipeline',
        version='%(prog)s {}'.format(__version__)
    )
    args = parser.parse_args()
    # Set some tricky default values and check for insufficient input
    if args.log_dir is None:
        args.log_dir = args.output_dir
    if args.n_cores < 1:
        raise Exception('--n-cores must be greater or equal 1')
    if args.mem_mb is None:
        max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
        args.mem_mb = min(args.n_cores * 1024, max_mem_mb)
    if args.mem_mb < 1024:
        raise Exception('--mem-mb must be greater or equal 1024')
    return args
 def main():
    args = parse_args()
-    nlp_pipeline = NLPPipeline(args.input_directory, args.language,
+    main_workflow = MainWorkflow(
-                               args.output_directory, args.check_encoding,
+        args.input_dir,
-                               args.n_cores, args.zip)
+        args.model,
-    retval = nlp_pipeline.run(
+        args.output_dir,
-        dataDirRoot=(args.log_dir or args.output_directory),
+        check_encoding=args.check_encoding,
        id_prefix=args.id_prefix
    )
    main_workflow.collect_jobs()
    retval = main_workflow.run(
        dataDirRoot=args.log_dir,
        memMb=args.mem_mb,
        nCores=args.n_cores
    )
    sys.exit(retval)
--- a/packages/stand-off-data-py/setup.py
+++ b/packages/stand-off-data-py/setup.py
@ -0,0 +1,14 @@
 import setuptools
 # Package metadata for the single-module stand off data library.
 setuptools.setup(
    name='Stand off data',
    author='Patrick Jentsch',
    author_email='p.jentsch@uni-bielefeld.de',
    description='A python library to handle stand off data.',
    # The distribution consists of exactly one module: stand_off_data.py
    py_modules=['stand_off_data'],
    classifiers=[
        'Programming Language :: Python :: 3',
        'Operating System :: OS Independent',
    ],
    # stand_off_data.py uses f-strings, which require Python 3.6 or newer
    python_requires='>=3.6'
 )
--- a/packages/stand-off-data-py/stand_off_data.py
+++ b/packages/stand-off-data-py/stand_off_data.py
@ -0,0 +1,282 @@
 from xml.sax.saxutils import escape
 class StandOffData:
    '''
    Stand off annotation data: meta information, tag definitions and the
    tag annotations that reference those definitions by id.

    Arguments:
    attrs -- Optional dict, the keys 'meta', 'tags' and 'annotations' are
             read from it (each of them may be missing)
    '''
    def __init__(self, attrs=None):
        # Use a fresh dict per call instead of a shared mutable default
        if attrs is None:
            attrs = {}
        self.meta = attrs.get('meta', {})
        self.lookup = {}
        # Has to exist before add_tag_definition() is called, its error
        # message serializes the whole object with to_dict()
        self.annotations = []
        for x in attrs.get('tags', []):
            self.add_tag_definition(x)
        self.annotations = [
            TagAnnotation(x, self.lookup)
            for x in attrs.get('annotations', [])
        ]
    def add_tag_definition(self, attrs):
        '''
        Create a TagDefinition from attrs and register it by its id.

        Raises an Exception if the id is already registered.
        '''
        tag_definition = TagDefinition(attrs)
        if tag_definition.id in self.lookup:
            raise Exception(f'Tag id already in use: {self.to_dict()}')
        self.lookup[tag_definition.id] = tag_definition
    def to_dict(self):
        ''' Return meta, tag definitions and annotations as a plain dict. '''
        return {
            'meta': self.meta,
            'lookup': {k: v.to_dict() for k, v in self.lookup.items()},
            'annotations': [x.to_dict() for x in self.annotations]
        }
    def to_vrt(self, text):
        '''
        Render the annotations of text in CWB's verticalized text format.

        Annotations named 'token' are written as one tab separated line each
        (word, pos, lemma, simple_pos), every other annotation is written as
        an opening and a closing tag around the tokens it covers. Boundaries
        of such annotations that fall inside a token are moved outwards to
        the token boundaries (the annotation objects are modified in place).

        Arguments:
        text -- The text the annotation offsets refer to

        Raises an Exception if two tokens overlap.
        '''
        # Divide annotations into CWB's verticalized text format (.vrt) logic
        p_attrs = []    # positional attributes
        s_attrs = []    # structural attributes
        for annotation in self.annotations:
            if annotation.name == 'token':
                p_attrs.append(annotation)
            else:
                s_attrs.append(annotation)
        # Sort annotations, necessary for the next checks
        p_attrs.sort()
        s_attrs.sort()
        # Check for p_attr<->p_attr overlap
        for i, p_attr in enumerate(p_attrs[:-1]):
            next_p_attr = p_attrs[i + 1]
            starts_within = (
                next_p_attr.start <= p_attr.start < next_p_attr.end
            )
            ends_within = next_p_attr.start < p_attr.end <= next_p_attr.end
            # A token that completely encloses its successor neither starts
            # nor ends within it, so this case needs its own check
            encloses = (
                p_attr.start < next_p_attr.start
                and p_attr.end > next_p_attr.end
            )
            if starts_within or ends_within or encloses:
                raise Exception(
                    'Positional attribute overlaps another: '
                    f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}'
                )
        # Check for s_attr<->p_attr overlap
        p_attr_count = len(p_attrs)
        # Index of the first p_attr that can still touch the current s_attr.
        # s_attrs are sorted by start, so a p_attr that ends before the
        # current s_attr starts also ends before every following s_attr.
        first_candidate = 0
        for s_attr in s_attrs:
            while (first_candidate < p_attr_count
                   and p_attrs[first_candidate].end <= s_attr.start):
                first_candidate += 1
            for p_attr_index in range(first_candidate, p_attr_count):
                p_attr = p_attrs[p_attr_index]
                # p_attr lies completely before s_attr, look at the next one
                if p_attr.end <= s_attr.start:
                    continue
                # p_attr lies completely after s_attr, no further checking
                # needed (because p_attrs are sorted)
                if p_attr.start >= s_attr.end:
                    break
                # Check if s_attr starts within p_attr
                if p_attr.start < s_attr.start < p_attr.end:
                    # Change s_attr start to p_attr's start
                    s_attr.start = p_attr.start
                # Check if s_attr ends within p_attr
                if p_attr.start < s_attr.end < p_attr.end:
                    # Change s_attr end to p_attr's end
                    s_attr.end = p_attr.end
        # Map text positions to the annotations starting/ending there
        p_attr_buffer = {}
        for i, p_attr in enumerate(p_attrs):
            p_attr_buffer[p_attr.start] = i
        s_attr_start_buffer = {}
        s_attr_end_buffer = {}
        for i, s_attr in enumerate(s_attrs):
            s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
            # Prepend, so that s_attrs opened last are closed first
            s_attr_end_buffer.setdefault(s_attr.end, []).insert(0, i)
        # Collect the output pieces and join them once at the end
        vrt_parts = ['<text>\n']
        current_position = 0
        text_len = len(text)
        # Walk through the text, position text_len is included so that
        # s_attrs ending at the very end of the text get closed
        while current_position <= text_len:
            # s_attr endings
            if current_position in s_attr_end_buffer:
                s_attr_indexes = s_attr_end_buffer.pop(current_position)
                for s_attr_index in s_attr_indexes:
                    s_attr = s_attrs[s_attr_index]
                    vrt_parts.append(f'</{escape(s_attr.name)}>\n')
            # s_attrs starts
            if current_position in s_attr_start_buffer:
                s_attr_indexes = s_attr_start_buffer.pop(current_position)
                for s_attr_index in s_attr_indexes:
                    s_attr = s_attrs[s_attr_index]
                    opening_tag = f'<{escape(s_attr.name)}'
                    for prop in s_attr.properties:
                        opening_tag += f' {escape(prop.name)}="{escape(str(prop.value))}"'  # noqa
                    opening_tag += '>\n'
                    vrt_parts.append(opening_tag)
            # p_attrs
            if current_position not in p_attr_buffer:
                current_position += 1
                continue
            p_attr_index = p_attr_buffer.pop(current_position)
            p_attr = p_attrs[p_attr_index]
            # Whitespace only tokens are not written to the vrt output
            if text[p_attr.start:p_attr.end].isspace():
                current_position = p_attr.end
                continue
            _p_attr = {
                'lemma': 'None',
                'pos': 'None',
                'simple_pos': 'None',
                'word': 'None'
            }
            for prop in p_attr.properties:
                if prop.name not in _p_attr:
                    continue
                _p_attr[prop.name] = escape(str(prop.value))
            # The word always comes from the text, never from a property
            _p_attr['word'] = escape(text[p_attr.start:p_attr.end])
            vrt_parts.append(
                '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**_p_attr)
            )
            # Continue right after the token
            current_position = p_attr.end
        vrt_parts.append('</text>\n')
        return ''.join(vrt_parts)
 class TagAnnotation:
    '''
    A single annotation of a text span with a tag.

    Arguments:
    attrs -- Dict with the keys 'tag_id', 'start' and 'end' and optionally
             'properties'
    lookup -- Dict mapping tag ids to their tag definitions

    Raises an Exception if the tag id is not part of lookup or if end is
    less than start.

    Annotations are ordered by start. If two annotations start at the same
    position, non token annotations come before token annotations and
    otherwise the annotation with the greater end comes first.
    '''
    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.tag_id = attrs['tag_id']
        self.start = attrs['start']
        self.end = attrs['end']
        # Placeholder, so that to_dict() can be used in the error message of
        # the unknown tag check
        self.properties = []
        # The tag has to be known before its definition is dereferenced,
        # otherwise a bare KeyError is raised instead of this message
        if self.tag_id not in self.lookup:
            raise Exception(f'Unknown tag: {self.to_dict()}')
        self.properties = [
            PropertyAnnotation(x, self.lookup[self.tag_id].properties)
            for x in attrs.get('properties', [])
        ]
        if self.end < self.start:
            raise Exception(f'Annotation end less then start: {self.to_dict()}')  # noqa
        # property_ids = [x.property_id for x in self.properties]
        # for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items():  # noqa
        #     if required_property_id not in property_ids:
        #         raise Exception(
        #             f'Missing required property: {required_property.to_dict()}'
        #         )
    @property
    def name(self):
        ''' Name of the tag definition this annotation refers to. '''
        return self.lookup[self.tag_id].name
    def to_dict(self):
        ''' Return tag id, offsets and properties as a plain dict. '''
        return {
            'tag_id': self.tag_id,
            'start': self.start,
            'end': self.end,
            'properties': [x.to_dict() for x in self.properties]
        }
    def __lt__(self, other):
        if self.start == other.start:
            # Same start: non token annotations are sorted before tokens
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return True
            else:
                # Same kind: the longer annotation is sorted first
                return self.end > other.end
        else:
            return self.start < other.start
    def __le__(self, other):
        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return True
            else:
                return self.end >= other.end
        else:
            return self.start <= other.start
    def __eq__(self, other):
        # Equal only if start and end match and both or neither are tokens
        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return False
            else:
                return self.end == other.end
        else:
            return False
    def __ne__(self, other):
        return not self == other
    def __gt__(self, other):
        return not self <= other
    def __ge__(self, other):
        return not self < other
 class PropertyAnnotation:
    '''
    The value of one property attached to a tag annotation.

    attrs must provide 'property_id' and 'value'. lookup maps property ids
    to their definitions (objects exposing .name).

    Raises an Exception if the property id is not in lookup.
    '''

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.property_id = attrs['property_id']
        self.value = attrs['value']
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
        ''' Sanity checks '''
        if self.property_id not in self.lookup:
            raise Exception(f'Unknown property: {self.to_dict()}')

    @property
    def name(self):
        ''' Name of the property definition this annotation refers to. '''
        return self.lookup[self.property_id].name

    def to_dict(self):
        ''' Return the annotation as a plain dict. '''
        # Only attributes that are actually set on the instance are
        # exported; a property annotation does not store a tag id.
        return {
            'property_id': self.property_id,
            'value': self.value
        }
 class TagDefinition:
    '''
    Definition of a tag: id, name, optional description and the property
    definitions that belong to it, stored in a dict keyed by property id.
    '''

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        self.properties = {}
        # Register one by one so that duplicate ids are detected
        for property_attrs in attrs.get('properties', []):
            self.add_property_definition(property_attrs)

    def add_property_definition(self, attrs):
        ''' Create a property definition from attrs and register it. '''
        definition = PropertyDefinition(attrs)
        id_taken = definition.id in self.properties
        if id_taken:
            raise Exception(
                f'Property id already in use: {definition.to_dict()}')
        self.properties[definition.id] = definition

    # @property
    # def required_properties(self):
    #     return {property.id: property for property in self.properties.values()
    #             if property.is_required}

    def to_dict(self):
        ''' Return the definition, including its properties, as a dict. '''
        serialized_properties = {}
        for property_id, definition in self.properties.items():
            serialized_properties[property_id] = definition.to_dict()
        return {
            'id': self.id,
            'name': self.name,
            'description': self.description,
            'properties': serialized_properties
        }
 class PropertyDefinition:
    '''
    Definition of a property: id and name are mandatory, description, flags
    and labels are optional and default to '' and empty lists.
    '''

    def __init__(self, attrs):
        # Mandatory keys, a missing one raises a KeyError
        self.id, self.name = attrs['id'], attrs['name']
        # Optional keys with their defaults
        for key, default in (('description', ''), ('flags', []), ('labels', [])):
            setattr(self, key, attrs.get(key, default))

    # @property
    # def is_required(self):
    #     return 'required' in self.flags

    @property
    def has_multiple_values(self):
        ''' True if 'multiple' is contained in the flags. '''
        is_multiple = 'multiple' in self.flags
        return is_multiple

    def to_dict(self):
        ''' Return the definition as a plain dict. '''
        exported = ('id', 'name', 'description', 'flags', 'labels')
        return {field: getattr(self, field) for field in exported}
--- a/packages/stand-off-data-py/stand_off_data/init.py
+++ b/packages/stand-off-data-py/stand_off_data/init.py
@ -0,0 +1,2 @@
 # flake8: noqa
 from .models import StandOffData
--- a/416
+++ b/416
@ -2,118 +2,352 @@
 # coding=utf-8
 from argparse import ArgumentParser
 from xml.sax.saxutils import escape
 import chardet
 import hashlib
 import json
 import os
 import spacy
 import textwrap
 import uuid
-SPACY_MODELS = {'da': 'da_core_news_md',
+spacy_models = {
-                'de': 'de_core_news_md',
+    spacy.info(pipeline)['lang']: pipeline
-                'el': 'el_core_news_md',
+    for pipeline in spacy.info()['pipelines']
-                'en': 'en_core_web_md',
+}
                'es': 'es_core_news_md',
                'fr': 'fr_core_news_md',
                'it': 'it_core_news_md',
                'nl': 'nl_core_news_md',
                'pt': 'pt_core_news_md',
                'ru': 'ru_core_news_md',
                'zh': 'zh_core_web_md'}
 SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
 SPACY_VERSION = os.environ.get('SPACY_VERSION')
 # Parse the given arguments
-parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
+parser = ArgumentParser(
-                                     'as a verticalized text file.'))
+    description='Create annotations for a given plain txt file'
-parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
+)
-parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
+parser.add_argument(
-parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), required=True)  # noqa
+    '-i', '--input-file',
-parser.add_argument('--check-encoding', action='store_true')
+    help='Input file',
    required=True
 )
 parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
 )
 parser.add_argument(
    '-m', '--model',
    choices=spacy_models.keys(),
    help='The model to be used',
    required=True
 )
 parser.add_argument(
    '-c', '--check-encoding',
    action='store_true',
    help='Check encoding of the input file, UTF-8 is used instead'
 )
 parser.add_argument(
    '--id-prefix',
    default='',
    help='A prefix for all the ids within the stand off annotations'
 )
 args = parser.parse_args()
-# If requested: Check the encoding of the text contents from the input file
+def generate_id(name):
-# Else: Use utf-8
+    return f'{args.id_prefix}{uuid.uuid3(uuid.NAMESPACE_DNS, name)}'
 if args.check_encoding:
    with open(args.input, "rb") as input_file:
        bytes = input_file.read()
        encoding = chardet.detect(bytes)['encoding']
 else:
    encoding = 'utf-8'
-# hashing in chunks to avoid full RAM with huge files.
+with open(args.input_file, "rb") as input_file:
-with open(args.input, 'rb') as input_file:
+    if args.check_encoding:
-    source_md5 = hashlib.md5()
+        encoding = chardet.detect(input_file.read())['encoding']
-    for chunk in iter(lambda: input_file.read(128 * source_md5.block_size), b''):
+    else:
-        source_md5.update(chunk)
+        encoding = 'utf-8'
-    source_md5 = source_md5.hexdigest()
+    input_file.seek(0)
    text_md5 = hashlib.md5()
    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)
 # Load the text contents from the input file
-with open(args.input, encoding=encoding) as input_file:
+with open(args.input_file, encoding=encoding) as input_file:
-    text = input_file.read()
+    # spaCy NLP is limited to strings with a maximum of 1 million characters at
    # spaCys NLP is limited to strings with maximum 1 million characters at
    # once. So we split it into suitable chunks.
-    text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)
+    text_chunks = textwrap.wrap(
-    # the text variable potentially occupies a lot of system memory and is no
+        input_file.read(),
-    # longer needed...
+        1000000,
-    del text
+        break_long_words=False,
        break_on_hyphens=False,
        drop_whitespace=False,
        expand_tabs=False,
        replace_whitespace=False
    )
 model_name = spacy_models[args.model]
 nlp = spacy.load(model_name)
-# Setup the spaCy toolkit by loading the chosen language model
+meta = {
-model = SPACY_MODELS[args.language]
+    'generator': {
-nlp = spacy.load(model)
+        'name': 'nopaque spacy NLP',
        'version': '0.1.0',
        'arguments': {
            'check_encoding': args.check_encoding,
            'model': args.model
        }
    },
    'file': {
        'encoding': encoding,
        'md5': text_md5.hexdigest(),
        'name': os.path.basename(args.input_file)
    }
 }
 tags = []
 token = {
    'id': generate_id('token'),
    'name': 'token',
    'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',  # noqa
    'properties': []
 }
 # TODO: Check if all languages support token.sentiment
 token['properties'].append(
    {
        'id': generate_id('token.sentiment'),
        'name': 'sentiment',
        'description': 'A scalar value indicating the positivity or negativity of the token.'  # noqa
    }
 )
 if nlp.has_pipe('lemmatizer'):
    token['properties'].append(
        {
            'id': generate_id('token.lemma'),
            'name': 'lemma',
            'description': 'The base form of the word'
        }
    )
 if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
    # (label name, description) pairs of the simple part-of-speech tags.
    simple_pos_labels = (
        ('ADJ', 'adjective'),
        ('ADP', 'adposition'),
        ('ADV', 'adverb'),
        ('AUX', 'auxiliary verb'),
        ('CONJ', 'coordinating conjunction'),
        ('DET', 'determiner'),
        ('INTJ', 'interjection'),
        ('NOUN', 'noun'),
        ('NUM', 'numeral'),
        ('PART', 'particle'),
        ('PRON', 'pronoun'),
        ('PROPN', 'proper noun'),
        ('PUNCT', 'punctuation'),
        ('SCONJ', 'subordinating conjunction'),
        ('SYM', 'symbol'),
        ('VERB', 'verb'),
        ('X', 'other')
    )
    token['properties'].append(
        {
            'id': generate_id('token.simple_pos'),
            'name': 'simple_pos',
            'description': 'The simple UPOS part-of-speech tag',
            # Every label gets an id derived from its own name. Passing the
            # same name to generate_id for all labels would give them all
            # the same id.
            'labels': [
                {
                    'id': generate_id(f'token.simple_pos={label_name}'),
                    'name': label_name,
                    'description': label_description
                } for label_name, label_description in simple_pos_labels
            ]
        }
    )
 if nlp.has_pipe('tagger'):
    token['properties'].append(
        {
            'id': generate_id('token.pos'),
            'name': 'pos',
            'description': 'The detailed part-of-speech tag',
            'labels': [
                {
                    'id': generate_id(f'token.pos={label}'),
                    'name': label,
                    'description': spacy.explain(label) or ''
                } for label in spacy.info(model_name)['labels']['tagger']
            ]
        }
    )
 if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
    tags.append(
        {
            'id': generate_id('ent'),
            'name': 'ent',
            'description': 'Encodes the start and end of a named entity',
            'properties': [
                {
                    'id': generate_id('ent.type'),
                    'name': 'type',
                    'description': 'Label indicating the type of the entity',
                    'labels': [
                        {
                            'id': generate_id('ent.type={}'.format(label)),
                            'name': label,
                            'description': spacy.explain(label) or ''
                        } for label in spacy.info(model_name)['labels']['ner']
                    ]
                }
            ]
        }
    )
 if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):  # noqa
    # TODO: Check if all languages support sent.sentiment
    tags.append(
        {
            'id': generate_id('s'),
            'name': 's',
            'description': 'Encodes the start and end of a sentence',
            'properties': [
                {
                    'id': generate_id('s.sentiment'),
                    'name': 'sentiment',
                    'description': 'A scalar value indicating the positivity or negativity of the sentence.'  # noqa
                }
            ]
        }
    )
 tags.append(token)
-# Create the output file in verticalized text format
+annotations = []
 # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
 output_file_original_filename = args.output
 output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
 common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
              + '<corpus>\n'
              + '<text>\n'
              + '<nlp name="spaCy:{}"\n'.format(SPACY_VERSION)
              + '     model="{}:{}"\n'.format(model, SPACY_MODELS_VERSION)
              + '     source-md5="{}" />\n'.format(source_md5))
-with open(output_file_original_filename, 'w+') as output_file_original, \
+chunk_offset = 0
-     open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
+while text_chunks:
-
+    text_chunk = text_chunks.pop(0)
-    output_file_original.write(common_xml)
+    doc = nlp(text_chunk)
-    output_file_stand_off.write(common_xml)
+    if hasattr(doc, 'ents'):
-    text_offset = 0
+        for ent in doc.ents:
-    for text_chunk in text_chunks:
+            annotation = {
-        doc = nlp(text_chunk)
+                'start': ent.start_char + chunk_offset,
                'end': ent.end_char + chunk_offset,
                'tag_id': generate_id('ent'),
                'properties': [
                    {
                        'property_id': generate_id('ent.type'),
                        'value': ent.label_
                    }
                ]
            }
            annotations.append(annotation)
    if hasattr(doc, 'sents'):
        for sent in doc.sents:
-            output_file_original.write('<s>\n')
+            annotation = {
-            output_file_stand_off.write('<s>\n')
+                'start': sent.start_char + chunk_offset,
-            space_flag = False
+                'end': sent.end_char + chunk_offset,
-            # Skip whitespace tokens
+                'tag_id': generate_id('s'),
-            sent_no_space = [token for token in sent
+                'properties': []
-                             if not token.text.isspace()]
+            }
-            # No space variant for cwb original .vrt file input.
+            if hasattr(sent, 'sentiment'):
-            for token in sent_no_space:
+                annotation['properties'].append(
-                output_file_original.write('{}'.format(escape(token.text))
+                    {
-                                           + '\t{}'.format(escape(token.lemma_))
+                        'property_id': generate_id('s.sentiment'),
-                                           + '\t{}'.format(token.pos_)
+                        'value': sent.sentiment
-                                           + '\t{}'.format(token.tag_)
+                    }
-                                           + '\t{}\n'.format(token.ent_type_ or 'NULL'))
+                )
-            # Stand off variant with spaces.
+            annotations.append(annotation)
-            for token in sent:
+    for token in doc:
-                token_start = token.idx + text_offset
+        annotation = {
-                token_end = token.idx + len(token.text) + text_offset
+            'start': token.idx + chunk_offset,
-                output_file_stand_off.write('{}:{}'.format(token_start,
+            'end': token.idx + len(token.text) + chunk_offset,
-                                                           token_end)
+            'tag_id': generate_id('token'),
-                                            + '\t{}'.format(escape(token.lemma_))
+            'properties': []
-                                            + '\t{}'.format(token.pos_)
+        }
-                                            + '\t{}'.format(token.tag_)
+        if hasattr(token, 'lemma_'):
-                                            + '\t{}\n'.format(token.ent_type_ or 'NULL'))
+            annotation['properties'].append(
-            output_file_original.write('</s>\n')
+                {
-            output_file_stand_off.write('</s>\n')
+                    'property_id': generate_id('token.lemma'),
-        text_offset = token_end + 1
+                    'value': token.lemma_
-    output_file_original.write('</text>\n</corpus>')
+                }
-    output_file_stand_off.write('</text>\n</corpus>')
+            )
        if hasattr(token, 'pos_'):
            annotation['properties'].append(
                {
                    'property_id': generate_id('token.simple_pos'),
                    'value': token.pos_
                }
            )
        if hasattr(token, 'sentiment'):
            annotation['properties'].append(
                {
                    'property_id': generate_id('token.sentiment'),
                    'value': token.sentiment
                }
            )
        if hasattr(token, 'tag_'):
            annotation['properties'].append(
                {
                   'property_id': generate_id('token.pos'),
                   'value': token.tag_
                }
            )
        annotations.append(annotation)
    chunk_offset += len(text_chunk)
    text_chunk = None
 with open(args.output_file, 'w') as output_file:
    json.dump(
        {'meta': meta, 'tags': tags, 'annotations': annotations},
        output_file,
        indent=4
    )
--- a/45
+++ b/45
@ -0,0 +1,45 @@
 #!/usr/bin/env python3.7
 # coding=utf-8
 from argparse import ArgumentParser
 from stand_off_data import StandOffData
 import hashlib
 import json
 # Command line interface: all three file arguments are mandatory
 parser = ArgumentParser(
    description='Convert plain text and JSON stand off to a CWB vrt file'
 )
 for option_strings, help_text in (
    (('-s', '--stand-off-data-file'), 'JSON stand off data input file'),
    (('-t', '--text-file'), 'Plain text input file'),
    (('-o', '--output-file'), 'Output file')
 ):
    parser.add_argument(*option_strings, help=help_text, required=True)
 args = parser.parse_args()
 # Load the stand-off annotations from the JSON file
 with open(args.stand_off_data_file) as stand_off_data_file:
    stand_off_data = StandOffData(json.load(stand_off_data_file))
 # Hash the text file in chunks and compare the digest with the md5 that is
 # recorded in the stand-off data
 with open(args.text_file, 'rb') as text_file:
    text_md5 = hashlib.md5()
    chunk_size = 128 * text_md5.block_size
    while True:
        chunk = text_file.read(chunk_size)
        if chunk == b'':
            break
        text_md5.update(chunk)
    if text_md5.hexdigest() != stand_off_data.meta['file']['md5']:
        raise Exception('md5 not equal')
 # Read the text with the encoding recorded in the stand-off data
 encoding = stand_off_data.meta['file']['encoding']
 with open(args.text_file, encoding=encoding) as text_file:
    text = text_file.read()
 # Write the vrt representation of the annotated text
 with open(args.output_file, 'w') as output_file:
    output_file.write(stand_off_data.to_vrt(text))
--- a/wrapper/nlp
+++ b/wrapper/nlp
@ -4,30 +4,35 @@
 from argparse import ArgumentParser
 import os
 import subprocess
 import sys
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
 CONTAINER_LOG_DIR = '/logs'
 UID = str(os.getuid())
 GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
-parser.add_argument('-i', '--input-directory')
+parser.add_argument('-i', '--input-dir')
-parser.add_argument('-o', '--output-directory')
+parser.add_argument('-o', '--output-dir')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()
-cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
-if args.output_directory is not None:
+if args.input_dir is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
+    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
-                                 CONTAINER_OUTPUT_DIR)]
+    cmd += ['-v', mapping]
-    remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
+    remaining_args += ['-i', CONTAINER_INPUT_DIR]
-    remaining_args.insert(0, '-o')
+if args.output_dir is not None:
-if args.input_directory is not None:
+    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
+    cmd += ['-v', mapping]
-                                 CONTAINER_INPUT_DIR)]
+    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
-    remaining_args.insert(0, CONTAINER_INPUT_DIR)
+if args.log_dir is not None:
-    remaining_args.insert(0, '-i')
+    mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args
-subprocess.run(cmd)
+sys.exit(subprocess.run(cmd).returncode)
Author	SHA1	Message	Date
Patrick Jentsch	6f0545e48b	Mark required arguments in scripts as required	2022-02-03 10:40:25 +01:00
Patrick Jentsch	a2e8e72e54	Bump spaCy version, bugfixes, codestyle	2022-01-27 16:50:22 +01:00
Patrick Jentsch	29ccfac4f6	optimizations	2021-08-11 16:47:29 +02:00
Patrick Jentsch	0ba0c14b72	First attempt	2021-08-10 14:43:55 +02:00
Patrick Jentsch	66516eeb89	WIP use the new package	2021-08-06 16:50:22 +02:00
Patrick Jentsch	a4b2fc3a65	Create package for stand-off-data-py	2021-07-22 16:59:29 +02:00
Patrick Jentsch	4dea95a108	Preliminary work	2021-07-13 16:31:53 +02:00
Patrick Jentsch	5139fd9727	Fix problem where encoding is not set	2021-06-22 12:46:01 +02:00
Patrick Jentsch	fd39246e4b	Update file handling. Now md5 is correct	2021-05-18 10:26:03 +02:00
Patrick Jentsch	bd5d8ddedb	Fix problems caused by wrong textwrap.wrap usage	2021-04-30 09:44:35 +02:00
Patrick Jentsch	f7b7da2b1f	restrict memory usage for nlp tasks	2021-04-22 08:46:28 +02:00
Patrick Jentsch	2813d1a222	Fix long text processing	2021-04-22 08:43:34 +02:00
Patrick Jentsch	cd976692d6	Don't process files in subdirectories	2021-04-12 13:24:31 +02:00
Patrick Jentsch	4e7669d009	Return the returncode	2021-04-12 09:26:21 +02:00
Patrick Jentsch	8105edfd1b	Add missing argument to wrapper script	2021-04-12 09:20:28 +02:00
Patrick Jentsch	72409bd12d	Fix race condition	2021-03-26 14:48:38 +01:00
Patrick Jentsch	54f336e620	Fix permissions	2021-03-26 10:09:45 +01:00
Patrick Jentsch	3b570e5df1	more pipeline help tweaks	2021-03-26 10:02:14 +01:00
Patrick Jentsch	dc62755d12	Update README and pipeline help	2021-03-26 10:01:51 +01:00
Patrick Jentsch	aa1bfa259d	Use JSON files for stand-off annotations.	2021-03-26 09:46:17 +01:00
		`@ -0,0 +1,2 @@`
							`# flake8: noqa`
							`from .models import StandOffData`