mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2025-03-14 15:00:35 +00:00
Compare commits
64 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
6f0545e48b | ||
|
a2e8e72e54 | ||
|
29ccfac4f6 | ||
|
0ba0c14b72 | ||
|
66516eeb89 | ||
|
a4b2fc3a65 | ||
|
4dea95a108 | ||
|
5139fd9727 | ||
|
fd39246e4b | ||
|
bd5d8ddedb | ||
|
f7b7da2b1f | ||
|
2813d1a222 | ||
|
cd976692d6 | ||
|
4e7669d009 | ||
|
8105edfd1b | ||
|
72409bd12d | ||
|
54f336e620 | ||
|
3b570e5df1 | ||
|
dc62755d12 | ||
|
aa1bfa259d | ||
|
d620c29f27 | ||
|
2ced38504c | ||
|
f02c0953bf | ||
|
5329446277 | ||
|
15e373db58 | ||
|
8afdfb13b2 | ||
|
1ed42f68ad | ||
|
42583fea46 | ||
|
5bd0feda5c | ||
|
5980a995e5 | ||
|
fe7ab93513 | ||
|
91708308bc | ||
|
887e814020 | ||
|
3fc6ebff4c | ||
|
bef51b7d81 | ||
|
68e86338d4 | ||
|
30d127f3af | ||
|
e061a7426d | ||
|
41910afb79 | ||
|
5d2fee029e | ||
|
6e87e0decd | ||
|
79043f3dd7 | ||
|
1a3e4a0a02 | ||
|
504861ae07 | ||
|
88d03d4360 | ||
|
6769be049a | ||
|
ec2cf1dcff | ||
|
e4ef4835e5 | ||
|
5f20f9be40 | ||
|
b0a402b3ac | ||
|
543a1ba29a | ||
|
d5a2d38c17 | ||
|
4af9d9c899 | ||
|
de8160a5b6 | ||
|
d564ed0464 | ||
|
abf6c430c3 | ||
|
19426a4c78 | ||
|
a32184db5c | ||
|
a16b010bdc | ||
|
af293d6141 | ||
|
43717de313 | ||
|
48fb20ae6b | ||
|
2f57b1a0af | ||
|
e68d5c39ee |
@ -1,42 +1,68 @@
|
|||||||
image: docker:latest
|
image: docker:19.03.13
|
||||||
|
|
||||||
services:
|
services:
|
||||||
- docker:dind
|
- docker:19.03.13-dind
|
||||||
|
|
||||||
stages:
|
stages:
|
||||||
- build
|
- build
|
||||||
- push
|
- push
|
||||||
|
|
||||||
before_script:
|
variables:
|
||||||
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
DOCKER_TLS_CERTDIR: "/certs"
|
||||||
|
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
|
||||||
|
|
||||||
Build:
|
.reg_setup:
|
||||||
|
before_script:
|
||||||
|
- apk add --no-cache curl
|
||||||
|
- curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
|
||||||
|
- echo "$REG_SHA256 /usr/local/bin/reg" | sha256sum -c -
|
||||||
|
- chmod a+x /usr/local/bin/reg
|
||||||
|
variables:
|
||||||
|
REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
|
||||||
|
REG_VERSION: 0.16.1
|
||||||
|
|
||||||
|
build_image:
|
||||||
script:
|
script:
|
||||||
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
||||||
- docker build --pull -t $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA .
|
- docker build -t $INTERMEDIATE_IMAGE_TAG .
|
||||||
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
|
- docker push $INTERMEDIATE_IMAGE_TAG
|
||||||
stage: build
|
stage: build
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
|
||||||
Push latest:
|
push_master:
|
||||||
|
extends:
|
||||||
|
- .reg_setup
|
||||||
only:
|
only:
|
||||||
- master
|
- master
|
||||||
script:
|
script:
|
||||||
- docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
|
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
||||||
- docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA $CI_REGISTRY_IMAGE:latest
|
- docker pull $INTERMEDIATE_IMAGE_TAG
|
||||||
- docker push $CI_REGISTRY_IMAGE:latest
|
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
|
||||||
|
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
|
||||||
|
- docker push $IMAGE_TAG
|
||||||
stage: push
|
stage: push
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
variables:
|
||||||
|
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
|
||||||
|
|
||||||
Push tag:
|
push_other:
|
||||||
|
extends:
|
||||||
|
- .reg_setup
|
||||||
|
except:
|
||||||
|
- master
|
||||||
only:
|
only:
|
||||||
|
- branches
|
||||||
- tags
|
- tags
|
||||||
script:
|
script:
|
||||||
- docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
|
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
||||||
- docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
- docker pull $INTERMEDIATE_IMAGE_TAG
|
||||||
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
|
||||||
|
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
|
||||||
|
- docker push $IMAGE_TAG
|
||||||
stage: push
|
stage: push
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
variables:
|
||||||
|
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
||||||
|
88
Dockerfile
88
Dockerfile
@ -1,43 +1,67 @@
|
|||||||
FROM debian:9-slim
|
FROM debian:buster-slim
|
||||||
|
|
||||||
|
|
||||||
|
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
|
||||||
|
|
||||||
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
|
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
ENV LANG=C.UTF-8
|
ENV LANG=C.UTF-8
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends \
|
RUN apt-get update \
|
||||||
build-essential \
|
&& apt-get install --no-install-recommends --yes \
|
||||||
ca-certificates \
|
procps \
|
||||||
python2.7 \
|
python3.7 \
|
||||||
python3.5 \
|
|
||||||
python3-dev \
|
|
||||||
python3-pip \
|
python3-pip \
|
||||||
python3-setuptools \
|
wget \
|
||||||
wget
|
&& python3 -m pip install \
|
||||||
|
chardet \
|
||||||
|
setuptools \
|
||||||
|
wheel
|
||||||
|
|
||||||
# Install pyFlow
|
# Install the NLP pipeline and it's dependencies #
|
||||||
ENV PYFLOW_VERSION 1.1.20
|
## Install pyFlow ##
|
||||||
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
|
ENV PYFLOW_VERSION=1.1.20
|
||||||
tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
|
RUN wget --no-check-certificate --quiet \
|
||||||
cd pyflow-"$PYFLOW_VERSION" && \
|
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
|
||||||
python2.7 setup.py build install && \
|
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
|
||||||
cd .. && \
|
&& cd "pyflow-${PYFLOW_VERSION}" \
|
||||||
rm -r pyflow-"$PYFLOW_VERSION" pyflow-"$PYFLOW_VERSION".tar.gz
|
&& apt-get install --no-install-recommends --yes \
|
||||||
|
python2.7 \
|
||||||
|
&& python2.7 setup.py build install \
|
||||||
|
&& cd - > /dev/null \
|
||||||
|
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
|
||||||
|
|
||||||
# Install spaCy
|
|
||||||
RUN pip3 install wheel && pip3 install -U spacy && \
|
|
||||||
python3.5 -m spacy download de && \
|
|
||||||
python3.5 -m spacy download en && \
|
|
||||||
python3.5 -m spacy download es && \
|
|
||||||
python3.5 -m spacy download fr && \
|
|
||||||
python3.5 -m spacy download it && \
|
|
||||||
python3.5 -m spacy download pt
|
|
||||||
|
|
||||||
COPY nlp /usr/local/bin
|
## Install spaCy ##
|
||||||
COPY spacy_nlp /usr/local/bin
|
ENV SPACY_VERSION=3.2.1
|
||||||
|
RUN apt-get install --no-install-recommends --yes \
|
||||||
|
python3.7 \
|
||||||
|
python3-pip \
|
||||||
|
&& pip3 install \
|
||||||
|
chardet \
|
||||||
|
setuptools \
|
||||||
|
wheel \
|
||||||
|
&& pip3 install --upgrade pip \
|
||||||
|
&& pip3 install "spacy==${SPACY_VERSION}"
|
||||||
|
|
||||||
|
|
||||||
|
ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,pl_core_news_md,zh_core_web_md"
|
||||||
|
ENV SPACY_MODELS_VERSION=3.2.0
|
||||||
|
RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done
|
||||||
|
|
||||||
|
|
||||||
|
COPY packages .
|
||||||
|
RUN cd stand-off-data-py \
|
||||||
|
&& python3 -m pip install . \
|
||||||
|
&& cd -
|
||||||
|
|
||||||
|
|
||||||
|
## Install Pipeline ##
|
||||||
|
COPY nlp spacy-nlp vrt-creator /usr/local/bin/
|
||||||
|
|
||||||
|
|
||||||
|
RUN rm -r /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN mkdir /input /output && \
|
|
||||||
chmod a+rw /input /output
|
|
||||||
|
|
||||||
ENTRYPOINT ["nlp"]
|
ENTRYPOINT ["nlp"]
|
||||||
|
CMD ["--help"]
|
||||||
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
107
README.md
107
README.md
@ -1,74 +1,41 @@
|
|||||||
# Natural language processing
|
# NLP - Natural Language Processing
|
||||||
|
|
||||||
This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io).
|
This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
|
||||||
|
|
||||||
## Build image
|
## Software used in this pipeline implementation
|
||||||
|
|
||||||
1. Clone this repository and navigate into it:
|
- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
|
||||||
|
- Software from Debian Buster's free repositories
|
||||||
|
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
|
||||||
|
- spaCy (3.2.1): https://github.com/explosion/spaCy/releases/tag/v3.2.1
|
||||||
|
- spaCy medium sized models (3.2.0):
|
||||||
|
- https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.2.0
|
||||||
|
- https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.2.0
|
||||||
|
- https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.2.0
|
||||||
|
- https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.2.0
|
||||||
|
- https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.2.0
|
||||||
|
- https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.2.0
|
||||||
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
1. Install Docker and Python 3.
|
||||||
|
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git`
|
||||||
|
3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0 nlp`
|
||||||
|
4. Add the wrapper script (`wrapper/nlp` relative to this README file) to your `${PATH}`.
|
||||||
|
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,output}`.
|
||||||
|
|
||||||
|
|
||||||
|
## Use the Pipeline
|
||||||
|
|
||||||
|
1. Place your plain text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
|
||||||
|
2. Clear your `/<my_data_location>/output` directory.
|
||||||
|
3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details.
|
||||||
|
```bash
|
||||||
|
cd /<my_data_location>
|
||||||
|
nlp \
|
||||||
|
--input-dir input \
|
||||||
|
--output-dir output \
|
||||||
|
-m <model_code> <optional_pipeline_arguments>
|
||||||
```
|
```
|
||||||
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp
|
4. Check your results in the `/<my_data_location>/output` directory.
|
||||||
```
|
|
||||||
|
|
||||||
2. Build image:
|
|
||||||
```
|
|
||||||
docker build -t sfb1288inf/nlp:latest .
|
|
||||||
```
|
|
||||||
|
|
||||||
Alternatively build from the GitLab repository without cloning:
|
|
||||||
|
|
||||||
1. Build image:
|
|
||||||
```
|
|
||||||
docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
|
|
||||||
```
|
|
||||||
|
|
||||||
## Download prebuilt image
|
|
||||||
|
|
||||||
The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
|
|
||||||
|
|
||||||
1. Download image:
|
|
||||||
```
|
|
||||||
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
## Run
|
|
||||||
|
|
||||||
1. Create input and output directories for the NLP software:
|
|
||||||
```
|
|
||||||
mkdir -p /<mydatalocation>/files_for_nlp /<mydatalocation>/files_from_nlp
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Place your text files inside the `/<mydatalocation>/files_for_nlp` directory. Files should all contain text of the same language.
|
|
||||||
|
|
||||||
3. Start the NLP process.
|
|
||||||
```
|
|
||||||
docker run \
|
|
||||||
--rm \
|
|
||||||
-it \
|
|
||||||
-u $(id -u $USER):$(id -g $USER) \
|
|
||||||
-v /<mydatalocation>/files_for_nlp:/input \
|
|
||||||
-v /<mydatalocation>/files_from_nlp:/output \
|
|
||||||
sfb1288inf/nlp:latest \
|
|
||||||
-i /input \
|
|
||||||
-l <languagecode> \
|
|
||||||
-o /output
|
|
||||||
```
|
|
||||||
The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part.
|
|
||||||
|
|
||||||
If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`.
|
|
||||||
|
|
||||||
4. Check your results in the `/<mydatalocation>/files_from_nlp` directory.
|
|
||||||
|
|
||||||
### NLP arguments
|
|
||||||
|
|
||||||
`-i path`
|
|
||||||
* Sets the input directory using the specified path.
|
|
||||||
* required = True
|
|
||||||
|
|
||||||
`-o path`
|
|
||||||
* Sets the output directory using the specified path.
|
|
||||||
* required = True
|
|
||||||
|
|
||||||
`-l languagecode`
|
|
||||||
* Tells spaCy which language will be used.
|
|
||||||
* options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese)
|
|
||||||
* required = True
|
|
||||||
|
351
nlp
351
nlp
@ -1,133 +1,288 @@
|
|||||||
#!/usr/bin/env python2.7
|
#!/usr/bin/env python2.7
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
"""
|
''' A NLP pipeline for text file processing. '''
|
||||||
nlp
|
__version__ = '0.1.0'
|
||||||
|
|
||||||
Usage: For usage instructions run with option --help
|
from argparse import ArgumentParser
|
||||||
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
|
from pyflow import WorkflowRunner
|
||||||
"""
|
import json
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import multiprocessing
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from pyflow import WorkflowRunner
|
|
||||||
|
|
||||||
|
|
||||||
def parse_arguments():
|
SPACY_MODELS = {
|
||||||
parser = argparse.ArgumentParser(
|
'de': 'de_core_news_md',
|
||||||
description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.'
|
'en': 'en_core_web_md',
|
||||||
)
|
'it': 'it_core_news_md',
|
||||||
|
'nl': 'nl_core_news_md',
|
||||||
|
'pl': 'pl_core_news_md',
|
||||||
|
'zh': 'zh_core_web_md'
|
||||||
|
}
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
'-i',
|
class PipelineJob:
|
||||||
dest='input_dir',
|
'''
|
||||||
required=True
|
NLP pipeline job class.
|
||||||
)
|
|
||||||
parser.add_argument(
|
Each plain text input file of the pipeline is represented as an NLP
|
||||||
'-l',
|
pipeline job, which holds all necessary information for the pipeline to
|
||||||
choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
|
process it.
|
||||||
dest='lang',
|
|
||||||
required=True
|
Arguments:
|
||||||
)
|
file -- Path to the file
|
||||||
parser.add_argument(
|
output_dir -- Path to a directory, where job results are stored
|
||||||
'-o',
|
'''
|
||||||
dest='output_dir',
|
|
||||||
required=True
|
def __init__(self, file, output_dir):
|
||||||
)
|
self.file = file
|
||||||
parser.add_argument(
|
self.name = os.path.basename(file)[:-4]
|
||||||
'--nCores',
|
self.output_dir = output_dir
|
||||||
default=min(4, multiprocessing.cpu_count()),
|
|
||||||
dest='n_cores',
|
|
||||||
help='total number of cores available',
|
|
||||||
required=False,
|
|
||||||
type=int
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
class NLPWorkflow(WorkflowRunner):
|
class NLPWorkflow(WorkflowRunner):
|
||||||
def __init__(self, args):
|
def __init__(self, job, model, check_encoding=False, id_prefix=''):
|
||||||
self.jobs = analyze_jobs(args.input_dir, args.output_dir)
|
self.job = job
|
||||||
self.lang = args.lang
|
self.model = model
|
||||||
self.n_cores = args.n_cores
|
self.check_encoding = check_encoding
|
||||||
|
self.id_prefix = id_prefix
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
if len(self.jobs) == 0:
|
'''
|
||||||
|
' ##################################################
|
||||||
|
' # spacy #
|
||||||
|
' ##################################################
|
||||||
|
'''
|
||||||
|
n_cores = 1
|
||||||
|
mem_mb = min(1024, self.getMemMb())
|
||||||
|
cmd = 'spacy-nlp'
|
||||||
|
cmd += ' --input-file "{}"'.format(self.job.file)
|
||||||
|
cmd += ' --output-file "{}"'.format(
|
||||||
|
os.path.join(self.job.output_dir, '{}.json'.format(self.job.name))
|
||||||
|
)
|
||||||
|
cmd += ' -m "{}"'.format(self.model)
|
||||||
|
if self.check_encoding:
|
||||||
|
cmd += ' --check-encoding'
|
||||||
|
cmd += ' --id-prefix "{}"'.format(self.id_prefix)
|
||||||
|
self.addTask(
|
||||||
|
'spacy',
|
||||||
|
command=cmd,
|
||||||
|
memMb=mem_mb,
|
||||||
|
nCores=n_cores
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class CreateVrtWorkflow(WorkflowRunner):
|
||||||
|
def __init__(self, job):
|
||||||
|
self.job = job
|
||||||
|
|
||||||
|
def workflow(self):
|
||||||
|
'''
|
||||||
|
' ##################################################
|
||||||
|
' # vrt-creator #
|
||||||
|
' ##################################################
|
||||||
|
'''
|
||||||
|
n_cores = 1
|
||||||
|
mem_mb = min(256, self.getMemMb())
|
||||||
|
cmd = 'vrt-creator'
|
||||||
|
cmd += ' --stand-off-data-file "{}"'.format(
|
||||||
|
os.path.join(self.job.output_dir, '{}.json'.format(self.job.name))
|
||||||
|
)
|
||||||
|
cmd += ' --text-file "{}"'.format(self.job.file)
|
||||||
|
cmd += ' --output-file "{}"'.format(
|
||||||
|
os.path.join(self.job.output_dir, '{}.vrt'.format(self.job.name))
|
||||||
|
)
|
||||||
|
self.addTask(
|
||||||
|
'vrt_creator',
|
||||||
|
command=cmd,
|
||||||
|
memMb=mem_mb,
|
||||||
|
nCores=n_cores
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class MainWorkflow(WorkflowRunner):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
input_dir,
|
||||||
|
model,
|
||||||
|
output_dir,
|
||||||
|
check_encoding=False,
|
||||||
|
id_prefix=''
|
||||||
|
):
|
||||||
|
self.input_dir = input_dir
|
||||||
|
self.model = model
|
||||||
|
self.output_dir = output_dir
|
||||||
|
self.check_encoding = check_encoding
|
||||||
|
self.id_prefix = id_prefix
|
||||||
|
self.jobs = []
|
||||||
|
|
||||||
|
def collect_jobs(self):
|
||||||
|
self.jobs = []
|
||||||
|
for file in os.listdir(self.input_dir):
|
||||||
|
if os.path.isdir(os.path.join(self.input_dir, file)):
|
||||||
|
continue
|
||||||
|
if not file.lower().endswith('.txt'):
|
||||||
|
continue
|
||||||
|
job = PipelineJob(
|
||||||
|
os.path.join(self.input_dir, file),
|
||||||
|
os.path.join(self.output_dir, file)
|
||||||
|
)
|
||||||
|
self.jobs.append(job)
|
||||||
|
|
||||||
|
def workflow(self):
|
||||||
|
if not self.jobs:
|
||||||
return
|
return
|
||||||
|
|
||||||
'''
|
# Create output and temporary directories
|
||||||
' ##################################################
|
for job in self.jobs:
|
||||||
' # Create output directories #
|
os.mkdir(job.output_dir)
|
||||||
' ##################################################
|
|
||||||
'''
|
|
||||||
create_output_directories_jobs = []
|
|
||||||
for index, job in enumerate(self.jobs):
|
|
||||||
cmd = 'mkdir -p "%s"' % (job['output_dir'])
|
|
||||||
create_output_directories_jobs.append(
|
|
||||||
self.addTask(
|
|
||||||
command=cmd,
|
|
||||||
label='create_output_directories_job_-_%i' % (index)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
' # Natural language processing #
|
' # nlp #
|
||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
nlp_jobs = []
|
nlp_tasks = []
|
||||||
nlp_job_n_cores = min(
|
for i, job in enumerate(self.jobs):
|
||||||
self.n_cores,
|
task = self.addWorkflowTask(
|
||||||
max(1, int(self.n_cores / len(self.jobs)))
|
'nlp_-_{}'.format(i),
|
||||||
)
|
NLPWorkflow(
|
||||||
for index, job in enumerate(self.jobs):
|
job,
|
||||||
cmd = 'spacy_nlp -l "%s" "%s" "%s"' % (
|
self.model,
|
||||||
self.lang,
|
check_encoding=self.check_encoding,
|
||||||
job['path'],
|
id_prefix=self.id_prefix
|
||||||
os.path.join(job['output_dir'], job['name'] + '.vrt')
|
|
||||||
)
|
|
||||||
nlp_jobs.append(
|
|
||||||
self.addTask(
|
|
||||||
command=cmd,
|
|
||||||
dependencies='create_output_directories_job_-_%i' % (index),
|
|
||||||
label='nlp_job_-_%i' % (index),
|
|
||||||
nCores=nlp_job_n_cores
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
nlp_tasks.append(task)
|
||||||
|
|
||||||
|
'''
|
||||||
def analyze_jobs(input_dir, output_dir):
|
' ##################################################
|
||||||
jobs = []
|
' # create vrt #
|
||||||
|
' ##################################################
|
||||||
for file in os.listdir(input_dir):
|
'''
|
||||||
if os.path.isdir(os.path.join(input_dir, file)):
|
create_vrt_tasks = []
|
||||||
jobs += analyze_jobs(
|
for i, job in enumerate(self.jobs):
|
||||||
os.path.join(input_dir, file),
|
task = self.addWorkflowTask(
|
||||||
os.path.join(output_dir, file),
|
'create_vrt_-_{}'.format(i),
|
||||||
|
CreateVrtWorkflow(job),
|
||||||
|
dependencies='nlp_-_{}'.format(i)
|
||||||
)
|
)
|
||||||
elif file.endswith('.txt'):
|
create_vrt_tasks.append(task)
|
||||||
jobs.append(
|
|
||||||
|
self.waitForTasks()
|
||||||
|
outputs = []
|
||||||
|
for job in self.jobs:
|
||||||
|
# Track output files
|
||||||
|
relative_output_dir = os.path.relpath(
|
||||||
|
job.output_dir,
|
||||||
|
start=self.output_dir
|
||||||
|
)
|
||||||
|
outputs.append(
|
||||||
{
|
{
|
||||||
'filename': file,
|
'description': 'JSON stand off data',
|
||||||
'name': file.rsplit('.', 1)[0],
|
'file': os.path.join(
|
||||||
'output_dir': os.path.join(output_dir, file),
|
relative_output_dir,
|
||||||
'path': os.path.join(input_dir, file)
|
'{}.json'.format(job.name)
|
||||||
|
),
|
||||||
|
'mimetype': 'application/json'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
outputs.append(
|
||||||
|
{
|
||||||
|
'description': 'CWB vrt file',
|
||||||
|
'file': os.path.join(
|
||||||
|
relative_output_dir,
|
||||||
|
'{}.vrt'.format(job.name)
|
||||||
|
),
|
||||||
|
'mimetype': 'application/vrt+xml'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
|
||||||
|
json.dump(outputs, f, indent=4)
|
||||||
|
|
||||||
return jobs
|
|
||||||
|
def parse_args():
|
||||||
|
parser = ArgumentParser(
|
||||||
|
description='NLP pipeline for plain text file processing'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-i', '--input-dir',
|
||||||
|
help='Input directory',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-o', '--output-dir',
|
||||||
|
help='Output directory',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-m', '--model',
|
||||||
|
choices=SPACY_MODELS.keys(),
|
||||||
|
help='The model to be used',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--check-encoding',
|
||||||
|
action='store_true',
|
||||||
|
help='Check encoding of the input file, UTF-8 is used instead'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--id-prefix',
|
||||||
|
default='',
|
||||||
|
help='A prefix for all the ids within the stand off annotations'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--log-dir',
|
||||||
|
help='Logging directory (Default: --output-dir)'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--mem-mb',
|
||||||
|
help='Amount of system memory to be used '
|
||||||
|
'(Default: min(--n-cores * 1024, available system memory))',
|
||||||
|
type=int
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--n-cores',
|
||||||
|
default=1,
|
||||||
|
help='Number of CPU threads to be used',
|
||||||
|
type=int
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-v', '--version',
|
||||||
|
action='version',
|
||||||
|
help='Returns the current version of the NLP pipeline',
|
||||||
|
version='%(prog)s {}'.format(__version__)
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Set some tricky default values and check for insufficient input
|
||||||
|
if args.log_dir is None:
|
||||||
|
args.log_dir = args.output_dir
|
||||||
|
if args.n_cores < 1:
|
||||||
|
raise Exception('--n-cores must be greater or equal 1')
|
||||||
|
if args.mem_mb is None:
|
||||||
|
max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
|
||||||
|
args.mem_mb = min(args.n_cores * 1024, max_mem_mb)
|
||||||
|
if args.mem_mb < 1024:
|
||||||
|
raise Exception('--mem-mb must be greater or equal 1024')
|
||||||
|
return args
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
args = parse_arguments()
|
args = parse_args()
|
||||||
|
main_workflow = MainWorkflow(
|
||||||
wflow = NLPWorkflow(args)
|
args.input_dir,
|
||||||
|
args.model,
|
||||||
retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
|
args.output_dir,
|
||||||
|
check_encoding=args.check_encoding,
|
||||||
|
id_prefix=args.id_prefix
|
||||||
|
)
|
||||||
|
main_workflow.collect_jobs()
|
||||||
|
retval = main_workflow.run(
|
||||||
|
dataDirRoot=args.log_dir,
|
||||||
|
memMb=args.mem_mb,
|
||||||
|
nCores=args.n_cores
|
||||||
|
)
|
||||||
sys.exit(retval)
|
sys.exit(retval)
|
||||||
|
|
||||||
|
|
||||||
|
14
packages/stand-off-data-py/setup.py
Normal file
14
packages/stand-off-data-py/setup.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
import setuptools
|
||||||
|
|
||||||
|
setuptools.setup(
|
||||||
|
name='Stand off data',
|
||||||
|
author='Patrick Jentsch',
|
||||||
|
author_email='p.jentsch@uni-bielefeld.de',
|
||||||
|
description='A python library to handle stand off data.',
|
||||||
|
py_modules=['stand_off_data'],
|
||||||
|
classifiers=[
|
||||||
|
'Programming Language :: Python :: 3',
|
||||||
|
'Operating System :: OS Independent',
|
||||||
|
],
|
||||||
|
python_requires='>=3.5'
|
||||||
|
)
|
282
packages/stand-off-data-py/stand_off_data.py
Normal file
282
packages/stand-off-data-py/stand_off_data.py
Normal file
@ -0,0 +1,282 @@
|
|||||||
|
from xml.sax.saxutils import escape
|
||||||
|
|
||||||
|
|
||||||
|
class StandOffData:
|
||||||
|
def __init__(self, attrs={}):
|
||||||
|
self.meta = attrs.get('meta', {})
|
||||||
|
self.lookup = {}
|
||||||
|
for x in attrs.get('tags', []):
|
||||||
|
self.add_tag_definition(x)
|
||||||
|
self.annotations = [
|
||||||
|
TagAnnotation(x, self.lookup)
|
||||||
|
for x in attrs.get('annotations', [])
|
||||||
|
]
|
||||||
|
|
||||||
|
def add_tag_definition(self, attrs):
|
||||||
|
tag_definition = TagDefinition(attrs)
|
||||||
|
if tag_definition.id in self.lookup:
|
||||||
|
raise Exception(f'Tag id already in use: {self.to_dict()}')
|
||||||
|
self.lookup[tag_definition.id] = tag_definition
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
return {
|
||||||
|
'meta': self.meta,
|
||||||
|
'lookup': {k: v.to_dict() for k, v in self.lookup.items()},
|
||||||
|
'annotations': [x.to_dict() for x in self.annotations]
|
||||||
|
}
|
||||||
|
|
||||||
|
def to_vrt(self, text):
|
||||||
|
# Devide annotations into CWB's verticalized text format (.vrt) logic
|
||||||
|
p_attrs = [] # positional attributes
|
||||||
|
s_attrs = [] # structural attributes
|
||||||
|
for annotation in self.annotations:
|
||||||
|
if annotation.name == 'token':
|
||||||
|
p_attrs.append(annotation)
|
||||||
|
else:
|
||||||
|
s_attrs.append(annotation)
|
||||||
|
# Sort annotations, necessary for the next checks
|
||||||
|
p_attrs.sort()
|
||||||
|
s_attrs.sort()
|
||||||
|
# Check for p_attr<->p_attr overlap
|
||||||
|
for i, p_attr in enumerate(p_attrs[:-1]):
|
||||||
|
next_p_attr = p_attrs[i + 1]
|
||||||
|
# Check if first_p_attr starts/ends within second_p_attr
|
||||||
|
if ((p_attr.start >= next_p_attr.start) and (p_attr.start < next_p_attr.end) # noqa
|
||||||
|
or (p_attr.end > next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa
|
||||||
|
raise Exception(
|
||||||
|
'Positional attribute overlaps another: '
|
||||||
|
f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}'
|
||||||
|
)
|
||||||
|
# Check for s_attr<->p_attr overlap
|
||||||
|
for i, s_attr in enumerate(s_attrs):
|
||||||
|
for p_attr in p_attrs:
|
||||||
|
# Check if s_attr starts within p_attr
|
||||||
|
if s_attr.start > p_attr.start and s_attr.start < p_attr.end:
|
||||||
|
# Change s_attr start to p_attr's start
|
||||||
|
s_attrs[i].start = p_attr.start
|
||||||
|
# Check if s_attr ends within p_attr
|
||||||
|
if s_attr.end < p_attr.end and s_attr.end > p_attr.start:
|
||||||
|
# Change s_attr end to p_attr's end
|
||||||
|
s_attrs[i].end = p_attr.end
|
||||||
|
# Check if s_attr starts/ends before/after p_attr
|
||||||
|
if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start:
|
||||||
|
# No further Checking needed (because p_attrs are sorted)
|
||||||
|
break
|
||||||
|
p_attr_buffer = {}
|
||||||
|
for i, p_attr in enumerate(p_attrs):
|
||||||
|
p_attr_buffer[p_attr.start] = i
|
||||||
|
s_attr_start_buffer = {}
|
||||||
|
s_attr_end_buffer = {}
|
||||||
|
for i, s_attr in enumerate(s_attrs):
|
||||||
|
if s_attr.start in s_attr_start_buffer:
|
||||||
|
s_attr_start_buffer[s_attr.start].append(i)
|
||||||
|
else:
|
||||||
|
s_attr_start_buffer[s_attr.start] = [i]
|
||||||
|
if s_attr.end in s_attr_end_buffer:
|
||||||
|
s_attr_end_buffer[s_attr.end].insert(0, i)
|
||||||
|
else:
|
||||||
|
s_attr_end_buffer[s_attr.end] = [i]
|
||||||
|
vrt = ''
|
||||||
|
vrt += '<text>\n'
|
||||||
|
current_position = 0
|
||||||
|
text_len = len(text)
|
||||||
|
# As long as we have something in our buffers we process it
|
||||||
|
while current_position <= text_len:
|
||||||
|
# s_attr endings
|
||||||
|
# for k in {k: v for k, v in s_attr_end_buffer.items() if k <= current_position}: # noqa
|
||||||
|
if current_position in s_attr_end_buffer:
|
||||||
|
# s_attr_indexes = s_attr_end_buffer.pop(k)
|
||||||
|
s_attr_indexes = s_attr_end_buffer.pop(current_position)
|
||||||
|
for s_attr_index in s_attr_indexes:
|
||||||
|
s_attr = s_attrs[s_attr_index]
|
||||||
|
vrt += f'</{escape(s_attr.name)}>\n'
|
||||||
|
# s_attrs starts
|
||||||
|
# for k in {k: v for k, v in s_attr_start_buffer.items() if k <= current_position}: # noqa
|
||||||
|
if current_position in s_attr_start_buffer:
|
||||||
|
# s_attr_indexes = s_attr_start_buffer.pop(k)
|
||||||
|
s_attr_indexes = s_attr_start_buffer.pop(current_position)
|
||||||
|
for s_attr_index in s_attr_indexes:
|
||||||
|
s_attr = s_attrs[s_attr_index]
|
||||||
|
vrt += f'<{escape(s_attr.name)}'
|
||||||
|
for property in s_attr.properties:
|
||||||
|
vrt += f' {escape(property.name)}="{escape(str(property.value))}"' # noqa
|
||||||
|
vrt += '>\n'
|
||||||
|
# p_attrs
|
||||||
|
if current_position not in p_attr_buffer:
|
||||||
|
current_position += 1
|
||||||
|
continue
|
||||||
|
p_attr_index = p_attr_buffer.pop(current_position)
|
||||||
|
p_attr = p_attrs[p_attr_index]
|
||||||
|
if text[p_attr.start:p_attr.end].isspace():
|
||||||
|
current_position = p_attr.end
|
||||||
|
continue
|
||||||
|
_p_attr = {
|
||||||
|
'lemma': 'None',
|
||||||
|
'pos': 'None',
|
||||||
|
'simple_pos': 'None',
|
||||||
|
'word': 'None'
|
||||||
|
}
|
||||||
|
for property in p_attr.properties:
|
||||||
|
if property.name not in _p_attr:
|
||||||
|
continue
|
||||||
|
_p_attr[property.name] = escape(str(property.value))
|
||||||
|
_p_attr['word'] = escape(text[p_attr.start:p_attr.end])
|
||||||
|
vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**_p_attr)
|
||||||
|
current_position = p_attr.end
|
||||||
|
vrt += '</text>\n'
|
||||||
|
return vrt
|
||||||
|
|
||||||
|
|
||||||
|
class TagAnnotation:
    """A tag annotation covering the span [start, end) of the source text.

    ``lookup`` maps tag ids to ``TagDefinition`` objects; it is used to
    resolve the annotation's display name and to validate the annotation
    and its property annotations.

    Raises ``Exception`` if the tag id is unknown or the span is inverted.
    """

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.tag_id = attrs['tag_id']
        self.start = attrs['start']
        self.end = attrs['end']
        ''' Sanity checks '''
        # Validate BEFORE building the property list below: that list
        # comprehension dereferences self.lookup[self.tag_id], which for an
        # unknown tag would raise a bare KeyError instead of this clear
        # error. self.properties is pre-set so to_dict() works in the
        # exception messages.
        self.properties = []
        if self.tag_id not in self.lookup:
            raise Exception(f'Unknown tag: {self.to_dict()}')
        if self.end < self.start:
            raise Exception(f'Annotation end less then start: {self.to_dict()}')  # noqa
        self.properties = [
            PropertyAnnotation(x, self.lookup[self.tag_id].properties)
            for x in attrs.get('properties', [])
        ]
        # property_ids = [x.property_id for x in self.properties]
        # for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items():  # noqa
        #     if required_property_id not in property_ids:
        #         raise Exception(
        #             f'Missing required property: {required_property.to_dict()}'
        #         )

    @property
    def name(self):
        # The display name lives on the referenced tag definition.
        return self.lookup[self.tag_id].name

    def to_dict(self):
        """Serialize the annotation (and its properties) to plain dicts."""
        return {
            'tag_id': self.tag_id,
            'start': self.start,
            'end': self.end,
            'properties': [x.to_dict() for x in self.properties]
        }

    # Ordering convention for annotations that start at the same position:
    # non-token tags sort before 'token' tags (structural tags open first),
    # otherwise the longer span (greater end) comes first.

    def __lt__(self, other):
        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return True
            else:
                return self.end > other.end
        else:
            return self.start < other.start

    def __le__(self, other):
        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return True
            else:
                return self.end >= other.end
        else:
            return self.start <= other.start

    def __eq__(self, other):
        # A 'token' annotation is never equal to a non-token annotation,
        # even with an identical span.
        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return False
            else:
                return self.end == other.end
        else:
            return False

    def __ne__(self, other):
        return not self == other

    def __gt__(self, other):
        return not self <= other

    def __ge__(self, other):
        return not self < other
class PropertyAnnotation:
    """A single property value attached to a tag annotation.

    ``lookup`` maps property ids to ``PropertyDefinition`` objects; it is
    used to resolve the property's display name and to validate the
    property id.

    Raises ``Exception`` if the property id is unknown.
    """

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.property_id = attrs['property_id']
        self.value = attrs['value']
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
        ''' Sanity checks '''
        if self.property_id not in self.lookup:
            raise Exception(f'Unknown property: {self.to_dict()}')

    @property
    def name(self):
        # The display name lives on the referenced property definition.
        return self.lookup[self.property_id].name

    def to_dict(self):
        """Serialize the property annotation to a plain dict.

        BUG FIX: the previous version also emitted a 'tag_id' key, but no
        tag_id attribute exists on this class, so every call to to_dict()
        (including the sanity-check error message above) raised
        AttributeError.
        """
        return {
            'property_id': self.property_id,
            'value': self.value
        }
||||||
|
class TagDefinition:
    """Describes one tag type: id, name, description and the property
    definitions that annotations of this tag may carry."""

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        # Property definitions keyed by their id.
        self.properties = {}
        for property_attrs in attrs.get('properties', []):
            self.add_property_definition(property_attrs)

    def add_property_definition(self, attrs):
        """Register a property definition, rejecting duplicate ids."""
        definition = PropertyDefinition(attrs)
        if definition.id in self.properties:
            raise Exception(
                f'Property id already in use: {definition.to_dict()}')
        self.properties[definition.id] = definition

    # @property
    # def required_properties(self):
    #     return {property.id: property for property in self.properties.values()
    #             if property.is_required}

    def to_dict(self):
        """Serialize this definition (and its properties) to plain dicts."""
        serialized_properties = {
            property_id: definition.to_dict()
            for property_id, definition in self.properties.items()
        }
        return {
            'id': self.id,
            'name': self.name,
            'description': self.description,
            'properties': serialized_properties
        }
|
class PropertyDefinition:
    """Describes one property type: id, name, description, flags and the
    set of permitted labels."""

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        self.flags = attrs.get('flags', [])
        self.labels = attrs.get('labels', [])

    # @property
    # def is_required(self):
    #     return 'required' in self.flags

    @property
    def has_multiple_values(self):
        # A property may carry several values iff flagged as 'multiple'.
        return 'multiple' in self.flags

    def to_dict(self):
        """Serialize this definition to a plain dict."""
        keys = ('id', 'name', 'description', 'flags', 'labels')
        return {key: getattr(self, key) for key in keys}
|
2
packages/stand-off-data-py/stand_off_data/__init__.py
Normal file
2
packages/stand-off-data-py/stand_off_data/__init__.py
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
# flake8: noqa
|
||||||
|
from .models import StandOffData
|
353
spacy-nlp
Executable file
353
spacy-nlp
Executable file
@ -0,0 +1,353 @@
|
|||||||
|
#!/usr/bin/env python3.7
# coding=utf-8

"""Create JSON stand off annotations for a plain text file with spaCy."""

from argparse import ArgumentParser
import chardet
import hashlib
import json
import os
import spacy
import textwrap
import uuid


# Map each language code to the name of an installed spaCy pipeline.
spacy_models = {
    spacy.info(pipeline)['lang']: pipeline
    for pipeline in spacy.info()['pipelines']
}


# Parse the given arguments
parser = ArgumentParser(
    description='Create annotations for a given plain txt file'
)
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
parser.add_argument(
    '-m', '--model',
    choices=spacy_models.keys(),
    help='The model to be used',
    required=True
)
parser.add_argument(
    '-c', '--check-encoding',
    action='store_true',
    help='Check encoding of the input file, UTF-8 is used instead'
)
parser.add_argument(
    '--id-prefix',
    default='',
    help='A prefix for all the ids within the stand off annotations'
)
args = parser.parse_args()


def generate_id(name):
    """Return a deterministic (UUIDv3), optionally prefixed id for *name*."""
    return f'{args.id_prefix}{uuid.uuid3(uuid.NAMESPACE_DNS, name)}'


# Detect the input encoding (optional) and compute the file's md5 checksum.
with open(args.input_file, "rb") as input_file:
    if args.check_encoding:
        encoding = chardet.detect(input_file.read())['encoding']
    else:
        encoding = 'utf-8'
    input_file.seek(0)
    text_md5 = hashlib.md5()
    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)

# Load the text contents from the input file
with open(args.input_file, encoding=encoding) as input_file:
    # spaCy NLP is limited to strings with a maximum of 1 million characters at
    # once. So we split it into suitable chunks. The wrap options preserve the
    # original text verbatim so annotation offsets stay valid.
    text_chunks = textwrap.wrap(
        input_file.read(),
        1000000,
        break_long_words=False,
        break_on_hyphens=False,
        drop_whitespace=False,
        expand_tabs=False,
        replace_whitespace=False
    )

model_name = spacy_models[args.model]
nlp = spacy.load(model_name)

meta = {
    'generator': {
        'name': 'nopaque spacy NLP',
        'version': '0.1.0',
        'arguments': {
            'check_encoding': args.check_encoding,
            'model': args.model
        }
    },
    'file': {
        'encoding': encoding,
        'md5': text_md5.hexdigest(),
        'name': os.path.basename(args.input_file)
    }
}

tags = []
token = {
    'id': generate_id('token'),
    'name': 'token',
    'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',  # noqa
    'properties': []
}
# TODO: Check if all languages support token.sentiment
token['properties'].append(
    {
        'id': generate_id('token.sentiment'),
        'name': 'sentiment',
        'description': 'A scalar value indicating the positivity or negativity of the token.'  # noqa
    }
)
if nlp.has_pipe('lemmatizer'):
    token['properties'].append(
        {
            'id': generate_id('token.lemma'),
            'name': 'lemma',
            'description': 'The base form of the word'
        }
    )
if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
    # BUG FIX: the previous version generated every label id from the
    # literal 'token.simple_pos=ADJ', so all 17 UPOS labels shared one id.
    # Each id is now derived from the label's own name.
    simple_pos_labels = [
        ('ADJ', 'adjective'),
        ('ADP', 'adposition'),
        ('ADV', 'adverb'),
        ('AUX', 'auxiliary verb'),
        ('CONJ', 'coordinating conjunction'),
        ('DET', 'determiner'),
        ('INTJ', 'interjection'),
        ('NOUN', 'noun'),
        ('NUM', 'numeral'),
        ('PART', 'particle'),
        ('PRON', 'pronoun'),
        ('PROPN', 'proper noun'),
        ('PUNCT', 'punctuation'),
        ('SCONJ', 'subordinating conjunction'),
        ('SYM', 'symbol'),
        ('VERB', 'verb'),
        ('X', 'other')
    ]
    token['properties'].append(
        {
            'id': generate_id('token.simple_pos'),
            'name': 'simple_pos',
            'description': 'The simple UPOS part-of-speech tag',
            'labels': [
                {
                    'id': generate_id(f'token.simple_pos={label_name}'),
                    'name': label_name,
                    'description': label_description
                } for label_name, label_description in simple_pos_labels
            ]
        }
    )
if nlp.has_pipe('tagger'):
    token['properties'].append(
        {
            'id': generate_id('token.pos'),
            'name': 'pos',
            'description': 'The detailed part-of-speech tag',
            'labels': [
                {
                    'id': generate_id(f'token.pos={label}'),
                    'name': label,
                    'description': spacy.explain(label) or ''
                } for label in spacy.info(model_name)['labels']['tagger']
            ]
        }
    )
if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
    tags.append(
        {
            'id': generate_id('ent'),
            'name': 'ent',
            'description': 'Encodes the start and end of a named entity',
            'properties': [
                {
                    'id': generate_id('ent.type'),
                    'name': 'type',
                    'description': 'Label indicating the type of the entity',
                    'labels': [
                        {
                            'id': generate_id(f'ent.type={label}'),
                            'name': label,
                            'description': spacy.explain(label) or ''
                        } for label in spacy.info(model_name)['labels']['ner']
                    ]
                }
            ]
        }
    )
if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):  # noqa
    # TODO: Check if all languages support sent.sentiment
    tags.append(
        {
            'id': generate_id('s'),
            'name': 's',
            'description': 'Encodes the start and end of a sentence',
            'properties': [
                {
                    'id': generate_id('s.sentiment'),
                    'name': 'sentiment',
                    'description': 'A scalar value indicating the positivity or negativity of the sentence.'  # noqa
                }
            ]
        }
    )
tags.append(token)

annotations = []

# Process one chunk at a time; chunk_offset rebases the per-chunk character
# offsets onto the whole text. Chunks are popped so processed text can be
# garbage collected.
chunk_offset = 0
while text_chunks:
    text_chunk = text_chunks.pop(0)
    doc = nlp(text_chunk)
    if hasattr(doc, 'ents'):
        for ent in doc.ents:
            annotation = {
                'start': ent.start_char + chunk_offset,
                'end': ent.end_char + chunk_offset,
                'tag_id': generate_id('ent'),
                'properties': [
                    {
                        'property_id': generate_id('ent.type'),
                        'value': ent.label_
                    }
                ]
            }
            annotations.append(annotation)
    if hasattr(doc, 'sents'):
        for sent in doc.sents:
            annotation = {
                'start': sent.start_char + chunk_offset,
                'end': sent.end_char + chunk_offset,
                'tag_id': generate_id('s'),
                'properties': []
            }
            if hasattr(sent, 'sentiment'):
                annotation['properties'].append(
                    {
                        'property_id': generate_id('s.sentiment'),
                        'value': sent.sentiment
                    }
                )
            annotations.append(annotation)
    for token in doc:
        annotation = {
            'start': token.idx + chunk_offset,
            'end': token.idx + len(token.text) + chunk_offset,
            'tag_id': generate_id('token'),
            'properties': []
        }
        if hasattr(token, 'lemma_'):
            annotation['properties'].append(
                {
                    'property_id': generate_id('token.lemma'),
                    'value': token.lemma_
                }
            )
        if hasattr(token, 'pos_'):
            annotation['properties'].append(
                {
                    'property_id': generate_id('token.simple_pos'),
                    'value': token.pos_
                }
            )
        if hasattr(token, 'sentiment'):
            annotation['properties'].append(
                {
                    'property_id': generate_id('token.sentiment'),
                    'value': token.sentiment
                }
            )
        if hasattr(token, 'tag_'):
            annotation['properties'].append(
                {
                    'property_id': generate_id('token.pos'),
                    'value': token.tag_
                }
            )
        annotations.append(annotation)
    chunk_offset += len(text_chunk)

with open(args.output_file, 'w') as output_file:
    json.dump(
        {'meta': meta, 'tags': tags, 'annotations': annotations},
        output_file,
        indent=4
    )
|
71
spacy_nlp
71
spacy_nlp
@ -1,71 +0,0 @@
|
|||||||
#!/usr/bin/env python3.5
# coding=utf-8

"""Tag a text file with spaCy and save it as a verticalized text (.vrt) file."""

import argparse
import os
import spacy
import textwrap


parser = argparse.ArgumentParser(
    description='Tag a text file with spaCy and save it as a verticalized text file.'
)
parser.add_argument(
    'i',
    metavar='txt-sourcefile',
)
parser.add_argument(
    '-l',
    choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
    dest='lang',
    required=True
)
parser.add_argument(
    'o',
    metavar='vrt-destfile',
)
args = parser.parse_args()

SPACY_MODELS = {
    'de': 'de_core_news_sm', 'el': 'el_core_news_sm', 'en': 'en_core_web_sm',
    'es': 'es_core_news_sm', 'fr': 'fr_core_news_sm', 'it': 'it_core_news_sm',
    'nl': 'nl_core_news_sm', 'pt': 'pt_core_news_sm'
}

# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])

# Read text from the input file and if neccessary split it into parts with a
# length of less than 1 million characters (spaCy's per-call limit).
with open(args.i) as input_file:
    text = input_file.read()
texts = textwrap.wrap(text, 1000000, break_long_words=False)
text = None

# BUG FIX: the output file was previously opened without a context manager,
# so it leaked if tagging raised before the final close(). The context
# manager guarantees it is closed on every path.
with open(args.o, 'w+') as output_file:
    output_file.write(
        '<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % (
            os.path.basename(args.i).rsplit(".", 1)[0]
        )
    )
    for text in texts:
        # Run spacy nlp over the text (partial string if above 1 million chars)
        doc = nlp(text)
        for sent in doc.sents:
            output_file.write('<s>\n')
            for token in sent:
                # Skip whitespace tokens like "\n" or "\t"
                if token.text.isspace():
                    continue
                # Write all information in .vrt style to the output file
                # text, lemma, simple_pos, pos, ner
                output_file.write(
                    token.text + '\t' + token.lemma_ + '\t'
                    + token.pos_ + '\t' + token.tag_ + '\t'
                    + (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n'
                )
            output_file.write('</s>\n')
    output_file.write('</text>\n</corpus>')
45
vrt-creator
Executable file
45
vrt-creator
Executable file
@ -0,0 +1,45 @@
|
|||||||
|
#!/usr/bin/env python3.7
# coding=utf-8

"""Convert plain text plus JSON stand off data into a CWB .vrt file."""

from argparse import ArgumentParser
from stand_off_data import StandOffData
import hashlib
import json


parser = ArgumentParser(
    description='Convert plain text and JSON stand off to a CWB vrt file'
)
parser.add_argument(
    '-s', '--stand-off-data-file',
    help='JSON stand off data input file',
    required=True
)
parser.add_argument(
    '-t', '--text-file',
    help='Plain text input file',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()


# Load the stand off annotations.
with open(args.stand_off_data_file) as stand_off_data_file:
    stand_off_data = StandOffData(json.load(stand_off_data_file))

# Checksum the text file and verify it matches the file the stand off
# annotations were generated from.
with open(args.text_file, 'rb') as text_file:
    text_md5 = hashlib.md5()
    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)
    if text_md5.hexdigest() != stand_off_data.meta['file']['md5']:
        raise Exception('md5 not equal')

# Re-read the text with the encoding recorded in the stand off metadata.
with open(args.text_file, encoding=stand_off_data.meta['file']['encoding']) as text_file:  # noqa
    text = text_file.read()

# Render and write the verticalized text.
with open(args.output_file, 'w') as vrt_file:
    vrt_file.write(stand_off_data.to_vrt(text))
|
51
wrapper/nlp
51
wrapper/nlp
@ -1,39 +1,38 @@
|
|||||||
#!/usr/bin/env python3
# coding=utf-8

"""Wrapper that runs the nlp container, mapping host directories for
input, output and logs into the container and forwarding all remaining
arguments. Exits with the container's return code."""

from argparse import ArgumentParser
import os
import subprocess
import sys

CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0'
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_LOG_DIR = '/logs'
UID = str(os.getuid())
GID = str(os.getgid())

parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args()

cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
if args.input_dir is not None:
    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-i', CONTAINER_INPUT_DIR]
if args.output_dir is not None:
    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.log_dir is not None:
    # BUG FIX: this mapping was a plain string literal missing its 'f'
    # prefix, so docker received the literal text
    # '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}' as the volume.
    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args

sys.exit(subprocess.run(cmd).returncode)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user