Bump spaCy version, bugfixes, codestyle

2025-10-11 02:42:08 +00:00 · 2022-01-27 16:50:22 +01:00
parent 29ccfac4f6
commit a2e8e72e54
9 changed files with 699 additions and 509 deletions
--- a/28
+++ b/28
@@ -9,7 +9,14 @@ ENV LANG=C.UTF-8
 RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
-     wget
+      procps \
      python3.7 \
      python3-pip \
      wget \
 && python3 -m pip install \
      chardet \
      setuptools \
      wheel
 # Install the NLP pipeline and its dependencies #
 ## Install pyFlow ##
@@ -21,12 +28,12 @@ RUN wget --no-check-certificate --quiet \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
 && python2.7 setup.py build install \
- && cd .. \
+ && cd - > /dev/null \
 && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
 ## Install spaCy ##
-ENV SPACY_VERSION=3.0.5
+ENV SPACY_VERSION=3.2.1
 RUN apt-get install --no-install-recommends --yes \
      python3.7 \
      python3-pip \
@@ -38,23 +45,14 @@ RUN apt-get install --no-install-recommends --yes \
 && pip3 install "spacy==${SPACY_VERSION}"
-# Only models that include the following components are compatibel:
+ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,pl_core_news_md,zh_core_web_md"
-# lemmatizer, ner, parser, senter, tagger,
+ENV SPACY_MODELS_VERSION=3.2.0
 ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,nl_core_news_md,pl_core_news_md,zh_core_web_md"
 ENV SPACY_MODELS_VERSION=3.0.0
 RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done
 ## Further dependencies ##
 RUN apt-get install --no-install-recommends --yes \
      procps \
      zip
 COPY packages .
 RUN cd stand-off-data-py \
- && python3 setup.py build \
+ && python3 -m pip install . \
 && python3 setup.py install \
 && cd -
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -1,48 +1,41 @@
 # NLP - Natural Language Processing
-This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
+This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service, but you can also use it standalone; a convenient wrapper script is provided for that purpose. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
 ## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+
 - Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
  - Software from Debian Buster's free repositories
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- spaCy (3.0.5): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
+- spaCy (3.2.1): https://github.com/explosion/spaCy/releases/tag/v3.2.1
- spaCy medium sized models (3.0.0):
+- spaCy medium sized models (3.2.0):
-  - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.2.0
-  - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.2.0
-## Use this image
+## Installation
-1. Create input and output directories for the pipeline.
+1. Install Docker and Python 3.
-``` bash
+2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git`
-mkdir -p /<my_data_location>/input /<my_data_location>/output
+3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0 nlp`
-```
+4. Add the wrapper script (`wrapper/nlp` relative to this README file) to your `${PATH}`.
 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,output}`.
 2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
 ## Use the Pipeline
 1. Place your plain text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
 2. Clear your `/<my_data_location>/output` directory.
 3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details.
-```
+```bash
 # Option one: Use the wrapper script
 ## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executable and add it to your ${PATH}
 cd /<my_data_location>
-nlp -i input -l <language_code> -o output <optional_pipeline_arguments>
+nlp \
-
+  --input-dir input \
-# Option two: Classic Docker style
+  --output-dir output \
-docker run \
+  -m <model_code> <optional_pipeline_arguments>
    --rm \
    -it \
    -u $(id -u $USER):$(id -g $USER) \
    -v /<my_data_location>/input:/input \
    -v /<my_data_location>/output:/output \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
        -i /input \
        -l <language_code> \
        -o /output \
        <optional_pipeline_arguments>
 ```
 4. Check your results in the `/<my_data_location>/output` directory.
--- a/339
+++ b/339
@@ -1,73 +1,141 @@
 #!/usr/bin/env python2.7
 # coding=utf-8
-"""A NLP pipeline for text file processing."""
+''' A NLP pipeline for text file processing. '''
-
+__version__ = '0.1.0'
 __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
             'Stephan Porada <porada@posteo.de>'
 __version__ = '1.0.0'
 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
-import multiprocessing
+import json
 import os
 import sys
-SPACY_MODELS = {'de': 'de_core_news_md',
+SPACY_MODELS = {
    'de': 'de_core_news_md',
    'en': 'en_core_web_md',
    'it': 'it_core_news_md',
    'nl': 'nl_core_news_md',
    'pl': 'pl_core_news_md',
-                'zh': 'zh_core_web_md'}
+    'zh': 'zh_core_web_md'
 }
-class NLPPipelineJob:
+class PipelineJob:
-    """An NLP pipeline job class
+    '''
    NLP pipeline job class.
-    Each input file of the pipeline is represented as an NLP pipeline job,
+    Each plain text input file of the pipeline is represented as an NLP
-    which holds all necessary information for the pipeline to process it.
+    pipeline job, which holds all necessary information for the pipeline to
    process it.
    Arguments:
    file -- Path to the file
-    output_dir -- Path to a directory, where job results a stored
+    output_dir -- Path to a directory, where job results are stored
-    """
+    '''
    def __init__(self, file, output_dir):
        self.file = file
-        self.name = os.path.basename(file).rsplit('.', 1)[0]
+        self.name = os.path.basename(file)[:-4]
        self.output_dir = output_dir
        catma_stand_off_data_file = file.rsplit('.', 1)[0] + '.catma-stand-off.json'  # noqa
        if os.path.exists(catma_stand_off_data_file):
            self.catma_stand_off_data_file = catma_stand_off_data_file
        else:
            self.catma_stand_off_data_file = None
-class NLPPipeline(WorkflowRunner):
+class NLPWorkflow(WorkflowRunner):
-    def __init__(self, input_dir, output_dir, check_encoding, lang, zip):
+    def __init__(self, job, model, check_encoding=False, id_prefix=''):
        self.job = job
        self.model = model
        self.check_encoding = check_encoding
        self.id_prefix = id_prefix
    def workflow(self):
        '''
        ' ##################################################
        ' # spacy                                          #
        ' ##################################################
        '''
        n_cores = 1
        mem_mb = min(1024, self.getMemMb())
        cmd = 'spacy-nlp'
        cmd += ' --input-file "{}"'.format(self.job.file)
        cmd += ' --output-file "{}"'.format(
            os.path.join(self.job.output_dir, '{}.json'.format(self.job.name))
        )
        cmd += ' -m "{}"'.format(self.model)
        if self.check_encoding:
            cmd += ' --check-encoding'
        cmd += ' --id-prefix "{}"'.format(self.id_prefix)
        self.addTask(
            'spacy',
            command=cmd,
            memMb=mem_mb,
            nCores=n_cores
        )
 class CreateVrtWorkflow(WorkflowRunner):
    def __init__(self, job):
        self.job = job
    def workflow(self):
        '''
        ' ##################################################
        ' # vrt-creator                                    #
        ' ##################################################
        '''
        n_cores = 1
        mem_mb = min(256, self.getMemMb())
        cmd = 'vrt-creator'
        cmd += ' --stand-off-data-file "{}"'.format(
            os.path.join(self.job.output_dir, '{}.json'.format(self.job.name))
        )
        cmd += ' --text-file "{}"'.format(self.job.file)
        cmd += ' --output-file "{}"'.format(
            os.path.join(self.job.output_dir, '{}.vrt'.format(self.job.name))
        )
        self.addTask(
            'vrt_creator',
            command=cmd,
            memMb=mem_mb,
            nCores=n_cores
        )
 class MainWorkflow(WorkflowRunner):
    def __init__(
        self,
        input_dir,
        model,
        output_dir,
        check_encoding=False,
        id_prefix=''
    ):
        self.input_dir = input_dir
        self.model = model
        self.output_dir = output_dir
        self.check_encoding = check_encoding
-        self.lang = lang
+        self.id_prefix = id_prefix
-        self.zip = zip
+        self.jobs = []
-        self.jobs = collect_jobs(self.input_dir, self.output_dir)
+
    def collect_jobs(self):
        self.jobs = []
        for file in os.listdir(self.input_dir):
            if os.path.isdir(os.path.join(self.input_dir, file)):
                continue
            if not file.lower().endswith('.txt'):
                continue
            job = PipelineJob(
                os.path.join(self.input_dir, file),
                os.path.join(self.output_dir, file)
            )
            self.jobs.append(job)
    def workflow(self):
        if not self.jobs:
            return
-        '''
+        # Create output and temporary directories
-        ' ##################################################
+        for job in self.jobs:
-        ' # setup output directory                         #
+            os.mkdir(job.output_dir)
        ' ##################################################
        '''
        setup_output_directory_tasks = []
        for i, job in enumerate(self.jobs):
            cmd = 'mkdir -p "{}"'.format(job.output_dir)
            lbl = 'setup_output_directory_-_{}'.format(i)
            task = self.addTask(command=cmd, label=lbl)
            setup_output_directory_tasks.append(task)
        '''
        ' ##################################################
@@ -75,106 +143,116 @@ class NLPPipeline(WorkflowRunner):
        ' ##################################################
        '''
        nlp_tasks = []
        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
        mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
        for i, job in enumerate(self.jobs):
-            output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
+            task = self.addWorkflowTask(
-            cmd = 'spacy-nlp'
+                'nlp_-_{}'.format(i),
-            cmd += ' -l "{}"'.format(self.lang)
+                NLPWorkflow(
-            cmd += ' --check-encoding' if self.check_encoding else ''
+                    job,
-            cmd += ' "{}"'.format(job.file)
+                    self.model,
-            cmd += ' "{}"'.format(output_file)
+                    check_encoding=self.check_encoding,
-            deps = 'setup_output_directory_-_{}'.format(i)
+                    id_prefix=self.id_prefix
-            lbl = 'nlp_-_{}'.format(i)
+                )
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+            )
                                memMb=mem_mb, nCores=n_cores)
            nlp_tasks.append(task)
        '''
        ' ##################################################
-        ' # vrt creation                                   #
+        ' # create vrt                                     #
        ' ##################################################
        '''
-        vrt_creation_tasks = []
+        create_vrt_tasks = []
        for i, job in enumerate(self.jobs):
-            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
+            task = self.addWorkflowTask(
-            nopaque_stand_off_data_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
+                'create_vrt_-_{}'.format(i),
-            cmd = 'vrt-creator'
+                CreateVrtWorkflow(job),
-            cmd += ' "{}"'.format(job.file)
+                dependencies='nlp_-_{}'.format(i)
-            cmd += ' "{}"'.format(nopaque_stand_off_data_file)
+            )
-            if job.catma_stand_off_data_file is not None:
+            create_vrt_tasks.append(task)
                cmd += ' --catma-stand-off-data "{}"'.format(job.catma_stand_off_data_file)  # noqa
            cmd += ' "{}"'.format(output_file)
            deps = 'nlp_-_{}'.format(i)
            lbl = 'vrt_creation_-_{}'.format(i)
            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
            vrt_creation_tasks.append(task)
-        '''
+        self.waitForTasks()
-        ' ##################################################
+        outputs = []
-        ' # zip creation                                   #
+        for job in self.jobs:
-        ' ##################################################
+            # Track output files
-        '''
+            relative_output_dir = os.path.relpath(
-        zip_creation_tasks = []
+                job.output_dir,
-        if self.zip is not None:
+                start=self.output_dir
-            cmd = 'cd "{}"'.format(self.output_dir)
+            )
-            cmd += ' && '
+            outputs.append(
-            cmd += 'zip'
+                {
-            cmd += ' -r'
+                    'description': 'JSON stand off data',
-            cmd += ' "{}.zip" .'.format(self.zip)
+                    'file': os.path.join(
-            cmd += ' -x "pyflow.data*"'
+                        relative_output_dir,
-            cmd += ' -i "*.vrt" "*.json"'
+                        '{}.json'.format(job.name)
-            cmd += ' && '
+                    ),
-            cmd += 'cd -'
+                    'mimetype': 'application/json'
-            deps = vrt_creation_tasks
+                }
-            lbl = 'zip_creation'
+            )
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            outputs.append(
-            zip_creation_tasks.append(task)
+                {
-
+                    'description': 'CWB vrt file',
-
+                    'file': os.path.join(
-def collect_jobs(input_dir, output_dir):
+                        relative_output_dir,
-    jobs = []
+                        '{}.vrt'.format(job.name)
-    for file in os.listdir(input_dir):
+                    ),
-        if os.path.isdir(os.path.join(input_dir, file)):
+                    'mimetype': 'application/vrt+xml'
-            continue
+                }
-        if file.lower().endswith('.txt'):
+            )
-            job = NLPPipelineJob(os.path.join(input_dir, file),
+        with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
-                                 os.path.join(output_dir, file))
+            json.dump(outputs, f, indent=4)
            jobs.append(job)
    return jobs
 def parse_args():
-    parser = ArgumentParser(description='NLP pipeline for TXT file processing',
+    parser = ArgumentParser(
-                            prog='NLP pipeline')
+        description='NLP pipeline for plain text file processing'
-    parser.add_argument('-i', '--input-dir',
+    )
    parser.add_argument(
        '-i', '--input-dir',
        help='Input directory',
-                        required=True)
+        required=True
-    parser.add_argument('-o', '--output-dir',
+    )
    parser.add_argument(
        '-o', '--output-dir',
        help='Output directory',
-                        required=True)
+        required=True
-    parser.add_argument('-l', '--language',
+    )
    parser.add_argument(
        '-m', '--model',
        choices=SPACY_MODELS.keys(),
-                        help='Language of the input (2-character ISO 639-1 language codes)',  # noqa
+        help='The model to be used',
-                        required=True)
+        required=True
-    parser.add_argument('--check-encoding',
+    )
    parser.add_argument(
        '--check-encoding',
        action='store_true',
-                        help='Check encoding of the input file, UTF-8 is used instead')  # noqa
+        help='Check encoding of the input file, UTF-8 is used instead'
-    parser.add_argument('--log-dir',
+    )
-                        help='Logging directory')
+    parser.add_argument(
-    parser.add_argument('--mem-mb',
+        '--id-prefix',
-                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
+        default='',
-                        type=int)
+        help='A prefix for all the ids within the stand off annotations'
-    parser.add_argument('--n-cores',
+    )
-                        default=min(4, multiprocessing.cpu_count()),
+    parser.add_argument(
-                        help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa
+        '--log-dir',
-                        type=int)
+        help='Logging directory (Default: --output-dir)'
-    parser.add_argument('--zip',
+    )
-                        help='Create one zip file per filetype')
+    parser.add_argument(
-    parser.add_argument('-v', '--version',
+        '--mem-mb',
        help='Amount of system memory to be used '
             '(Default: min(--n-cores * 1024, available system memory))',
        type=int
    )
    parser.add_argument(
        '--n-cores',
        default=1,
        help='Number of CPU threads to be used',
        type=int
    )
    parser.add_argument(
        '-v', '--version',
        action='version',
        help='Returns the current version of the NLP pipeline',
-                        version='%(prog)s {}'.format(__version__))
+        version='%(prog)s {}'.format(__version__)
    )
    args = parser.parse_args()
    # Set some tricky default values and check for insufficient input
@@ -184,20 +262,27 @@ def parse_args():
        raise Exception('--n-cores must be greater or equal 1')
    if args.mem_mb is None:
        max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
-        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
+        args.mem_mb = min(args.n_cores * 1024, max_mem_mb)
-    if args.mem_mb < 2048:
+    if args.mem_mb < 1024:
-        raise Exception('--mem-mb must be greater or equal 2048')
+        raise Exception('--mem-mb must be greater or equal 1024')
    if args.zip is not None and args.zip.lower().endswith('.zip'):
        # Remove .zip file extension if provided
        args.zip = args.zip[:-4]
        args.zip = args.zip if args.zip else 'output'
    return args
 def main():
    args = parse_args()
-    nlp_pipeline = NLPPipeline(args.input_dir, args.output_dir, args.check_encoding, args.language, args.zip)  # noqa
+    main_workflow = MainWorkflow(
-    retval = nlp_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
+        args.input_dir,
        args.model,
        args.output_dir,
        check_encoding=args.check_encoding,
        id_prefix=args.id_prefix
    )
    main_workflow.collect_jobs()
    retval = main_workflow.run(
        dataDirRoot=args.log_dir,
        memMb=args.mem_mb,
        nCores=args.n_cores
    )
    sys.exit(retval)
--- a/packages/stand-off-data-py/setup.py
+++ b/packages/stand-off-data-py/setup.py
@@ -1,14 +1,14 @@
 import setuptools
 setuptools.setup(
-    name='stand-off-data',
+    name='Stand off data',
    author='Patrick Jentsch',
    author_email='p.jentsch@uni-bielefeld.de',
    description='A python library to handle stand off data.',
    py_modules=['stand_off_data'],
    classifiers=[
        'Programming Language :: Python :: 3',
        'Operating System :: OS Independent',
    ],
    packages=setuptools.find_packages(),
    python_requires='>=3.5'
 )
--- a/packages/stand-off-data-py/stand_off_data/models.py
+++ b/packages/stand-off-data-py/stand_off_data/models.py
@@ -7,13 +7,15 @@ class StandOffData:
        self.lookup = {}
        for x in attrs.get('tags', []):
            self.add_tag_definition(x)
-        self.annotations = [TagAnnotation(x, self.lookup)
+        self.annotations = [
-                            for x in attrs.get('annotations', [])]
+            TagAnnotation(x, self.lookup)
            for x in attrs.get('annotations', [])
        ]
    def add_tag_definition(self, attrs):
        tag_definition = TagDefinition(attrs)
        if tag_definition.id in self.lookup:
-            raise Exception('Tag id already in use: {}'.format(self.to_dict()))
+            raise Exception(f'Tag id already in use: {self.to_dict()}')
        self.lookup[tag_definition.id] = tag_definition
    def to_dict(self):
@@ -42,7 +44,9 @@ class StandOffData:
            if ((p_attr.start >= next_p_attr.start) and (p_attr.start < next_p_attr.end)  # noqa
                or (p_attr.end > next_p_attr.start) and (p_attr.end <= next_p_attr.end)):  # noqa
                raise Exception(
-                    'Positional attribute overlaps another: {}<->{}'.format(p_attr.to_dict(), next_p_attr.to_dict()))
+                    'Positional attribute overlaps another: '
                    f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}'
                )
        # Check for s_attr<->p_attr overlap
        for i, s_attr in enumerate(s_attrs):
            for p_attr in p_attrs:
@@ -56,8 +60,11 @@ class StandOffData:
                    s_attrs[i].end = p_attr.end
                # Check if s_attr starts/ends before/after p_attr
                if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start:
-                    # No further Checking needed (just because p_attrs are sorted)
+                    # No further checking needed (because p_attrs are sorted)
                    break
        p_attr_buffer = {}
        for i, p_attr in enumerate(p_attrs):
            p_attr_buffer[p_attr.start] = i
        s_attr_start_buffer = {}
        s_attr_end_buffer = {}
        for i, s_attr in enumerate(s_attrs):
@@ -66,34 +73,56 @@ class StandOffData:
            else:
                s_attr_start_buffer[s_attr.start] = [i]
            if s_attr.end in s_attr_end_buffer:
-                s_attr_end_buffer[s_attr.end].append(i)
+                s_attr_end_buffer[s_attr.end].insert(0, i)
            else:
                s_attr_end_buffer[s_attr.end] = [i]
        vrt = ''
        vrt += '<text>\n'
-        for p_attr in p_attrs:
+        current_position = 0
-            # s_attr_ends
+        text_len = len(text)
-            for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}:  # noqa
+        # As long as we have something in our buffers we process it
-                s_attr_indexes = s_attr_end_buffer.pop(k)
+        while current_position <= text_len:
            # s_attr endings
            # for k in {k: v for k, v in s_attr_end_buffer.items() if k <= current_position}:  # noqa
            if current_position in s_attr_end_buffer:
                # s_attr_indexes = s_attr_end_buffer.pop(k)
                s_attr_indexes = s_attr_end_buffer.pop(current_position)
                for s_attr_index in s_attr_indexes:
                    s_attr = s_attrs[s_attr_index]
-                    vrt += '</{}>\n'.format(escape(s_attr.name))
+                    vrt += f'</{escape(s_attr.name)}>\n'
-            # s_attr_starts
+            # s_attrs starts
-            for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}:  # noqa
+            # for k in {k: v for k, v in s_attr_start_buffer.items() if k <= current_position}:  # noqa
-                s_attr_indexes = s_attr_start_buffer.pop(k)
+            if current_position in s_attr_start_buffer:
                # s_attr_indexes = s_attr_start_buffer.pop(k)
                s_attr_indexes = s_attr_start_buffer.pop(current_position)
                for s_attr_index in s_attr_indexes:
                    s_attr = s_attrs[s_attr_index]
-                    foo = ''
+                    vrt += f'<{escape(s_attr.name)}'
                    for property in s_attr.properties:
-                        foo += ' {}="{}"'.format(escape(property.name),
+                        vrt += f' {escape(property.name)}="{escape(str(property.value))}"'  # noqa
-                                                 escape(property.value))
+                    vrt += '>\n'
-                    vrt += '<{}{}>\n'.format(escape(s_attr.name), foo)
+            # p_attrs
-            foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None}  # noqa
+            if current_position not in p_attr_buffer:
                current_position += 1
                continue
            p_attr_index = p_attr_buffer.pop(current_position)
            p_attr = p_attrs[p_attr_index]
            if text[p_attr.start:p_attr.end].isspace():
                current_position = p_attr.end
                continue
            _p_attr = {
                'lemma': 'None',
                'pos': 'None',
                'simple_pos': 'None',
                'word': 'None'
            }
            for property in p_attr.properties:
-                foo[property.name] = escape(property.value)
+                if property.name not in _p_attr:
-            foo['word'] = escape(text[p_attr.start:p_attr.end])
+                    continue
-            vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(
+                _p_attr[property.name] = escape(str(property.value))
-                **foo)
+            _p_attr['word'] = escape(text[p_attr.start:p_attr.end])
            vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**_p_attr)
            current_position = p_attr.end
        vrt += '</text>\n'
        return vrt
@@ -110,15 +139,15 @@ class TagAnnotation:
        ]
        ''' Sanity checks '''
        if self.tag_id not in self.lookup:
-            raise Exception('Unknown tag: {}'.format(self.to_dict()))
+            raise Exception(f'Unknown tag: {self.to_dict()}')
        if self.end < self.start:
-            raise Exception('Annotation end less then start: '
+            raise Exception(f'Annotation end less then start: {self.to_dict()}')  # noqa
-                            '{}'.format(self.to_dict()))
+        # property_ids = [x.property_id for x in self.properties]
-        property_ids = [x.property_id for x in self.properties]
+        # for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items():  # noqa
-        for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items():  # noqa
+        #     if required_property_id not in property_ids:
-            if required_property_id not in property_ids:
+        #         raise Exception(
-                raise Exception('Missing required property: '
+        #             f'Missing required property: {required_property.to_dict()}'
-                                '{}'.format(required_property.to_dict()))
+        #         )
    @property
    def name(self):
@@ -134,33 +163,45 @@ class TagAnnotation:
    def __lt__(self, other):
        if self.start == other.start:
-            return self.name == 'token' and other.name != 'token'
+            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return True
            else:
                return self.end > other.end
        else:
            return self.start < other.start
    def __le__(self, other):
        if self.start == other.start:
-            return self.name == 'token' or other.name != 'token'
+            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return True
            else:
-            return self.start < other.start
+                return self.end >= other.end
        else:
            return self.start <= other.start
    def __eq__(self, other):
-        return self.start == other.start and self.name == other.name
+        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return False
            else:
                return self.end == other.end
        else:
            return False
    def __ne__(self, other):
-        return self.start != other.start and self.name != other.name
+        return not self == other
    def __gt__(self, other):
-        if self.start == other.start:
+        return not self <= other
            return self.name != 'token' and other.name == 'token'
        else:
            return self.start > other.start
    def __ge__(self, other):
-        if self.start == other.start:
+        return not self < other
            return self.name != 'token' or other.name == 'token'
        else:
            return self.start > other.start
 class PropertyAnnotation:
@@ -171,7 +212,7 @@ class PropertyAnnotation:
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
        ''' Sanity checks '''
        if self.property_id not in self.lookup:
-            raise Exception('Unknown property: {}'.format(self.to_dict()))
+            raise Exception(f'Unknown property: {self.to_dict()}')
    @property
    def name(self):
@@ -197,14 +238,14 @@ class TagDefinition:
    def add_property_definition(self, attrs):
        property_definition = PropertyDefinition(attrs)
        if property_definition.id in self.properties:
-            raise Exception('Property id already in use: '
+            raise Exception(
-                            '{}'.format(property_definition.to_dict()))
+                f'Property id already in use: {property_definition.to_dict()}')
        self.properties[property_definition.id] = property_definition
-    @property
+    # @property
-    def required_properties(self):
+    # def required_properties(self):
-        return {property.id: property for property in self.properties.values()
+    #     return {property.id: property for property in self.properties.values()
-                if property.is_required}
+    #             if property.is_required}
    def to_dict(self):
        return {
@@ -223,9 +264,9 @@ class PropertyDefinition:
        self.flags = attrs.get('flags', [])
        self.labels = attrs.get('labels', [])
-    @property
+    # @property
-    def is_required(self):
+    # def is_required(self):
-        return 'required' in self.flags
+    #     return 'required' in self.flags
    @property
    def has_multiple_values(self):
--- a/297
+++ b/297
@@ -11,46 +11,64 @@ import textwrap
 import uuid
-def UUIDnopaque(name):
+spacy_models = {
-    return 'nopaque_{}'.format(
+    spacy.info(pipeline)['lang']: pipeline
-        uuid.uuid3(uuid.NAMESPACE_DNS,
+    for pipeline in spacy.info()['pipelines']
-                   '{}@nopaque.sfb1288.uni-bielefeld.de'.format(name))
+}
    )
 spacy_models = {spacy.info(pipeline)['lang']: pipeline
                for pipeline in spacy.info()['pipelines']}
 # Parse the given arguments
-parser = ArgumentParser(description='Create annotations for a given txt file')
+parser = ArgumentParser(
-parser.add_argument('input', help='Path to txt input file')
+    description='Create annotations for a given plain txt file'
-parser.add_argument('output', help='Path to JSON output file')
+)
-parser.add_argument('-l', '--language',
+parser.add_argument(
    '-i', '--input-file',
    help='Input file'
 )
 parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
 )
 parser.add_argument(
    '-m', '--model',
    choices=spacy_models.keys(),
-                    help='Language of the input (2-character ISO 639-1 language codes)',  # noqa
+    help='The model to be used',
-                    required=True)
+    required=True
-parser.add_argument('-c', '--check-encoding',
+)
 parser.add_argument(
    '-c', '--check-encoding',
    action='store_true',
-                    help='Check encoding of the input file, UTF-8 is used instead')  # noqa
+    help='Check encoding of the input file, UTF-8 is used instead'
 )
 parser.add_argument(
    '--id-prefix',
    default='',
    help='A prefix for all the ids within the stand off annotations'
 )
 args = parser.parse_args()
-with open(args.input, "rb") as text_file:
+
 def generate_id(name):
    return f'{args.id_prefix}{uuid.uuid3(uuid.NAMESPACE_DNS, name)}'
 with open(args.input_file, "rb") as input_file:
    if args.check_encoding:
-        encoding = chardet.detect(text_file.read())['encoding']
+        encoding = chardet.detect(input_file.read())['encoding']
    else:
        encoding = 'utf-8'
-    text_file.seek(0)
+    input_file.seek(0)
    text_md5 = hashlib.md5()
-    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
+    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)
 # Load the text contents from the input file
-with open(args.input, encoding=encoding) as text_file:
+with open(args.input_file, encoding=encoding) as input_file:
    # spaCy NLP is limited to strings with a maximum of 1 million characters at
    # once. So we split it into suitable chunks.
    text_chunks = textwrap.wrap(
-        text_file.read(),
+        input_file.read(),
        1000000,
        break_long_words=False,
        break_on_hyphens=False,
@@ -59,186 +77,197 @@ with open(args.input, encoding=encoding) as text_file:
        replace_whitespace=False
    )
-model = spacy_models[args.language]
+model_name = spacy_models[args.model]
-nlp = spacy.load(model)
+nlp = spacy.load(model_name)
 meta = {
    'generator': {
-        'name': 'nopaque NLP service',
+        'name': 'nopaque spacy NLP',
-        'version': '1.0.0',
+        'version': '0.1.0',
        'arguments': {
            'check_encoding': args.check_encoding,
-            'language': args.language
+            'model': args.model
        }
    },
    'file': {
        'encoding': encoding,
        'md5': text_md5.hexdigest(),
-        'name': os.path.basename(args.input)
+        'name': os.path.basename(args.input_file)
    }
 }
-tags = [
+tags = []
-    {
+token = {
-        'id': UUIDnopaque('token'),
+    'id': generate_id('token'),
    'name': 'token',
-        'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',
+    'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',  # noqa
-        'properties': [
+    'properties': []
 }
 # TODO: Check if all languages support token.sentiment
 token['properties'].append(
    {
-                'id': UUIDnopaque('token.lemma'),
+        'id': generate_id('token.sentiment'),
        'name': 'sentiment',
        'description': 'A scalar value indicating the positivity or negativity of the token.'  # noqa
    }
 )
 if nlp.has_pipe('lemmatizer'):
    token['properties'].append(
        {
            'id': generate_id('token.lemma'),
            'name': 'lemma',
-                'description': 'The base form of the word',
+            'description': 'The base form of the word'
-                'flags': ['required'],
+        }
-                'labels': []
+    )
-            },
+if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
    token['properties'].append(
        {
-                'id': UUIDnopaque('token.pos'),
+            'id': generate_id('token.simple_pos'),
                'name': 'pos',
                'description': 'The detailed part-of-speech tag',
                'flags': ['required'],
                'labels': [
                    {
                        'id': UUIDnopaque('token.pos={}'.format(label)),
                        'name': label,
                        'description': spacy.explain(label) or ''
                    } for label in spacy.info(model)['labels']['tagger']
                ]
            },
            {
                'id': UUIDnopaque('token.simple_pos'),
            'name': 'simple_pos',
            'description': 'The simple UPOS part-of-speech tag',
                'flags': ['required'],
            'labels': [
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=ADJ'),
                    'name': 'ADJ',
                    'description': 'adjective'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=ADP'),
                    'name': 'ADP',
                    'description': 'adposition'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=ADV'),
                    'name': 'ADV',
                    'description': 'adverb'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=AUX'),
                    'name': 'AUX',
                    'description': 'auxiliary verb'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=CONJ'),
                    'name': 'CONJ',
                    'description': 'coordinating conjunction'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=DET'),
                    'name': 'DET',
                    'description': 'determiner'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=INTJ'),
                    'name': 'INTJ',
                    'description': 'interjection'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=NOUN'),
                    'name': 'NOUN',
                    'description': 'noun'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=NUM'),
                    'name': 'NUM',
                    'description': 'numeral'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=PART'),
                    'name': 'PART',
                    'description': 'particle'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=PRON'),
                    'name': 'PRON',
                    'description': 'pronoun'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=PROPN'),
                    'name': 'PROPN',
                    'description': 'proper noun'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=PUNCT'),
                    'name': 'PUNCT',
                    'description': 'punctuation'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=SCONJ'),
                    'name': 'SCONJ',
                    'description': 'subordinating conjunction'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=SYM'),
                    'name': 'SYM',
                    'description': 'symbol'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=VERB'),
                    'name': 'VERB',
                    'description': 'verb'
                },
                {
-                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                    'id': generate_id('token.simple_pos=X'),
                    'name': 'X',
                    'description': 'other'
                }
            ]
-            },
+        }
    )
 if nlp.has_pipe('tagger'):
    token['properties'].append(
        {
-                'id': UUIDnopaque('token.ner'),
+            'id': generate_id('token.pos'),
-                'name': 'ner',
+            'name': 'pos',
-                'description': 'Label indicating the type of the entity',
+            'description': 'The detailed part-of-speech tag',
                'flags': ['required'],
            'labels': [
                {
-                        'id': UUIDnopaque('token.ner={}'.format(label)),
+                    'id': generate_id(f'token.pos={label}'),
                    'name': label,
                    'description': spacy.explain(label) or ''
-                    } for label in spacy.info(model)['labels']['ner']
+                } for label in spacy.info(model_name)['labels']['tagger']
            ]
        }
-        ]
+    )
-    },
+if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
    tags.append(
        {
-        'id': UUIDnopaque('s'),
+            'id': generate_id('ent'),
        'name': 's',
        'description': 'Encodes the start and end of a sentence',
        'properties': []
    },
    {
        'id': UUIDnopaque('ent'),
            'name': 'ent',
            'description': 'Encodes the start and end of a named entity',
            'properties': [
                {
-                'id': UUIDnopaque('ent.type'),
+                    'id': generate_id('ent.type'),
                    'name': 'type',
                    'description': 'Label indicating the type of the entity',
                'flags': ['required'],
                    'labels': [
                        {
-                        'id': UUIDnopaque('ent.type={}'.format(label)),
+                            'id': generate_id('ent.type={}'.format(label)),
                            'name': label,
                            'description': spacy.explain(label) or ''
-                    } for label in spacy.info(model)['labels']['ner']
+                        } for label in spacy.info(model_name)['labels']['ner']
                    ]
                }
            ]
        }
    )
 if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):  # noqa
    # TODO: Check if all languages support sent.sentiment
    tags.append(
        {
            'id': generate_id('s'),
            'name': 's',
            'description': 'Encodes the start and end of a sentence',
            'properties': [
                {
                    'id': generate_id('s.sentiment'),
                    'name': 'sentiment',
                    'description': 'A scalar value indicating the positivity or negativity of the sentence.'  # noqa
                }
            ]
        }
    )
 tags.append(token)
 annotations = []
@@ -246,60 +275,78 @@ chunk_offset = 0
 while text_chunks:
    text_chunk = text_chunks.pop(0)
    doc = nlp(text_chunk)
-    for token in doc:
+    if hasattr(doc, 'ents'):
-        if token.is_space:
+        for ent in doc.ents:
            continue
        if token.is_sent_start:
            annotation = {'start': token.sent.start_char + chunk_offset,
                          'end': token.sent.end_char + chunk_offset,
                          'tag_id': UUIDnopaque('s'),
                          'properties': []}
            annotations.append(annotation)
        # Check if the token is the start of an entity
        if token.ent_iob == 3:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
            annotation = {
                'start': ent.start_char + chunk_offset,
                'end': ent.end_char + chunk_offset,
-                        'tag_id': UUIDnopaque('ent'),
+                'tag_id': generate_id('ent'),
                'properties': [
                    {
-                                'property_id': UUIDnopaque('ent.type'),
+                        'property_id': generate_id('ent.type'),
-                                'value': token.ent_type_
+                        'value': ent.label_
                    }
                ]
            }
            annotations.append(annotation)
-                    break
+    if hasattr(doc, 'sents'):
        for sent in doc.sents:
            annotation = {
                'start': sent.start_char + chunk_offset,
                'end': sent.end_char + chunk_offset,
                'tag_id': generate_id('s'),
                'properties': []
            }
            if hasattr(sent, 'sentiment'):
                annotation['properties'].append(
                    {
                        'property_id': generate_id('s.sentiment'),
                        'value': sent.sentiment
                    }
                )
            annotations.append(annotation)
    for token in doc:
        annotation = {
            'start': token.idx + chunk_offset,
            'end': token.idx + len(token.text) + chunk_offset,
-            'tag_id': UUIDnopaque('token'),
+            'tag_id': generate_id('token'),
-            'properties': [
+            'properties': []
        }
        if hasattr(token, 'lemma_'):
            annotation['properties'].append(
                {
-                   'property_id': UUIDnopaque('token.pos'),
+                    'property_id': generate_id('token.lemma'),
                   'value': token.tag_
                },
                {
                    'property_id': UUIDnopaque('token.lemma'),
                    'value': token.lemma_
-                },
+                }
            )
        if hasattr(token, 'pos_'):
            annotation['properties'].append(
                {
-                    'property_id': UUIDnopaque('token.simple_pos'),
+                    'property_id': generate_id('token.simple_pos'),
                    'value': token.pos_
-                },
+                }
            )
        if hasattr(token, 'sentiment'):
            annotation['properties'].append(
                {
-                    'property_id': UUIDnopaque('token.ner'),
+                    'property_id': generate_id('token.sentiment'),
-                    'value': token.ent_type_ if token.ent_type_ else 'None'
+                    'value': token.sentiment
                }
-            ]
+            )
        if hasattr(token, 'tag_'):
            annotation['properties'].append(
                {
                   'property_id': generate_id('token.pos'),
                   'value': token.tag_
                }
            )
        annotations.append(annotation)
    chunk_offset += len(text_chunk)
    text_chunk = None
-with open(args.output, 'w') as output_file:
+with open(args.output_file, 'w') as output_file:
-    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
+    json.dump(
-              output_file, indent=4)
+        {'meta': meta, 'tags': tags, 'annotations': annotations},
        output_file,
        indent=4
    )
--- a/37
+++ b/37
@@ -6,31 +6,36 @@ from stand_off_data import StandOffData
 import hashlib
 import json
-
+parser = ArgumentParser(
-def main():
+    description='Convert plain text and JSON stand off to a CWB vrt file'
-    # Parse the given arguments
+)
-    parser = ArgumentParser(description='Create a vrt from JSON and txt')
+parser.add_argument(
-    parser.add_argument('text', help='Path to txt file')
+    '-s', '--stand-off-data-file',
-    parser.add_argument('stand_off_data', help='Path to JSON file')
+    help='JSON stand off data input file'
-    parser.add_argument('output', help='Path to vrt output file')
+)
 parser.add_argument(
    '-t', '--text-file',
    help='Plain text input file'
 )
 parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
 )
 args = parser.parse_args()
-    with open(args.stand_off_data) as stand_of_data_file:
+with open(args.stand_off_data_file) as stand_of_data_file:
    stand_off_data = StandOffData(json.load(stand_of_data_file))
-    with open(args.text, "rb") as text_file:
+with open(args.text_file, "rb") as text_file:
    text_md5 = hashlib.md5()
-        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
+    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)
    if text_md5.hexdigest() != stand_off_data.meta['file']['md5']:
        raise Exception('md5 not equal')
-    with open(args.text, encoding=stand_off_data.meta['file']['encoding']) as text_file:
+with open(args.text_file, encoding=stand_off_data.meta['file']['encoding']) as text_file:  # noqa
    text = text_file.read()
-    with open(args.output, 'w') as vrt_file:
+with open(args.output_file, 'w') as vrt_file:
    vrt_file.write(stand_off_data.to_vrt(text))
 if __name__ == '__main__':
    main()
--- a/wrapper/nlp
+++ b/wrapper/nlp
@@ -6,7 +6,7 @@ import os
 import subprocess
 import sys
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0b'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
 CONTAINER_LOG_DIR = '/logs'
@@ -19,17 +19,17 @@ parser.add_argument('-o', '--output-dir')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()
-cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
 if args.input_dir is not None:
-    mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
+    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-i', CONTAINER_INPUT_DIR]
 if args.output_dir is not None:
-    mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
+    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
 if args.log_dir is not None:
-    mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
+    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)