From a2e8e72e5424963bd0e9419286d5ed6c75b90a1b Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Thu, 27 Jan 2022 16:50:22 +0100 Subject: [PATCH] Bump spaCy version, bugfixes, codestyle --- Dockerfile | 28 +- LICENSE | 21 + README.md | 61 +- nlp | 359 +++++++----- packages/stand-off-data-py/setup.py | 4 +- .../models.py => stand_off_data.py} | 149 +++-- spacy-nlp | 523 ++++++++++-------- vrt-creator | 53 +- wrapper/nlp | 10 +- 9 files changed, 699 insertions(+), 509 deletions(-) create mode 100644 LICENSE rename packages/stand-off-data-py/{stand_off_data/models.py => stand_off_data.py} (58%) diff --git a/Dockerfile b/Dockerfile index f7d824c..9fbce60 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,14 @@ ENV LANG=C.UTF-8 RUN apt-get update \ && apt-get install --no-install-recommends --yes \ - wget + procps \ + python3.7 \ + python3-pip \ + wget \ + && python3 -m pip install \ + chardet \ + setuptools \ + wheel # Install the NLP pipeline and it's dependencies # ## Install pyFlow ## @@ -21,12 +28,12 @@ RUN wget --no-check-certificate --quiet \ && apt-get install --no-install-recommends --yes \ python2.7 \ && python2.7 setup.py build install \ - && cd .. \ + && cd - > /dev/null \ && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" ## Install spaCy ## -ENV SPACY_VERSION=3.0.5 +ENV SPACY_VERSION=3.2.1 RUN apt-get install --no-install-recommends --yes \ python3.7 \ python3-pip \ @@ -38,23 +45,14 @@ RUN apt-get install --no-install-recommends --yes \ && pip3 install "spacy==${SPACY_VERSION}" -# Only models that include the following components are compatibel: -# lemmatizer, ner, parser, senter, tagger, -ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,nl_core_news_md,pl_core_news_md,zh_core_web_md" -ENV SPACY_MODELS_VERSION=3.0.0 +ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,pl_core_news_md,zh_core_web_md" +ENV SPACY_MODELS_VERSION=3.2.0 RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done -## Further dependencies ## -RUN apt-get install --no-install-recommends --yes \ - procps \ - zip - - COPY packages . RUN cd stand-off-data-py \ - && python3 setup.py build \ - && python3 setup.py install \ + && python3 -m pip install . \ && cd - diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a374dbc --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Bielefeld University - CRC 1288 - INF + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
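Editor's note on the Dockerfile change above: it pins spaCy 3.2.1 and downloads the 3.2.0 medium models listed in SPACY_MODELS. A minimal sketch (not part of the patch) of how one might confirm after a build that each installed model actually ships the components the pipeline probes for; the component list simply mirrors the `has_pipe()` checks in `spacy-nlp` below and is an assumption, not something the image enforces.

```python
# Hypothetical post-build check; not part of the patch.
# Assumes the models from the Dockerfile's SPACY_MODELS are installed.
import spacy

SPACY_MODELS = [
    'de_core_news_md', 'en_core_web_md', 'it_core_news_md',
    'pl_core_news_md', 'zh_core_web_md'
]
# Components that spacy-nlp probes with nlp.has_pipe()
PROBED = [
    'lemmatizer', 'morphologizer', 'tagger', 'ner',
    'entity_ruler', 'parser', 'senter', 'sentencizer'
]

for model in SPACY_MODELS:
    nlp = spacy.load(model)
    present = [name for name in PROBED if nlp.has_pipe(name)]
    print(f'{model}: {", ".join(present) or "none of the probed components"}')
```

A model lacking, say, a lemmatizer would simply yield fewer annotation properties, matching the conditional tag definitions in `spacy-nlp`.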
diff --git a/README.md b/README.md index c932dcc..8c58a59 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,41 @@ # NLP - Natural Language Processing -This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided. +This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed. ## Software used in this pipeline implementation -- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian + +- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian + - Software from Debian Buster's free repositories - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 -- spaCy (3.0.5): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1 -- spaCy medium sized models (3.0.0): - - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0 - - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0 - - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0 - - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0 - - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.0.0 - - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0 +- spaCy (3.2.1): https://github.com/explosion/spaCy/releases/tag/v3.2.1 +- spaCy medium sized models (3.2.0): + - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.2.0 + - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.2.0 + - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.2.0 + - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.2.0 + - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.2.0 + - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.2.0 -## Use this image +## Installation -1. Create input and output directories for the pipeline. -``` bash -mkdir -p //input //output -``` +1. Install Docker and Python 3. +2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git` +3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0 nlp` +4. Add the wrapper script (`wrapper/nlp` relative to this README file) to your `${PATH}`. +5. Create working directories for the pipeline: `mkdir -p //{input,output}`. -2. Place your text files inside `//input`. Files should all contain text of the same language. +## Use the Pipeline + +1. Place your plain text files inside `//input`. Files should all contain text of the same language. +2. Clear your `//output` directory. 3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details. -``` -# Option one: Use the wrapper script -## Install the wrapper script (only on first run). 
Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executeable and add it to your ${PATH} +```bash cd / -nlp -i input -l -o output - -# Option two: Classic Docker style -docker run \ - --rm \ - -it \ - -u $(id -u $USER):$(id -g $USER) \ - -v //input:/input \ - -v //output:/output \ - gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \ - -i /input \ - -l - -o /output \ - +nlp \ + --input-dir input \ + --output-dir output \ + -m ``` - 4. Check your results in the `//output` directory. diff --git a/nlp b/nlp index 3ffefab..b0c6c54 100755 --- a/nlp +++ b/nlp @@ -1,73 +1,141 @@ #!/usr/bin/env python2.7 # coding=utf-8 -"""A NLP pipeline for text file processing.""" - -__author__ = 'Patrick Jentsch ,' \ - 'Stephan Porada ' -__version__ = '1.0.0' +''' A NLP pipeline for text file processing. ''' +__version__ = '0.1.0' from argparse import ArgumentParser from pyflow import WorkflowRunner -import multiprocessing +import json import os import sys -SPACY_MODELS = {'de': 'de_core_news_md', - 'en': 'en_core_web_md', - 'it': 'it_core_news_md', - 'nl': 'nl_core_news_md', - 'pl': 'pl_core_news_md', - 'zh': 'zh_core_web_md'} +SPACY_MODELS = { + 'de': 'de_core_news_md', + 'en': 'en_core_web_md', + 'it': 'it_core_news_md', + 'nl': 'nl_core_news_md', + 'pl': 'pl_core_news_md', + 'zh': 'zh_core_web_md' +} -class NLPPipelineJob: - """An NLP pipeline job class +class PipelineJob: + ''' + NLP pipeline job class. - Each input file of the pipeline is represented as an NLP pipeline job, - which holds all necessary information for the pipeline to process it. + Each plain text input file of the pipeline is represented as an NLP + pipeline job, which holds all necessary information for the pipeline to + process it. Arguments: file -- Path to the file - output_dir -- Path to a directory, where job results a stored - """ + output_dir -- Path to a directory, where job results are stored + ''' def __init__(self, file, output_dir): self.file = file - self.name = os.path.basename(file).rsplit('.', 1)[0] + self.name = os.path.basename(file)[:-4] self.output_dir = output_dir - catma_stand_off_data_file = file.rsplit('.', 1)[0] + '.catma-stand-off.json' # noqa - if os.path.exists(catma_stand_off_data_file): - self.catma_stand_off_data_file = catma_stand_off_data_file - else: - self.catma_stand_off_data_file = None -class NLPPipeline(WorkflowRunner): - def __init__(self, input_dir, output_dir, check_encoding, lang, zip): +class NLPWorkflow(WorkflowRunner): + def __init__(self, job, model, check_encoding=False, id_prefix=''): + self.job = job + self.model = model + self.check_encoding = check_encoding + self.id_prefix = id_prefix + + def workflow(self): + ''' + ' ################################################## + ' # spacy # + ' ################################################## + ''' + n_cores = 1 + mem_mb = min(1024, self.getMemMb()) + cmd = 'spacy-nlp' + cmd += ' --input-file "{}"'.format(self.job.file) + cmd += ' --output-file "{}"'.format( + os.path.join(self.job.output_dir, '{}.json'.format(self.job.name)) + ) + cmd += ' -m "{}"'.format(self.model) + if self.check_encoding: + cmd += ' --check-encoding' + cmd += ' --id-prefix "{}"'.format(self.id_prefix) + self.addTask( + 'spacy', + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) + + +class CreateVrtWorkflow(WorkflowRunner): + def __init__(self, job): + self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # vrt-creator # + ' ################################################## 
+ ''' + n_cores = 1 + mem_mb = min(256, self.getMemMb()) + cmd = 'vrt-creator' + cmd += ' --stand-off-data-file "{}"'.format( + os.path.join(self.job.output_dir, '{}.json'.format(self.job.name)) + ) + cmd += ' --text-file "{}"'.format(self.job.file) + cmd += ' --output-file "{}"'.format( + os.path.join(self.job.output_dir, '{}.vrt'.format(self.job.name)) + ) + self.addTask( + 'vrt_creator', + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) + + +class MainWorkflow(WorkflowRunner): + def __init__( + self, + input_dir, + model, + output_dir, + check_encoding=False, + id_prefix='' + ): self.input_dir = input_dir + self.model = model self.output_dir = output_dir self.check_encoding = check_encoding - self.lang = lang - self.zip = zip - self.jobs = collect_jobs(self.input_dir, self.output_dir) + self.id_prefix = id_prefix + self.jobs = [] + + def collect_jobs(self): + self.jobs = [] + for file in os.listdir(self.input_dir): + if os.path.isdir(os.path.join(self.input_dir, file)): + continue + if not file.lower().endswith('.txt'): + continue + job = PipelineJob( + os.path.join(self.input_dir, file), + os.path.join(self.output_dir, file) + ) + self.jobs.append(job) def workflow(self): if not self.jobs: return - ''' - ' ################################################## - ' # setup output directory # - ' ################################################## - ''' - setup_output_directory_tasks = [] - for i, job in enumerate(self.jobs): - cmd = 'mkdir -p "{}"'.format(job.output_dir) - lbl = 'setup_output_directory_-_{}'.format(i) - task = self.addTask(command=cmd, label=lbl) - setup_output_directory_tasks.append(task) + # Create output and temporary directories + for job in self.jobs: + os.mkdir(job.output_dir) ''' ' ################################################## @@ -75,106 +143,116 @@ class NLPPipeline(WorkflowRunner): ' ################################################## ''' nlp_tasks = [] - n_cores = max(1, int(self.getNCores() / len(self.jobs))) - mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs))) for i, job in enumerate(self.jobs): - output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name)) # noqa - cmd = 'spacy-nlp' - cmd += ' -l "{}"'.format(self.lang) - cmd += ' --check-encoding' if self.check_encoding else '' - cmd += ' "{}"'.format(job.file) - cmd += ' "{}"'.format(output_file) - deps = 'setup_output_directory_-_{}'.format(i) - lbl = 'nlp_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl, - memMb=mem_mb, nCores=n_cores) + task = self.addWorkflowTask( + 'nlp_-_{}'.format(i), + NLPWorkflow( + job, + self.model, + check_encoding=self.check_encoding, + id_prefix=self.id_prefix + ) + ) nlp_tasks.append(task) ''' ' ################################################## - ' # vrt creation # + ' # create vrt # ' ################################################## ''' - vrt_creation_tasks = [] + create_vrt_tasks = [] for i, job in enumerate(self.jobs): - output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name)) # noqa - nopaque_stand_off_data_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name)) # noqa - cmd = 'vrt-creator' - cmd += ' "{}"'.format(job.file) - cmd += ' "{}"'.format(nopaque_stand_off_data_file) - if job.catma_stand_off_data_file is not None: - cmd += ' --catma-stand-off-data "{}"'.format(job.catma_stand_off_data_file) # noqa - cmd += ' "{}"'.format(output_file) - deps = 'nlp_-_{}'.format(i) - lbl = 'vrt_creation_-_{}'.format(i) - task = self.addTask(command=cmd, 
dependencies=deps, label=lbl) - vrt_creation_tasks.append(task) + task = self.addWorkflowTask( + 'create_vrt_-_{}'.format(i), + CreateVrtWorkflow(job), + dependencies='nlp_-_{}'.format(i) + ) + create_vrt_tasks.append(task) - ''' - ' ################################################## - ' # zip creation # - ' ################################################## - ''' - zip_creation_tasks = [] - if self.zip is not None: - cmd = 'cd "{}"'.format(self.output_dir) - cmd += ' && ' - cmd += 'zip' - cmd += ' -r' - cmd += ' "{}.zip" .'.format(self.zip) - cmd += ' -x "pyflow.data*"' - cmd += ' -i "*.vrt" "*.json"' - cmd += ' && ' - cmd += 'cd -' - deps = vrt_creation_tasks - lbl = 'zip_creation' - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - zip_creation_tasks.append(task) - - -def collect_jobs(input_dir, output_dir): - jobs = [] - for file in os.listdir(input_dir): - if os.path.isdir(os.path.join(input_dir, file)): - continue - if file.lower().endswith('.txt'): - job = NLPPipelineJob(os.path.join(input_dir, file), - os.path.join(output_dir, file)) - jobs.append(job) - return jobs + self.waitForTasks() + outputs = [] + for job in self.jobs: + # Track output files + relative_output_dir = os.path.relpath( + job.output_dir, + start=self.output_dir + ) + outputs.append( + { + 'description': 'JSON stand off data', + 'file': os.path.join( + relative_output_dir, + '{}.json'.format(job.name) + ), + 'mimetype': 'application/json' + } + ) + outputs.append( + { + 'description': 'CWB vrt file', + 'file': os.path.join( + relative_output_dir, + '{}.vrt'.format(job.name) + ), + 'mimetype': 'application/vrt+xml' + } + ) + with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f: + json.dump(outputs, f, indent=4) def parse_args(): - parser = ArgumentParser(description='NLP pipeline for TXT file processing', - prog='NLP pipeline') - parser.add_argument('-i', '--input-dir', - help='Input directory', - required=True) - parser.add_argument('-o', '--output-dir', - help='Output directory', - required=True) - parser.add_argument('-l', '--language', - choices=SPACY_MODELS.keys(), - help='Language of the input (2-character ISO 639-1 language codes)', # noqa - required=True) - parser.add_argument('--check-encoding', - action='store_true', - help='Check encoding of the input file, UTF-8 is used instead') # noqa - parser.add_argument('--log-dir', - help='Logging directory') - parser.add_argument('--mem-mb', - help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))', # noqa - type=int) - parser.add_argument('--n-cores', - default=min(4, multiprocessing.cpu_count()), - help='Number of CPU threads to be used (Default: min(4, number of CPUs))', # noqa - type=int) - parser.add_argument('--zip', - help='Create one zip file per filetype') - parser.add_argument('-v', '--version', - action='version', - help='Returns the current version of the NLP pipeline', - version='%(prog)s {}'.format(__version__)) + parser = ArgumentParser( + description='NLP pipeline for plain text file processing' + ) + parser.add_argument( + '-i', '--input-dir', + help='Input directory', + required=True + ) + parser.add_argument( + '-o', '--output-dir', + help='Output directory', + required=True + ) + parser.add_argument( + '-m', '--model', + choices=SPACY_MODELS.keys(), + help='The model to be used', + required=True + ) + parser.add_argument( + '--check-encoding', + action='store_true', + help='Check encoding of the input file, UTF-8 is used instead' + ) + parser.add_argument( + 
'--id-prefix', + default='', + help='A prefix for all the ids within the stand off annotations' + ) + parser.add_argument( + '--log-dir', + help='Logging directory (Default: --output-dir)' + ) + parser.add_argument( + '--mem-mb', + help='Amount of system memory to be used ' + '(Default: min(--n-cores * 1024, available system memory))', + type=int + ) + parser.add_argument( + '--n-cores', + default=1, + help='Number of CPU threads to be used', + type=int + ) + parser.add_argument( + '-v', '--version', + action='version', + help='Returns the current version of the NLP pipeline', + version='%(prog)s {}'.format(__version__) + ) args = parser.parse_args() # Set some tricky default values and check for insufficient input @@ -184,20 +262,27 @@ def parse_args(): raise Exception('--n-cores must be greater or equal 1') if args.mem_mb is None: max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0]) - args.mem_mb = min(args.n_cores * 2048, max_mem_mb) - if args.mem_mb < 2048: - raise Exception('--mem-mb must be greater or equal 2048') - if args.zip is not None and args.zip.lower().endswith('.zip'): - # Remove .zip file extension if provided - args.zip = args.zip[:-4] - args.zip = args.zip if args.zip else 'output' + args.mem_mb = min(args.n_cores * 1024, max_mem_mb) + if args.mem_mb < 1024: + raise Exception('--mem-mb must be greater or equal 1024') return args def main(): args = parse_args() - nlp_pipeline = NLPPipeline(args.input_dir, args.output_dir, args.check_encoding, args.language, args.zip) # noqa - retval = nlp_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa + main_workflow = MainWorkflow( + args.input_dir, + args.model, + args.output_dir, + check_encoding=args.check_encoding, + id_prefix=args.id_prefix + ) + main_workflow.collect_jobs() + retval = main_workflow.run( + dataDirRoot=args.log_dir, + memMb=args.mem_mb, + nCores=args.n_cores + ) sys.exit(retval) diff --git a/packages/stand-off-data-py/setup.py b/packages/stand-off-data-py/setup.py index 2d263ad..b7d5cc3 100644 --- a/packages/stand-off-data-py/setup.py +++ b/packages/stand-off-data-py/setup.py @@ -1,14 +1,14 @@ import setuptools setuptools.setup( - name='stand-off-data', + name='Stand off data', author='Patrick Jentsch', author_email='p.jentsch@uni-bielefeld.de', description='A python library to handle stand off data.', + py_modules=['stand_off_data'], classifiers=[ 'Programming Language :: Python :: 3', 'Operating System :: OS Independent', ], - packages=setuptools.find_packages(), python_requires='>=3.5' ) diff --git a/packages/stand-off-data-py/stand_off_data/models.py b/packages/stand-off-data-py/stand_off_data.py similarity index 58% rename from packages/stand-off-data-py/stand_off_data/models.py rename to packages/stand-off-data-py/stand_off_data.py index 0206119..cb10163 100644 --- a/packages/stand-off-data-py/stand_off_data/models.py +++ b/packages/stand-off-data-py/stand_off_data.py @@ -7,13 +7,15 @@ class StandOffData: self.lookup = {} for x in attrs.get('tags', []): self.add_tag_definition(x) - self.annotations = [TagAnnotation(x, self.lookup) - for x in attrs.get('annotations', [])] + self.annotations = [ + TagAnnotation(x, self.lookup) + for x in attrs.get('annotations', []) + ] def add_tag_definition(self, attrs): tag_definition = TagDefinition(attrs) if tag_definition.id in self.lookup: - raise Exception('Tag id already in use: {}'.format(self.to_dict())) + raise Exception(f'Tag id already in use: {self.to_dict()}') self.lookup[tag_definition.id] = tag_definition 
def to_dict(self): @@ -42,7 +44,9 @@ class StandOffData: if ((p_attr.start >= next_p_attr.start) and (p_attr.start < next_p_attr.end) # noqa or (p_attr.end > next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa raise Exception( - 'Positional attribute overlaps another: {}<->{}'.format(p_attr.to_dict(), next_p_attr.to_dict())) + 'Positional attribute overlaps another: ' + f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}' + ) # Check for s_attr<->p_attr overlap for i, s_attr in enumerate(s_attrs): for p_attr in p_attrs: @@ -56,8 +60,11 @@ class StandOffData: s_attrs[i].end = p_attr.end # Check if s_attr starts/ends before/after p_attr if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start: - # No further Checking needed (just because p_attrs are sorted) + # No further Checking needed (because p_attrs are sorted) break + p_attr_buffer = {} + for i, p_attr in enumerate(p_attrs): + p_attr_buffer[p_attr.start] = i s_attr_start_buffer = {} s_attr_end_buffer = {} for i, s_attr in enumerate(s_attrs): @@ -66,34 +73,56 @@ class StandOffData: else: s_attr_start_buffer[s_attr.start] = [i] if s_attr.end in s_attr_end_buffer: - s_attr_end_buffer[s_attr.end].append(i) + s_attr_end_buffer[s_attr.end].insert(0, i) else: s_attr_end_buffer[s_attr.end] = [i] vrt = '' vrt += '\n' - for p_attr in p_attrs: - # s_attr_ends - for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}: # noqa - s_attr_indexes = s_attr_end_buffer.pop(k) + current_position = 0 + text_len = len(text) + # As long as we have something in our buffers we process it + while current_position <= text_len: + # s_attr endings + # for k in {k: v for k, v in s_attr_end_buffer.items() if k <= current_position}: # noqa + if current_position in s_attr_end_buffer: + # s_attr_indexes = s_attr_end_buffer.pop(k) + s_attr_indexes = s_attr_end_buffer.pop(current_position) for s_attr_index in s_attr_indexes: s_attr = s_attrs[s_attr_index] - vrt += '\n'.format(escape(s_attr.name)) - # s_attr_starts - for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}: # noqa - s_attr_indexes = s_attr_start_buffer.pop(k) + vrt += f'\n' + # s_attrs starts + # for k in {k: v for k, v in s_attr_start_buffer.items() if k <= current_position}: # noqa + if current_position in s_attr_start_buffer: + # s_attr_indexes = s_attr_start_buffer.pop(k) + s_attr_indexes = s_attr_start_buffer.pop(current_position) for s_attr_index in s_attr_indexes: s_attr = s_attrs[s_attr_index] - foo = '' + vrt += f'<{escape(s_attr.name)}' for property in s_attr.properties: - foo += ' {}="{}"'.format(escape(property.name), - escape(property.value)) - vrt += '<{}{}>\n'.format(escape(s_attr.name), foo) - foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None} # noqa + vrt += f' {escape(property.name)}="{escape(str(property.value))}"' # noqa + vrt += '>\n' + # p_attrs + if current_position not in p_attr_buffer: + current_position += 1 + continue + p_attr_index = p_attr_buffer.pop(current_position) + p_attr = p_attrs[p_attr_index] + if text[p_attr.start:p_attr.end].isspace(): + current_position = p_attr.end + continue + _p_attr = { + 'lemma': 'None', + 'pos': 'None', + 'simple_pos': 'None', + 'word': 'None' + } for property in p_attr.properties: - foo[property.name] = escape(property.value) - foo['word'] = escape(text[p_attr.start:p_attr.end]) - vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format( - **foo) + if property.name not in _p_attr: + continue + _p_attr[property.name] = escape(str(property.value)) + 
_p_attr['word'] = escape(text[p_attr.start:p_attr.end]) + vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**_p_attr) + current_position = p_attr.end vrt += '\n' return vrt @@ -110,15 +139,15 @@ class TagAnnotation: ] ''' Sanity checks ''' if self.tag_id not in self.lookup: - raise Exception('Unknown tag: {}'.format(self.to_dict())) + raise Exception(f'Unknown tag: {self.to_dict()}') if self.end < self.start: - raise Exception('Annotation end less then start: ' - '{}'.format(self.to_dict())) - property_ids = [x.property_id for x in self.properties] - for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items(): # noqa - if required_property_id not in property_ids: - raise Exception('Missing required property: ' - '{}'.format(required_property.to_dict())) + raise Exception(f'Annotation end less then start: {self.to_dict()}') # noqa + # property_ids = [x.property_id for x in self.properties] + # for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items(): # noqa + # if required_property_id not in property_ids: + # raise Exception( + # f'Missing required property: {required_property.to_dict()}' + # ) @property def name(self): @@ -134,33 +163,45 @@ class TagAnnotation: def __lt__(self, other): if self.start == other.start: - return self.name == 'token' and other.name != 'token' + if self.name == 'token' and other.name != 'token': + return False + elif self.name != 'token' and other.name == 'token': + return True + else: + return self.end > other.end else: return self.start < other.start def __le__(self, other): if self.start == other.start: - return self.name == 'token' or other.name != 'token' + if self.name == 'token' and other.name != 'token': + return False + elif self.name != 'token' and other.name == 'token': + return True + else: + return self.end >= other.end else: - return self.start < other.start + return self.start <= other.start def __eq__(self, other): - return self.start == other.start and self.name == other.name + if self.start == other.start: + if self.name == 'token' and other.name != 'token': + return False + elif self.name != 'token' and other.name == 'token': + return False + else: + return self.end == other.end + else: + return False def __ne__(self, other): - return self.start != other.start and self.name != other.name + return not self == other def __gt__(self, other): - if self.start == other.start: - return self.name != 'token' and other.name == 'token' - else: - return self.start > other.start + return not self <= other def __ge__(self, other): - if self.start == other.start: - return self.name != 'token' or other.name == 'token' - else: - return self.start > other.start + return not self < other class PropertyAnnotation: @@ -171,7 +212,7 @@ class PropertyAnnotation: # TODO: Process attrs['possibleValues'] as self.labels (no id?) 
''' Sanity checks ''' if self.property_id not in self.lookup: - raise Exception('Unknown property: {}'.format(self.to_dict())) + raise Exception(f'Unknown property: {self.to_dict()}') @property def name(self): @@ -197,14 +238,14 @@ class TagDefinition: def add_property_definition(self, attrs): property_definition = PropertyDefinition(attrs) if property_definition.id in self.properties: - raise Exception('Property id already in use: ' - '{}'.format(property_definition.to_dict())) + raise Exception( + f'Property id already in use: {property_definition.to_dict()}') self.properties[property_definition.id] = property_definition - @property - def required_properties(self): - return {property.id: property for property in self.properties.values() - if property.is_required} + # @property + # def required_properties(self): + # return {property.id: property for property in self.properties.values() + # if property.is_required} def to_dict(self): return { @@ -223,9 +264,9 @@ class PropertyDefinition: self.flags = attrs.get('flags', []) self.labels = attrs.get('labels', []) - @property - def is_required(self): - return 'required' in self.flags + # @property + # def is_required(self): + # return 'required' in self.flags @property def has_multiple_values(self): diff --git a/spacy-nlp b/spacy-nlp index be4dfa3..1b9cf9a 100755 --- a/spacy-nlp +++ b/spacy-nlp @@ -11,46 +11,64 @@ import textwrap import uuid -def UUIDnopaque(name): - return 'nopaque_{}'.format( - uuid.uuid3(uuid.NAMESPACE_DNS, - '{}@nopaque.sfb1288.uni-bielefeld.de'.format(name)) - ) - - -spacy_models = {spacy.info(pipeline)['lang']: pipeline - for pipeline in spacy.info()['pipelines']} +spacy_models = { + spacy.info(pipeline)['lang']: pipeline + for pipeline in spacy.info()['pipelines'] +} # Parse the given arguments -parser = ArgumentParser(description='Create annotations for a given txt file') -parser.add_argument('input', help='Path to txt input file') -parser.add_argument('output', help='Path to JSON output file') -parser.add_argument('-l', '--language', - choices=spacy_models.keys(), - help='Language of the input (2-character ISO 639-1 language codes)', # noqa - required=True) -parser.add_argument('-c', '--check-encoding', - action='store_true', - help='Check encoding of the input file, UTF-8 is used instead') # noqa +parser = ArgumentParser( + description='Create annotations for a given plain txt file' +) +parser.add_argument( + '-i', '--input-file', + help='Input file' +) +parser.add_argument( + '-o', '--output-file', + help='Output file', + required=True +) +parser.add_argument( + '-m', '--model', + choices=spacy_models.keys(), + help='The model to be used', + required=True +) +parser.add_argument( + '-c', '--check-encoding', + action='store_true', + help='Check encoding of the input file, UTF-8 is used instead' +) +parser.add_argument( + '--id-prefix', + default='', + help='A prefix for all the ids within the stand off annotations' +) args = parser.parse_args() -with open(args.input, "rb") as text_file: + +def generate_id(name): + return f'{args.id_prefix}{uuid.uuid3(uuid.NAMESPACE_DNS, name)}' + + +with open(args.input_file, "rb") as input_file: if args.check_encoding: - encoding = chardet.detect(text_file.read())['encoding'] + encoding = chardet.detect(input_file.read())['encoding'] else: encoding = 'utf-8' - text_file.seek(0) + input_file.seek(0) text_md5 = hashlib.md5() - for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): + for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''): 
text_md5.update(chunk) # Load the text contents from the input file -with open(args.input, encoding=encoding) as text_file: +with open(args.input_file, encoding=encoding) as input_file: # spaCy NLP is limited to strings with a maximum of 1 million characters at # once. So we split it into suitable chunks. text_chunks = textwrap.wrap( - text_file.read(), + input_file.read(), 1000000, break_long_words=False, break_on_hyphens=False, @@ -59,186 +77,197 @@ with open(args.input, encoding=encoding) as text_file: replace_whitespace=False ) -model = spacy_models[args.language] -nlp = spacy.load(model) +model_name = spacy_models[args.model] +nlp = spacy.load(model_name) meta = { 'generator': { - 'name': 'nopaque NLP service', - 'version': '1.0.0', + 'name': 'nopaque spacy NLP', + 'version': '0.1.0', 'arguments': { 'check_encoding': args.check_encoding, - 'language': args.language + 'model': args.model } }, 'file': { 'encoding': encoding, 'md5': text_md5.hexdigest(), - 'name': os.path.basename(args.input) + 'name': os.path.basename(args.input_file) } } -tags = [ +tags = [] +token = { + 'id': generate_id('token'), + 'name': 'token', + 'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.', # noqa + 'properties': [] +} +# TODO: Check if all languages support token.sentiment +token['properties'].append( { - 'id': UUIDnopaque('token'), - 'name': 'token', - 'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.', - 'properties': [ - { - 'id': UUIDnopaque('token.lemma'), - 'name': 'lemma', - 'description': 'The base form of the word', - 'flags': ['required'], - 'labels': [] - }, - { - 'id': UUIDnopaque('token.pos'), - 'name': 'pos', - 'description': 'The detailed part-of-speech tag', - 'flags': ['required'], - 'labels': [ - { - 'id': UUIDnopaque('token.pos={}'.format(label)), - 'name': label, - 'description': spacy.explain(label) or '' - } for label in spacy.info(model)['labels']['tagger'] - ] - }, - { - 'id': UUIDnopaque('token.simple_pos'), - 'name': 'simple_pos', - 'description': 'The simple UPOS part-of-speech tag', - 'flags': ['required'], - 'labels': [ - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'ADJ', - 'description': 'adjective' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'ADP', - 'description': 'adposition' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'ADV', - 'description': 'adverb' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'AUX', - 'description': 'auxiliary verb' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'CONJ', - 'description': 'coordinating conjunction' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'DET', - 'description': 'determiner' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'INTJ', - 'description': 'interjection' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'NOUN', - 'description': 'noun' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'NUM', - 'description': 'numeral' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'PART', - 'description': 'particle' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'PRON', - 'description': 'pronoun' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'PROPN', - 'description': 'proper noun' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'PUNCT', - 'description': 'punctuation' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'SCONJ', - 'description': 
'subordinating conjunction' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'SYM', - 'description': 'symbol' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'VERB', - 'description': 'verb' - }, - { - 'id': UUIDnopaque('token.simple_pos=ADJ'), - 'name': 'X', - 'description': 'other' - } - ] - }, - { - 'id': UUIDnopaque('token.ner'), - 'name': 'ner', - 'description': 'Label indicating the type of the entity', - 'flags': ['required'], - 'labels': [ - { - 'id': UUIDnopaque('token.ner={}'.format(label)), - 'name': label, - 'description': spacy.explain(label) or '' - } for label in spacy.info(model)['labels']['ner'] - ] - } - ] - }, - { - 'id': UUIDnopaque('s'), - 'name': 's', - 'description': 'Encodes the start and end of a sentence', - 'properties': [] - }, - { - 'id': UUIDnopaque('ent'), - 'name': 'ent', - 'description': 'Encodes the start and end of a named entity', - 'properties': [ - { - 'id': UUIDnopaque('ent.type'), - 'name': 'type', - 'description': 'Label indicating the type of the entity', - 'flags': ['required'], - 'labels': [ - { - 'id': UUIDnopaque('ent.type={}'.format(label)), - 'name': label, - 'description': spacy.explain(label) or '' - } for label in spacy.info(model)['labels']['ner'] - ] - } - ] + 'id': generate_id('token.sentiment'), + 'name': 'sentiment', + 'description': 'A scalar value indicating the positivity or negativity of the token.' # noqa } -] +) +if nlp.has_pipe('lemmatizer'): + token['properties'].append( + { + 'id': generate_id('token.lemma'), + 'name': 'lemma', + 'description': 'The base form of the word' + } + ) +if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'): + token['properties'].append( + { + 'id': generate_id('token.simple_pos'), + 'name': 'simple_pos', + 'description': 'The simple UPOS part-of-speech tag', + 'labels': [ + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'ADJ', + 'description': 'adjective' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'ADP', + 'description': 'adposition' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'ADV', + 'description': 'adverb' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'AUX', + 'description': 'auxiliary verb' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'CONJ', + 'description': 'coordinating conjunction' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'DET', + 'description': 'determiner' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'INTJ', + 'description': 'interjection' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'NOUN', + 'description': 'noun' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'NUM', + 'description': 'numeral' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'PART', + 'description': 'particle' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'PRON', + 'description': 'pronoun' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'PROPN', + 'description': 'proper noun' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'PUNCT', + 'description': 'punctuation' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'SCONJ', + 'description': 'subordinating conjunction' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'SYM', + 'description': 'symbol' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'VERB', + 'description': 'verb' + }, + { + 'id': generate_id('token.simple_pos=ADJ'), + 'name': 'X', + 'description': 'other' 
+ } + ] + } + ) +if nlp.has_pipe('tagger'): + token['properties'].append( + { + 'id': generate_id('token.pos'), + 'name': 'pos', + 'description': 'The detailed part-of-speech tag', + 'labels': [ + { + 'id': generate_id(f'token.pos={label}'), + 'name': label, + 'description': spacy.explain(label) or '' + } for label in spacy.info(model_name)['labels']['tagger'] + ] + } + ) +if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'): + tags.append( + { + 'id': generate_id('ent'), + 'name': 'ent', + 'description': 'Encodes the start and end of a named entity', + 'properties': [ + { + 'id': generate_id('ent.type'), + 'name': 'type', + 'description': 'Label indicating the type of the entity', + 'labels': [ + { + 'id': generate_id('ent.type={}'.format(label)), + 'name': label, + 'description': spacy.explain(label) or '' + } for label in spacy.info(model_name)['labels']['ner'] + ] + } + ] + } + ) +if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'): # noqa + # TODO: Check if all languages support sent.sentiment + tags.append( + { + 'id': generate_id('s'), + 'name': 's', + 'description': 'Encodes the start and end of a sentence', + 'properties': [ + { + 'id': generate_id('s.sentiment'), + 'name': 'sentiment', + 'description': 'A scalar value indicating the positivity or negativity of the sentence.' # noqa + } + ] + } + ) +tags.append(token) annotations = [] @@ -246,60 +275,78 @@ chunk_offset = 0 while text_chunks: text_chunk = text_chunks.pop(0) doc = nlp(text_chunk) - for token in doc: - if token.is_space: - continue - if token.is_sent_start: - annotation = {'start': token.sent.start_char + chunk_offset, - 'end': token.sent.end_char + chunk_offset, - 'tag_id': UUIDnopaque('s'), - 'properties': []} - annotations.append(annotation) - # Check if the token is the start of an entity - if token.ent_iob == 3: - for ent_candidate in token.sent.ents: - if ent_candidate.start_char == token.idx: - ent = ent_candidate - annotation = { - 'start': ent.start_char + chunk_offset, - 'end': ent.end_char + chunk_offset, - 'tag_id': UUIDnopaque('ent'), - 'properties': [ - { - 'property_id': UUIDnopaque('ent.type'), - 'value': token.ent_type_ - } - ] + if hasattr(doc, 'ents'): + for ent in doc.ents: + annotation = { + 'start': ent.start_char + chunk_offset, + 'end': ent.end_char + chunk_offset, + 'tag_id': generate_id('ent'), + 'properties': [ + { + 'property_id': generate_id('ent.type'), + 'value': ent.label_ } - annotations.append(annotation) - break + ] + } + annotations.append(annotation) + if hasattr(doc, 'sents'): + for sent in doc.sents: + annotation = { + 'start': sent.start_char + chunk_offset, + 'end': sent.end_char + chunk_offset, + 'tag_id': generate_id('s'), + 'properties': [] + } + if hasattr(sent, 'sentiment'): + annotation['properties'].append( + { + 'property_id': generate_id('s.sentiment'), + 'value': sent.sentiment + } + ) + annotations.append(annotation) + for token in doc: annotation = { 'start': token.idx + chunk_offset, 'end': token.idx + len(token.text) + chunk_offset, - 'tag_id': UUIDnopaque('token'), - 'properties': [ - { - 'property_id': UUIDnopaque('token.pos'), - 'value': token.tag_ - }, - { - 'property_id': UUIDnopaque('token.lemma'), - 'value': token.lemma_ - }, - { - 'property_id': UUIDnopaque('token.simple_pos'), - 'value': token.pos_ - }, - { - 'property_id': UUIDnopaque('token.ner'), - 'value': token.ent_type_ if token.ent_type_ else 'None' - } - ] + 'tag_id': generate_id('token'), + 'properties': [] } + if hasattr(token, 'lemma_'): + 
annotation['properties'].append( + { + 'property_id': generate_id('token.lemma'), + 'value': token.lemma_ + } + ) + if hasattr(token, 'pos_'): + annotation['properties'].append( + { + 'property_id': generate_id('token.simple_pos'), + 'value': token.pos_ + } + ) + if hasattr(token, 'sentiment'): + annotation['properties'].append( + { + 'property_id': generate_id('token.sentiment'), + 'value': token.sentiment + } + ) + if hasattr(token, 'tag_'): + annotation['properties'].append( + { + 'property_id': generate_id('token.pos'), + 'value': token.tag_ + } + ) annotations.append(annotation) chunk_offset += len(text_chunk) text_chunk = None -with open(args.output, 'w') as output_file: - json.dump({'meta': meta, 'tags': tags, 'annotations': annotations}, - output_file, indent=4) +with open(args.output_file, 'w') as output_file: + json.dump( + {'meta': meta, 'tags': tags, 'annotations': annotations}, + output_file, + indent=4 + ) diff --git a/vrt-creator b/vrt-creator index 3443168..3efb8f5 100755 --- a/vrt-creator +++ b/vrt-creator @@ -6,31 +6,36 @@ from stand_off_data import StandOffData import hashlib import json +parser = ArgumentParser( + description='Convert plain text and JSON stand off to a CWB vrt file' +) +parser.add_argument( + '-s', '--stand-off-data-file', + help='JSON stand off data input file' +) +parser.add_argument( + '-t', '--text-file', + help='Plain text input file' +) +parser.add_argument( + '-o', '--output-file', + help='Output file', + required=True +) +args = parser.parse_args() -def main(): - # Parse the given arguments - parser = ArgumentParser(description='Create a vrt from JSON and txt') - parser.add_argument('text', help='Path to txt file') - parser.add_argument('stand_off_data', help='Path to JSON file') - parser.add_argument('output', help='Path to vrt output file') - args = parser.parse_args() +with open(args.stand_off_data_file) as stand_of_data_file: + stand_off_data = StandOffData(json.load(stand_of_data_file)) - with open(args.stand_off_data) as stand_of_data_file: - stand_off_data = StandOffData(json.load(stand_of_data_file)) +with open(args.text_file, "rb") as text_file: + text_md5 = hashlib.md5() + for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): + text_md5.update(chunk) + if text_md5.hexdigest() != stand_off_data.meta['file']['md5']: + raise Exception('md5 not equal') - with open(args.text, "rb") as text_file: - text_md5 = hashlib.md5() - for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): # noqa - text_md5.update(chunk) - if text_md5.hexdigest() != stand_off_data.meta['file']['md5']: - raise Exception('md5 not equal') +with open(args.text_file, encoding=stand_off_data.meta['file']['encoding']) as text_file: # noqa + text = text_file.read() - with open(args.text, encoding=stand_off_data.meta['file']['encoding']) as text_file: - text = text_file.read() - - with open(args.output, 'w') as vrt_file: - vrt_file.write(stand_off_data.to_vrt(text)) - - -if __name__ == '__main__': - main() +with open(args.output_file, 'w') as vrt_file: + vrt_file.write(stand_off_data.to_vrt(text)) diff --git a/wrapper/nlp b/wrapper/nlp index 7e0a3a4..54c04fa 100755 --- a/wrapper/nlp +++ b/wrapper/nlp @@ -6,7 +6,7 @@ import os import subprocess import sys -CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0b' +CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0' CONTAINER_INPUT_DIR = '/input' CONTAINER_OUTPUT_DIR = '/output' CONTAINER_LOG_DIR = '/logs' @@ -19,17 +19,17 @@ parser.add_argument('-o', 
                     '--output-dir')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()
-cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
 if args.input_dir is not None:
-    mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
+    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-i', CONTAINER_INPUT_DIR]
 if args.output_dir is not None:
-    mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
+    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
 if args.log_dir is not None:
-    mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
+    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)
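Editor's note: the wrapper above only assembles a `docker run` call from the `-i`/`-o`/`--log-dir` arguments. A sketch of the equivalent manual invocation, assuming the image tag used by the wrapper; `<project-dir>` and the language key `en` are placeholders, not values from the patch.

```python
# Hypothetical manual invocation equivalent to what wrapper/nlp assembles.
import os
import subprocess

image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0'
project_dir = os.path.abspath('<project-dir>')  # placeholder

cmd = [
    'docker', 'run', '--rm', '-it',
    '-u', f'{os.getuid()}:{os.getgid()}',
    '-v', f'{project_dir}/input:/input',
    '-v', f'{project_dir}/output:/output',
    image,
    '-i', '/input',
    '-o', '/output',
    '-m', 'en'  # any key of SPACY_MODELS in the nlp script
]
subprocess.run(cmd)
```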
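For orientation on the `stand_off_data.py` rewrite earlier in this patch: the new `StandOffData.to_vrt` replaces the per-token buffer scans with a single left-to-right sweep over character positions, emitting structural closing tags before opening tags and jumping from a token's start to its end. A condensed sketch of that sweep using simplified `(start, end, ...)` tuples instead of the patch's annotation classes; the data shapes here are illustrative only.

```python
# Simplified sweep over positional (token) and structural (s, ent) spans,
# analogous to the rewritten to_vrt(); data shapes are illustrative.
from xml.sax.saxutils import escape

def spans_to_vrt(text, tokens, structs):
    # tokens: sorted (start, end, vrt_line) tuples; assumed non-overlapping,
    # with end > start. structs: (start, end, name) tuples, e.g. s and ent.
    token_at = {start: (end, line) for start, end, line in tokens}
    opens, closes = {}, {}
    for start, end, name in structs:
        opens.setdefault(start, []).append(name)
        # mirror the patch's insert(0, ...): later-registered spans close first
        closes.setdefault(end, []).insert(0, name)
    vrt, pos = '', 0
    while pos <= len(text):
        for name in closes.pop(pos, []):
            vrt += f'</{escape(name)}>\n'
        for name in opens.pop(pos, []):
            vrt += f'<{escape(name)}>\n'
        if pos in token_at:
            end, line = token_at.pop(pos)
            vrt += line + '\n'
            pos = end  # jump to the token's end, as in the patch
        else:
            pos += 1
    return vrt
```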