Compare commits

..

6 Commits

Author SHA1 Message Date
Patrick Jentsch
a2e8e72e54 Bump spaCy version, bugfixes, codestyle 2022-01-27 16:50:22 +01:00
Patrick Jentsch
29ccfac4f6 optimizations 2021-08-11 16:47:29 +02:00
Patrick Jentsch
0ba0c14b72 First attempt 2021-08-10 14:43:55 +02:00
Patrick Jentsch
66516eeb89 WIP use the new package 2021-08-06 16:50:22 +02:00
Patrick Jentsch
a4b2fc3a65 Create package for stand-off-data-py 2021-07-22 16:59:29 +02:00
Patrick Jentsch
4dea95a108 Preliminary work 2021-07-13 16:31:53 +02:00
10 changed files with 908 additions and 408 deletions

View File

@ -9,7 +9,14 @@ ENV LANG=C.UTF-8
RUN apt-get update \ RUN apt-get update \
&& apt-get install --no-install-recommends --yes \ && apt-get install --no-install-recommends --yes \
wget procps \
python3.7 \
python3-pip \
wget \
&& python3 -m pip install \
chardet \
setuptools \
wheel
# Install the NLP pipeline and its dependencies # # Install the NLP pipeline and its dependencies #
## Install pyFlow ## ## Install pyFlow ##
@ -21,12 +28,12 @@ RUN wget --no-check-certificate --quiet \
&& apt-get install --no-install-recommends --yes \ && apt-get install --no-install-recommends --yes \
python2.7 \ python2.7 \
&& python2.7 setup.py build install \ && python2.7 setup.py build install \
&& cd .. \ && cd - > /dev/null \
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
## Install spaCy ## ## Install spaCy ##
ENV SPACY_VERSION=3.0.5 ENV SPACY_VERSION=3.2.1
RUN apt-get install --no-install-recommends --yes \ RUN apt-get install --no-install-recommends --yes \
python3.7 \ python3.7 \
python3-pip \ python3-pip \
@ -38,17 +45,15 @@ RUN apt-get install --no-install-recommends --yes \
&& pip3 install "spacy==${SPACY_VERSION}" && pip3 install "spacy==${SPACY_VERSION}"
# Only models that include the following components are compatibel: ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,pl_core_news_md,zh_core_web_md"
# lemmatizer, ner, parser, senter, tagger, ENV SPACY_MODELS_VERSION=3.2.0
ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,nl_core_news_md,pl_core_news_md,zh_core_web_md"
ENV SPACY_MODELS_VERSION=3.0.0
RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done
## Further dependencies ## COPY packages .
RUN apt-get install --no-install-recommends --yes \ RUN cd stand-off-data-py \
procps \ && python3 -m pip install . \
zip && cd -
## Install Pipeline ## ## Install Pipeline ##

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,48 +1,41 @@
# NLP - Natural Language Processing # NLP - Natural Language Processing
This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided. This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
## Software used in this pipeline implementation ## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
- Software from Debian Buster's free repositories
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- spaCy (3.0.5): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1 - spaCy (3.2.1): https://github.com/explosion/spaCy/releases/tag/v3.2.1
- spaCy medium sized models (3.0.0): - spaCy medium sized models (3.2.0):
- https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0 - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0 - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0 - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0 - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.0.0 - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0 - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.2.0
## Use this image ## Installation
1. Create input and output directories for the pipeline. 1. Install Docker and Python 3.
``` bash 2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git`
mkdir -p /<my_data_location>/input /<my_data_location>/output 3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0 nlp`
``` 4. Add the wrapper script (`wrapper/nlp` relative to this README file) to your `${PATH}`.
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,output}`.
2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
## Use the Pipeline
1. Place your plain text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
2. Clear your `/<my_data_location>/output` directory.
3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details. 3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details.
``` ```bash
# Option one: Use the wrapper script
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executeable and add it to your ${PATH}
cd /<my_data_location> cd /<my_data_location>
nlp -i input -l <language_code> -o output <optional_pipeline_arguments> nlp \
--input-dir input \
# Option two: Classic Docker style --output-dir output \
docker run \ -m <model_code> <optional_pipeline_arguments>
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v /<my_data_location>/input:/input \
-v /<my_data_location>/output:/output \
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
-i /input \
-l <language_code>
-o /output \
<optional_pipeline_arguments>
``` ```
4. Check your results in the `/<my_data_location>/output` directory. 4. Check your results in the `/<my_data_location>/output` directory.

332
nlp
View File

@ -1,68 +1,141 @@
#!/usr/bin/env python2.7 #!/usr/bin/env python2.7
# coding=utf-8 # coding=utf-8
"""A NLP pipeline for text file processing.""" ''' A NLP pipeline for text file processing. '''
__version__ = '0.1.0'
__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
'Stephan Porada <porada@posteo.de>'
__version__ = '1.0.0'
from argparse import ArgumentParser from argparse import ArgumentParser
from pyflow import WorkflowRunner from pyflow import WorkflowRunner
import multiprocessing import json
import os import os
import sys import sys
SPACY_MODELS = {'de': 'de_core_news_md', SPACY_MODELS = {
'de': 'de_core_news_md',
'en': 'en_core_web_md', 'en': 'en_core_web_md',
'it': 'it_core_news_md', 'it': 'it_core_news_md',
'nl': 'nl_core_news_md', 'nl': 'nl_core_news_md',
'pl': 'pl_core_news_md', 'pl': 'pl_core_news_md',
'zh': 'zh_core_web_md'} 'zh': 'zh_core_web_md'
}
class NLPPipelineJob: class PipelineJob:
"""An NLP pipeline job class '''
NLP pipeline job class.
Each input file of the pipeline is represented as an NLP pipeline job, Each plain text input file of the pipeline is represented as an NLP
which holds all necessary information for the pipeline to process it. pipeline job, which holds all necessary information for the pipeline to
process it.
Arguments: Arguments:
file -- Path to the file file -- Path to the file
output_dir -- Path to a directory, where job results a stored output_dir -- Path to a directory, where job results are stored
""" '''
def __init__(self, file, output_dir): def __init__(self, file, output_dir):
self.file = file self.file = file
self.name = os.path.basename(file).rsplit('.', 1)[0] self.name = os.path.basename(file)[:-4]
self.output_dir = output_dir self.output_dir = output_dir
class NLPPipeline(WorkflowRunner): class NLPWorkflow(WorkflowRunner):
def __init__(self, input_dir, output_dir, check_encoding, lang, zip): def __init__(self, job, model, check_encoding=False, id_prefix=''):
self.job = job
self.model = model
self.check_encoding = check_encoding
self.id_prefix = id_prefix
def workflow(self):
'''
' ##################################################
' # spacy #
' ##################################################
'''
n_cores = 1
mem_mb = min(1024, self.getMemMb())
cmd = 'spacy-nlp'
cmd += ' --input-file "{}"'.format(self.job.file)
cmd += ' --output-file "{}"'.format(
os.path.join(self.job.output_dir, '{}.json'.format(self.job.name))
)
cmd += ' -m "{}"'.format(self.model)
if self.check_encoding:
cmd += ' --check-encoding'
cmd += ' --id-prefix "{}"'.format(self.id_prefix)
self.addTask(
'spacy',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class CreateVrtWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
def workflow(self):
'''
' ##################################################
' # vrt-creator #
' ##################################################
'''
n_cores = 1
mem_mb = min(256, self.getMemMb())
cmd = 'vrt-creator'
cmd += ' --stand-off-data-file "{}"'.format(
os.path.join(self.job.output_dir, '{}.json'.format(self.job.name))
)
cmd += ' --text-file "{}"'.format(self.job.file)
cmd += ' --output-file "{}"'.format(
os.path.join(self.job.output_dir, '{}.vrt'.format(self.job.name))
)
self.addTask(
'vrt_creator',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class MainWorkflow(WorkflowRunner):
def __init__(
self,
input_dir,
model,
output_dir,
check_encoding=False,
id_prefix=''
):
self.input_dir = input_dir self.input_dir = input_dir
self.model = model
self.output_dir = output_dir self.output_dir = output_dir
self.check_encoding = check_encoding self.check_encoding = check_encoding
self.lang = lang self.id_prefix = id_prefix
self.zip = zip self.jobs = []
self.jobs = collect_jobs(self.input_dir, self.output_dir)
def collect_jobs(self):
self.jobs = []
for file in os.listdir(self.input_dir):
if os.path.isdir(os.path.join(self.input_dir, file)):
continue
if not file.lower().endswith('.txt'):
continue
job = PipelineJob(
os.path.join(self.input_dir, file),
os.path.join(self.output_dir, file)
)
self.jobs.append(job)
def workflow(self): def workflow(self):
if not self.jobs: if not self.jobs:
return return
''' # Create output and temporary directories
' ################################################## for job in self.jobs:
' # setup output directory # os.mkdir(job.output_dir)
' ##################################################
'''
setup_output_directory_tasks = []
for i, job in enumerate(self.jobs):
cmd = 'mkdir -p "{}"'.format(job.output_dir)
lbl = 'setup_output_directory_-_{}'.format(i)
task = self.addTask(command=cmd, label=lbl)
setup_output_directory_tasks.append(task)
''' '''
' ################################################## ' ##################################################
@ -70,104 +143,116 @@ class NLPPipeline(WorkflowRunner):
' ################################################## ' ##################################################
''' '''
nlp_tasks = [] nlp_tasks = []
n_cores = max(1, int(self.getNCores() / len(self.jobs)))
mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
for i, job in enumerate(self.jobs): for i, job in enumerate(self.jobs):
output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name)) # noqa task = self.addWorkflowTask(
cmd = 'spacy-nlp' 'nlp_-_{}'.format(i),
cmd += ' -l "{}"'.format(self.lang) NLPWorkflow(
cmd += ' --check-encoding' if self.check_encoding else '' job,
cmd += ' "{}"'.format(job.file) self.model,
cmd += ' "{}"'.format(output_file) check_encoding=self.check_encoding,
deps = 'setup_output_directory_-_{}'.format(i) id_prefix=self.id_prefix
lbl = 'nlp_-_{}'.format(i) )
task = self.addTask(command=cmd, dependencies=deps, label=lbl, )
memMb=mem_mb, nCores=n_cores)
nlp_tasks.append(task) nlp_tasks.append(task)
''' '''
' ################################################## ' ##################################################
' # vrt creation # ' # create vrt #
' ################################################## ' ##################################################
''' '''
vrt_creation_tasks = [] create_vrt_tasks = []
for i, job in enumerate(self.jobs): for i, job in enumerate(self.jobs):
output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name)) # noqa task = self.addWorkflowTask(
nlp_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name)) # noqa 'create_vrt_-_{}'.format(i),
cmd = 'vrt-creator' CreateVrtWorkflow(job),
cmd += ' "{}"'.format(job.file) dependencies='nlp_-_{}'.format(i)
cmd += ' "{}"'.format(nlp_file) )
cmd += ' "{}"'.format(output_file) create_vrt_tasks.append(task)
deps = 'nlp_-_{}'.format(i)
lbl = 'vrt_creation_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
vrt_creation_tasks.append(task)
''' self.waitForTasks()
' ################################################## outputs = []
' # zip creation # for job in self.jobs:
' ################################################## # Track output files
''' relative_output_dir = os.path.relpath(
zip_creation_tasks = [] job.output_dir,
if self.zip is not None: start=self.output_dir
cmd = 'cd "{}"'.format(self.output_dir) )
cmd += ' && ' outputs.append(
cmd += 'zip' {
cmd += ' -r' 'description': 'JSON stand off data',
cmd += ' "{}.zip" .'.format(self.zip) 'file': os.path.join(
cmd += ' -x "pyflow.data*"' relative_output_dir,
cmd += ' -i "*.vrt" "*.json"' '{}.json'.format(job.name)
cmd += ' && ' ),
cmd += 'cd -' 'mimetype': 'application/json'
deps = vrt_creation_tasks }
lbl = 'zip_creation' )
task = self.addTask(command=cmd, dependencies=deps, label=lbl) outputs.append(
zip_creation_tasks.append(task) {
'description': 'CWB vrt file',
'file': os.path.join(
def collect_jobs(input_dir, output_dir): relative_output_dir,
jobs = [] '{}.vrt'.format(job.name)
for file in os.listdir(input_dir): ),
if os.path.isdir(os.path.join(input_dir, file)): 'mimetype': 'application/vrt+xml'
continue }
if file.lower().endswith('.txt'): )
job = NLPPipelineJob(os.path.join(input_dir, file), with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
os.path.join(output_dir, file)) json.dump(outputs, f, indent=4)
jobs.append(job)
return jobs
def parse_args(): def parse_args():
parser = ArgumentParser(description='NLP pipeline for TXT file processing', parser = ArgumentParser(
prog='NLP pipeline') description='NLP pipeline for plain text file processing'
parser.add_argument('-i', '--input-dir', )
parser.add_argument(
'-i', '--input-dir',
help='Input directory', help='Input directory',
required=True) required=True
parser.add_argument('-o', '--output-dir', )
parser.add_argument(
'-o', '--output-dir',
help='Output directory', help='Output directory',
required=True) required=True
parser.add_argument('-l', '--language', )
parser.add_argument(
'-m', '--model',
choices=SPACY_MODELS.keys(), choices=SPACY_MODELS.keys(),
help='Language of the input (2-character ISO 639-1 language codes)', # noqa help='The model to be used',
required=True) required=True
parser.add_argument('--check-encoding', )
parser.add_argument(
'--check-encoding',
action='store_true', action='store_true',
help='Check encoding of the input file, UTF-8 is used instead') # noqa help='Check encoding of the input file, UTF-8 is used instead'
parser.add_argument('--log-dir', )
help='Logging directory') parser.add_argument(
parser.add_argument('--mem-mb', '--id-prefix',
help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))', # noqa default='',
type=int) help='A prefix for all the ids within the stand off annotations'
parser.add_argument('--n-cores', )
default=min(4, multiprocessing.cpu_count()), parser.add_argument(
help='Number of CPU threads to be used (Default: min(4, number of CPUs))', # noqa '--log-dir',
type=int) help='Logging directory (Default: --output-dir)'
parser.add_argument('--zip', )
help='Create one zip file per filetype') parser.add_argument(
parser.add_argument('-v', '--version', '--mem-mb',
help='Amount of system memory to be used '
'(Default: min(--n-cores * 1024, available system memory))',
type=int
)
parser.add_argument(
'--n-cores',
default=1,
help='Number of CPU threads to be used',
type=int
)
parser.add_argument(
'-v', '--version',
action='version', action='version',
help='Returns the current version of the NLP pipeline', help='Returns the current version of the NLP pipeline',
version='%(prog)s {}'.format(__version__)) version='%(prog)s {}'.format(__version__)
)
args = parser.parse_args() args = parser.parse_args()
# Set some tricky default values and check for insufficient input # Set some tricky default values and check for insufficient input
@ -177,20 +262,27 @@ def parse_args():
raise Exception('--n-cores must be greater or equal 1') raise Exception('--n-cores must be greater or equal 1')
if args.mem_mb is None: if args.mem_mb is None:
max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0]) max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
args.mem_mb = min(args.n_cores * 2048, max_mem_mb) args.mem_mb = min(args.n_cores * 1024, max_mem_mb)
if args.mem_mb < 2048: if args.mem_mb < 1024:
raise Exception('--mem-mb must be greater or equal 2048') raise Exception('--mem-mb must be greater or equal 1024')
if args.zip is not None and args.zip.lower().endswith('.zip'):
# Remove .zip file extension if provided
args.zip = args.zip[:-4]
args.zip = args.zip if args.zip else 'output'
return args return args
def main(): def main():
args = parse_args() args = parse_args()
nlp_pipeline = NLPPipeline(args.input_dir, args.output_dir, args.check_encoding, args.language, args.zip) # noqa main_workflow = MainWorkflow(
retval = nlp_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa args.input_dir,
args.model,
args.output_dir,
check_encoding=args.check_encoding,
id_prefix=args.id_prefix
)
main_workflow.collect_jobs()
retval = main_workflow.run(
dataDirRoot=args.log_dir,
memMb=args.mem_mb,
nCores=args.n_cores
)
sys.exit(retval) sys.exit(retval)

View File

@ -0,0 +1,14 @@
import setuptools
# Register the stand-off data library with setuptools. Running this script
# passes the metadata below to setuptools.setup().
setuptools.setup(
    name='Stand off data',
    author='Patrick Jentsch',
    author_email='p.jentsch@uni-bielefeld.de',
    description='A python library to handle stand off data.',
    # NOTE(review): py_modules declares a single top-level module named
    # stand_off_data. If the code is laid out as a package directory
    # (an __init__ importing from a models submodule), `packages=` would be
    # needed instead -- confirm against the repository layout.
    py_modules=['stand_off_data'],
    classifiers=[
        'Programming Language :: Python :: 3',
        'Operating System :: OS Independent',
    ],
    # Minimum interpreter version declared for installation.
    python_requires='>=3.5'
)

View File

@ -0,0 +1,282 @@
from xml.sax.saxutils import escape
class StandOffData:
    '''
    Container for stand-off annotations of a text.

    Holds free-form meta data, a lookup table of tag definitions (keyed by
    tag id) and a list of tag annotations, and can render the annotations
    together with the annotated text as CWB verticalized text (.vrt).

    Arguments:
    attrs -- Optional dict. The keys read are 'meta', 'tags' (a list of tag
             definition dicts) and 'annotations' (a list of tag annotation
             dicts). Missing keys are treated as empty.
    '''

    def __init__(self, attrs=None):
        # Avoid a shared mutable default argument.
        attrs = {} if attrs is None else attrs
        self.meta = attrs.get('meta', {})
        self.lookup = {}
        for tag_attrs in attrs.get('tags', []):
            self.add_tag_definition(tag_attrs)
        self.annotations = [
            TagAnnotation(annotation_attrs, self.lookup)
            for annotation_attrs in attrs.get('annotations', [])
        ]

    def add_tag_definition(self, attrs):
        '''
        Create a tag definition from attrs and register it by its id.

        Raises an Exception if the id is already registered.
        '''
        tag_definition = TagDefinition(attrs)
        if tag_definition.id in self.lookup:
            # Report the offending definition. self.to_dict() can not be used
            # here: this method runs inside __init__ before self.annotations
            # exists, which turned the intended error into an AttributeError.
            raise Exception(
                f'Tag id already in use: {tag_definition.to_dict()}'
            )
        self.lookup[tag_definition.id] = tag_definition

    def to_dict(self):
        ''' Return meta data, tag definitions and annotations as a dict. '''
        return {
            'meta': self.meta,
            'lookup': {k: v.to_dict() for k, v in self.lookup.items()},
            'annotations': [x.to_dict() for x in self.annotations]
        }

    def to_vrt(self, text):
        '''
        Render the annotations of text as CWB verticalized text (.vrt).

        Annotations named 'token' become positional attribute lines with the
        columns word, pos, lemma and simple_pos (missing values are written
        as 'None'); whitespace-only tokens are skipped. All other
        annotations become structural attribute tags.

        Arguments:
        text -- The text the annotation offsets refer to

        Raises an Exception if two token annotations overlap.
        '''
        # Divide annotations into CWB's verticalized text format (.vrt) logic
        p_attrs = []  # positional attributes
        s_attrs = []  # structural attributes
        for annotation in self.annotations:
            if annotation.name == 'token':
                p_attrs.append(annotation)
            else:
                s_attrs.append(annotation)
        # Sort annotations, necessary for the next checks
        p_attrs.sort()
        s_attrs.sort()
        # Check for p_attr<->p_attr overlap
        for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
            starts_within = next_p_attr.start <= p_attr.start < next_p_attr.end
            ends_within = next_p_attr.start < p_attr.end <= next_p_attr.end
            if starts_within or ends_within:
                raise Exception(
                    'Positional attribute overlaps another: '
                    f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}'
                )
        # Check for s_attr<->p_attr overlap: structural attributes that start
        # or end inside a token are widened to the token's boundaries.
        for s_attr in s_attrs:
            for p_attr in p_attrs:
                if p_attr.end <= s_attr.start:
                    # Token lies entirely before the s_attr. Later tokens may
                    # still overlap, so keep scanning (the previous code
                    # stopped here and never reached the relevant tokens).
                    continue
                if p_attr.start >= s_attr.end:
                    # Token lies entirely after the s_attr. No further
                    # checking needed (because p_attrs are sorted).
                    break
                if p_attr.start < s_attr.start < p_attr.end:
                    # s_attr starts within p_attr: move start to token start
                    s_attr.start = p_attr.start
                if p_attr.start < s_attr.end < p_attr.end:
                    # s_attr ends within p_attr: move end to token end
                    s_attr.end = p_attr.end
        # Index tokens by start offset and s_attrs by start/end offset
        p_attr_buffer = {p_attr.start: i for i, p_attr in enumerate(p_attrs)}
        s_attr_start_buffer = {}
        s_attr_end_buffer = {}
        for i, s_attr in enumerate(s_attrs):
            s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
            # Closing tags are emitted in reverse order of the sorted s_attrs
            s_attr_end_buffer.setdefault(s_attr.end, []).insert(0, i)
        # Collect output fragments and join once at the end
        parts = ['<text>\n']
        current_position = 0
        text_len = len(text)
        while current_position <= text_len:
            # s_attr endings
            for s_attr_index in s_attr_end_buffer.pop(current_position, ()):
                s_attr = s_attrs[s_attr_index]
                parts.append(f'</{escape(s_attr.name)}>\n')
            # s_attr starts
            for s_attr_index in s_attr_start_buffer.pop(current_position, ()):
                s_attr = s_attrs[s_attr_index]
                parts.append(f'<{escape(s_attr.name)}')
                for prop in s_attr.properties:
                    parts.append(
                        f' {escape(prop.name)}="{escape(str(prop.value))}"'
                    )
                parts.append('>\n')
            # p_attrs
            if current_position not in p_attr_buffer:
                current_position += 1
                continue
            p_attr = p_attrs[p_attr_buffer.pop(current_position)]
            if text[p_attr.start:p_attr.end].isspace():
                # Whitespace tokens produce no line, just skip over them
                current_position = p_attr.end
                continue
            columns = {
                'lemma': 'None',
                'pos': 'None',
                'simple_pos': 'None',
                'word': 'None'
            }
            for prop in p_attr.properties:
                if prop.name in columns:
                    columns[prop.name] = escape(str(prop.value))
            # The word column always comes from the text itself
            columns['word'] = escape(text[p_attr.start:p_attr.end])
            parts.append(
                '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**columns)
            )
            current_position = p_attr.end
        parts.append('</text>\n')
        return ''.join(parts)
class TagAnnotation:
    '''
    A single annotation: one tag applied to a span of the text.

    Arguments:
    attrs -- Dict with the keys 'tag_id', 'start' and 'end' and an optional
             list of property annotation dicts under 'properties'
    lookup -- Dict mapping tag ids to their tag definitions

    Raises an Exception if the tag id is not in lookup or if end is less
    than start.
    '''

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.tag_id = attrs['tag_id']
        self.start = attrs['start']
        self.end = attrs['end']
        self.properties = []
        ''' Sanity checks '''
        # The tag must be validated before its definition is dereferenced,
        # otherwise an unknown tag surfaces as a bare KeyError.
        if self.tag_id not in self.lookup:
            raise Exception(f'Unknown tag: {self.to_dict()}')
        property_lookup = self.lookup[self.tag_id].properties
        self.properties = [
            PropertyAnnotation(x, property_lookup)
            for x in attrs.get('properties', [])
        ]
        if self.end < self.start:
            raise Exception(f'Annotation end less then start: {self.to_dict()}')  # noqa
        # NOTE: a check for required properties existed here but is disabled:
        # property_ids = [x.property_id for x in self.properties]
        # for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items():  # noqa
        #     if required_property_id not in property_ids:
        #         raise Exception(
        #             f'Missing required property: {required_property.to_dict()}'
        #         )

    @property
    def name(self):
        ''' Name of the tag definition this annotation refers to. '''
        return self.lookup[self.tag_id].name

    def to_dict(self):
        ''' Return the annotation as a dict. '''
        return {
            'tag_id': self.tag_id,
            'start': self.start,
            'end': self.end,
            'properties': [x.to_dict() for x in self.properties]
        }

    # Ordering: by start offset; at the same start, non-token annotations
    # come before 'token' annotations, and among annotations of the same
    # kind the one with the greater end offset comes first.

    def __lt__(self, other):
        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return True
            else:
                return self.end > other.end
        else:
            return self.start < other.start

    def __le__(self, other):
        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return True
            else:
                return self.end >= other.end
        else:
            return self.start <= other.start

    def __eq__(self, other):
        if self.start == other.start:
            if self.name == 'token' and other.name != 'token':
                return False
            elif self.name != 'token' and other.name == 'token':
                return False
            else:
                return self.end == other.end
        else:
            return False

    def __ne__(self, other):
        return not self == other

    def __gt__(self, other):
        return not self <= other

    def __ge__(self, other):
        return not self < other
class PropertyAnnotation:
    '''
    A property value attached to a tag annotation.

    Arguments:
    attrs -- Dict with the keys 'property_id' and 'value'
    lookup -- Dict mapping property ids to their property definitions

    Raises an Exception if the property id is not in lookup.
    '''

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.property_id = attrs['property_id']
        self.value = attrs['value']
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
        ''' Sanity checks '''
        if self.property_id not in self.lookup:
            raise Exception(f'Unknown property: {self.to_dict()}')

    @property
    def name(self):
        ''' Name of the property definition this annotation refers to. '''
        return self.lookup[self.property_id].name

    def to_dict(self):
        ''' Return the property annotation as a dict. '''
        # Only attributes this object actually carries are serialized. The
        # previous version also read self.tag_id, which is never assigned,
        # so every call (including the error path above) raised
        # AttributeError.
        return {
            'property_id': self.property_id,
            'value': self.value
        }
class TagDefinition:
    '''
    Definition of a tag and the properties its annotations may carry.

    Arguments:
    attrs -- Dict with the keys 'id' and 'name', an optional 'description'
             and an optional list of property definition dicts under
             'properties'
    '''

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        self.properties = {}
        for property_attrs in attrs.get('properties', []):
            self.add_property_definition(property_attrs)

    def add_property_definition(self, attrs):
        '''
        Create a property definition from attrs and register it by its id.

        Raises an Exception if the id is already registered.
        '''
        definition = PropertyDefinition(attrs)
        if definition.id not in self.properties:
            self.properties[definition.id] = definition
            return
        raise Exception(
            f'Property id already in use: {definition.to_dict()}')

    # @property
    # def required_properties(self):
    #     return {property.id: property for property in self.properties.values()
    #             if property.is_required}

    def to_dict(self):
        ''' Return the tag definition as a dict. '''
        serialized_properties = {
            property_id: definition.to_dict()
            for property_id, definition in self.properties.items()
        }
        return {
            'id': self.id,
            'name': self.name,
            'description': self.description,
            'properties': serialized_properties
        }
class PropertyDefinition:
    '''
    Definition of a property that annotations of a tag may carry.

    Arguments:
    attrs -- Dict with the keys 'id' and 'name' and the optional keys
             'description', 'flags' and 'labels'
    '''

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        # Optional fields fall back to an empty string / empty lists
        optional_defaults = (('description', ''), ('flags', []), ('labels', []))
        for field, default in optional_defaults:
            setattr(self, field, attrs.get(field, default))

    # @property
    # def is_required(self):
    #     return 'required' in self.flags

    @property
    def has_multiple_values(self):
        ''' True if the 'multiple' flag is set for this property. '''
        return 'multiple' in self.flags

    def to_dict(self):
        ''' Return the property definition as a dict. '''
        fields = ('id', 'name', 'description', 'flags', 'labels')
        return {field: getattr(self, field) for field in fields}

View File

@ -0,0 +1,2 @@
# flake8: noqa
from .models import StandOffData

376
spacy-nlp
View File

@ -8,41 +8,67 @@ import json
import os import os
import spacy import spacy
import textwrap import textwrap
import uuid
spacy_models = {spacy.info(pipeline)['lang']: pipeline spacy_models = {
for pipeline in spacy.info()['pipelines']} spacy.info(pipeline)['lang']: pipeline
for pipeline in spacy.info()['pipelines']
}
# Parse the given arguments # Parse the given arguments
parser = ArgumentParser(description='Create annotations for a given txt file') parser = ArgumentParser(
parser.add_argument('input', help='Path to txt input file') description='Create annotations for a given plain txt file'
parser.add_argument('output', help='Path to JSON output file') )
parser.add_argument('-l', '--language', parser.add_argument(
'-i', '--input-file',
help='Input file'
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
parser.add_argument(
'-m', '--model',
choices=spacy_models.keys(), choices=spacy_models.keys(),
help='Language of the input (2-character ISO 639-1 language codes)', # noqa help='The model to be used',
required=True) required=True
parser.add_argument('-c', '--check-encoding', )
parser.add_argument(
'-c', '--check-encoding',
action='store_true', action='store_true',
help='Check encoding of the input file, UTF-8 is used instead') # noqa help='Check encoding of the input file, UTF-8 is used instead'
)
parser.add_argument(
'--id-prefix',
default='',
help='A prefix for all the ids within the stand off annotations'
)
args = parser.parse_args() args = parser.parse_args()
with open(args.input, "rb") as text_file:
def generate_id(name):
return f'{args.id_prefix}{uuid.uuid3(uuid.NAMESPACE_DNS, name)}'
with open(args.input_file, "rb") as input_file:
if args.check_encoding: if args.check_encoding:
encoding = chardet.detect(text_file.read())['encoding'] encoding = chardet.detect(input_file.read())['encoding']
else: else:
encoding = 'utf-8' encoding = 'utf-8'
text_file.seek(0) input_file.seek(0)
text_md5 = hashlib.md5() text_md5 = hashlib.md5()
for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
text_md5.update(chunk) text_md5.update(chunk)
# Load the text contents from the input file # Load the text contents from the input file
with open(args.input, encoding=encoding) as text_file: with open(args.input_file, encoding=encoding) as input_file:
# spaCy NLP is limited to strings with a maximum of 1 million characters at # spaCy NLP is limited to strings with a maximum of 1 million characters at
# once. So we split it into suitable chunks. # once. So we split it into suitable chunks.
text_chunks = textwrap.wrap( text_chunks = textwrap.wrap(
text_file.read(), input_file.read(),
1000000, 1000000,
break_long_words=False, break_long_words=False,
break_on_hyphens=False, break_on_hyphens=False,
@ -51,84 +77,197 @@ with open(args.input, encoding=encoding) as text_file:
replace_whitespace=False replace_whitespace=False
) )
model = spacy_models[args.language] model_name = spacy_models[args.model]
nlp = spacy.load(model) nlp = spacy.load(model_name)
meta = { meta = {
'generator': { 'generator': {
'name': 'nopaque NLP service', 'name': 'nopaque spacy NLP',
'version': '1.0.0', 'version': '0.1.0',
'arguments': { 'arguments': {
'check_encoding': args.check_encoding, 'check_encoding': args.check_encoding,
'language': args.language 'model': args.model
} }
}, },
'file': { 'file': {
'encoding': encoding, 'encoding': encoding,
'md5': text_md5.hexdigest(), 'md5': text_md5.hexdigest(),
'name': os.path.basename(args.input) 'name': os.path.basename(args.input_file)
} }
} }
tags = []
tags = { token = {
'token': { 'id': generate_id('token'),
'description': '', 'name': 'token',
'properties': { 'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.', # noqa
'lemma': { 'properties': []
'description': 'The base form of the word', }
'flags': ['required'], # TODO: Check if all languages support token.sentiment
'tagset': None token['properties'].append(
}, {
'pos': { 'id': generate_id('token.sentiment'),
'description': 'The detailed part-of-speech tag', 'name': 'sentiment',
'flags': ['required'], 'description': 'A scalar value indicating the positivity or negativity of the token.' # noqa
'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']} # noqa }
}, )
'simple_pos': { if nlp.has_pipe('lemmatizer'):
token['properties'].append(
{
'id': generate_id('token.lemma'),
'name': 'lemma',
'description': 'The base form of the word'
}
)
if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
token['properties'].append(
{
'id': generate_id('token.simple_pos'),
'name': 'simple_pos',
'description': 'The simple UPOS part-of-speech tag', 'description': 'The simple UPOS part-of-speech tag',
'flags': ['required'], 'labels': [
'tagset': { {
'ADJ': 'adjective', 'id': generate_id('token.simple_pos=ADJ'),
'ADP': 'adposition', 'name': 'ADJ',
'ADV': 'adverb', 'description': 'adjective'
'AUX': 'auxiliary verb',
'CONJ': 'coordinating conjunction',
'DET': 'determiner',
'INTJ': 'interjection',
'NOUN': 'noun',
'NUM': 'numeral',
'PART': 'particle',
'PRON': 'pronoun',
'PROPN': 'proper noun',
'PUNCT': 'punctuation',
'SCONJ': 'subordinating conjunction',
'SYM': 'symbol',
'VERB': 'verb',
'X': 'other'
}
}, },
'ner': { {
'description': 'Label indicating the type of the entity', 'id': generate_id('token.simple_pos=ADP'),
'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']} # noqa 'name': 'ADP',
} 'description': 'adposition'
}
}, },
's': { {
'description': 'Encodes the start and end of a sentence', 'id': generate_id('token.simple_pos=ADV'),
'properties': None 'name': 'ADV',
'description': 'adverb'
}, },
'ent': { {
'id': generate_id('token.simple_pos=AUX'),
'name': 'AUX',
'description': 'auxiliary verb'
},
{
'id': generate_id('token.simple_pos=CONJ'),
'name': 'CONJ',
'description': 'coordinating conjunction'
},
{
'id': generate_id('token.simple_pos=DET'),
'name': 'DET',
'description': 'determiner'
},
{
'id': generate_id('token.simple_pos=INTJ'),
'name': 'INTJ',
'description': 'interjection'
},
{
'id': generate_id('token.simple_pos=NOUN'),
'name': 'NOUN',
'description': 'noun'
},
{
'id': generate_id('token.simple_pos=NUM'),
'name': 'NUM',
'description': 'numeral'
},
{
'id': generate_id('token.simple_pos=PART'),
'name': 'PART',
'description': 'particle'
},
{
'id': generate_id('token.simple_pos=PRON'),
'name': 'PRON',
'description': 'pronoun'
},
{
'id': generate_id('token.simple_pos=PROPN'),
'name': 'PROPN',
'description': 'proper noun'
},
{
'id': generate_id('token.simple_pos=PUNCT'),
'name': 'PUNCT',
'description': 'punctuation'
},
{
'id': generate_id('token.simple_pos=SCONJ'),
'name': 'SCONJ',
'description': 'subordinating conjunction'
},
{
'id': generate_id('token.simple_pos=SYM'),
'name': 'SYM',
'description': 'symbol'
},
{
'id': generate_id('token.simple_pos=VERB'),
'name': 'VERB',
'description': 'verb'
},
{
'id': generate_id('token.simple_pos=X'),
'name': 'X',
'description': 'other'
}
]
}
)
if nlp.has_pipe('tagger'):
token['properties'].append(
{
'id': generate_id('token.pos'),
'name': 'pos',
'description': 'The detailed part-of-speech tag',
'labels': [
{
'id': generate_id(f'token.pos={label}'),
'name': label,
'description': spacy.explain(label) or ''
} for label in spacy.info(model_name)['labels']['tagger']
]
}
)
if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
tags.append(
{
'id': generate_id('ent'),
'name': 'ent',
'description': 'Encodes the start and end of a named entity', 'description': 'Encodes the start and end of a named entity',
'properties': { 'properties': [
'type': { {
'id': generate_id('ent.type'),
'name': 'type',
'description': 'Label indicating the type of the entity', 'description': 'Label indicating the type of the entity',
'flags': ['required'], 'labels': [
'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']} # noqa {
'id': generate_id('ent.type={}'.format(label)),
'name': label,
'description': spacy.explain(label) or ''
} for label in spacy.info(model_name)['labels']['ner']
]
} }
]
} }
)
if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'): # noqa
# TODO: Check if all languages support sent.sentiment
tags.append(
{
'id': generate_id('s'),
'name': 's',
'description': 'Encodes the start and end of a sentence',
'properties': [
{
'id': generate_id('s.sentiment'),
'name': 'sentiment',
'description': 'A scalar value indicating the positivity or negativity of the sentence.' # noqa
} }
]
} }
)
tags.append(token)
annotations = [] annotations = []
@ -136,37 +275,78 @@ chunk_offset = 0
while text_chunks: while text_chunks:
text_chunk = text_chunks.pop(0) text_chunk = text_chunks.pop(0)
doc = nlp(text_chunk) doc = nlp(text_chunk)
for token in doc: if hasattr(doc, 'ents'):
if token.is_space: for ent in doc.ents:
continue annotation = {
if token.is_sent_start: 'start': ent.start_char + chunk_offset,
annotation = {'start': token.sent.start_char + chunk_offset,
'end': token.sent.end_char + chunk_offset,
'tag': 's'}
annotations.append(annotation)
# Check if the token is the start of an entity
if token.ent_iob == 3:
for ent_candidate in token.sent.ents:
if ent_candidate.start_char == token.idx:
ent = ent_candidate
annotation = {'start': ent.start_char + chunk_offset,
'end': ent.end_char + chunk_offset, 'end': ent.end_char + chunk_offset,
'tag': 'ent', 'tag_id': generate_id('ent'),
'properties': {'type': token.ent_type_}} 'properties': [
{
'property_id': generate_id('ent.type'),
'value': ent.label_
}
]
}
annotations.append(annotation) annotations.append(annotation)
break if hasattr(doc, 'sents'):
annotation = {'start': token.idx + chunk_offset, for sent in doc.sents:
annotation = {
'start': sent.start_char + chunk_offset,
'end': sent.end_char + chunk_offset,
'tag_id': generate_id('s'),
'properties': []
}
if hasattr(sent, 'sentiment'):
annotation['properties'].append(
{
'property_id': generate_id('s.sentiment'),
'value': sent.sentiment
}
)
annotations.append(annotation)
for token in doc:
annotation = {
'start': token.idx + chunk_offset,
'end': token.idx + len(token.text) + chunk_offset, 'end': token.idx + len(token.text) + chunk_offset,
'tag': 'token', 'tag_id': generate_id('token'),
'properties': {'pos': token.tag_, 'properties': []
'lemma': token.lemma_, }
'simple_pos': token.pos_}} if hasattr(token, 'lemma_'):
if token.ent_type_: annotation['properties'].append(
annotation['properties']['ner'] = token.ent_type_ {
'property_id': generate_id('token.lemma'),
'value': token.lemma_
}
)
if hasattr(token, 'pos_'):
annotation['properties'].append(
{
'property_id': generate_id('token.simple_pos'),
'value': token.pos_
}
)
if hasattr(token, 'sentiment'):
annotation['properties'].append(
{
'property_id': generate_id('token.sentiment'),
'value': token.sentiment
}
)
if hasattr(token, 'tag_'):
annotation['properties'].append(
{
'property_id': generate_id('token.pos'),
'value': token.tag_
}
)
annotations.append(annotation) annotations.append(annotation)
chunk_offset += len(text_chunk) chunk_offset += len(text_chunk)
text_chunk = None text_chunk = None
with open(args.output, 'w') as output_file: with open(args.output_file, 'w') as output_file:
json.dump({'meta': meta, 'tags': tags, 'annotations': annotations}, json.dump(
output_file, indent=4) {'meta': meta, 'tags': tags, 'annotations': annotations},
output_file,
indent=4
)

View File

@ -2,129 +2,40 @@
# coding=utf-8 # coding=utf-8
from argparse import ArgumentParser from argparse import ArgumentParser
from xml.sax.saxutils import escape from stand_off_data import StandOffData
import hashlib import hashlib
import json import json
parser = ArgumentParser(
# Two global resources - Not very elegant but it works for now description='Convert plain text and JSON stand off to a CWB vrt file'
stand_off_data = None
text = None
def meta_to_string():
string = ''
string += '<generator software="{} ({})" arguments="check_encoding: {}; language: {}"/>\n'.format( # noqa
stand_off_data['meta']['generator']['name'],
stand_off_data['meta']['generator']['version'],
stand_off_data['meta']['generator']['arguments']['check_encoding'],
stand_off_data['meta']['generator']['arguments']['language']
) )
string += '<file encoding="{}" name="{}" md5="{}"/>\n'.format( parser.add_argument(
stand_off_data['meta']['file']['encoding'], '-s', '--stand-off-data-file',
stand_off_data['meta']['file']['name'], help='JSON stand off data input file'
stand_off_data['meta']['file']['md5']
) )
return string parser.add_argument(
'-t', '--text-file',
help='Plain text input file'
def tags_to_string(): )
return '' parser.add_argument(
'-o', '--output-file',
help='Output file',
def annotations_to_string(end=float('inf')): required=True
string = ''
while stand_off_data['annotations']:
if stand_off_data['annotations'][0]['start'] >= end:
break
annotation = stand_off_data['annotations'].pop(0)
#######################################################################
# Check for malformed annotations #
#######################################################################
if 'tag' not in annotation:
raise Exception('Annotation tag is missing')
if annotation['tag'] not in stand_off_data['tags']:
raise Exception('Unknown annotation tag: ' + annotation['tag'])
tag_model = stand_off_data['tags'][annotation['tag']]
if 'properties' in tag_model:
properties_model = tag_model['properties']
if properties_model is not None:
required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model) # noqa
if required_properties and annotation['properties'] is None:
raise Exception('There are required properties but the "Properties" attribute is missing') # noqa
for property in required_properties:
if property not in annotation['properties']:
raise Exception('Required property is missing: ' + property) # noqa
#######################################################################
# Process tokens ~ cwb's positional attributes #
#######################################################################
if annotation['tag'] == 'token':
string += '{}\t{}\t{}\t{}\t{}\n'.format(
escape(text[annotation['start']:annotation['end']]),
escape(annotation['properties']['pos']),
escape(annotation['properties']['lemma']),
escape(annotation['properties']['simple_pos']),
escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None') # noqa
) )
#######################################################################
# Process other tags ~ cwb's structural attributes #
#######################################################################
else:
properties = ''
if 'properties' in annotation and annotation['properties'] is not None: # noqa
for property, value in annotation['properties'].items():
if not value:
continue
if properties_model and property in properties_model:
if 'flags' in properties_model and 'multiple' in properties_model['flags']: # noqa
properties += ' {}="|{}|"'.format(property, '|'.join(value)) # noqa
else:
properties += ' {}="{}"'.format(property, value)
string += '<' + annotation['tag'] + properties + '>\n'
string += annotations_to_string(end=min(annotation['end'], end))
string += '</' + annotation['tag'] + '>\n'
return string
def main():
global stand_off_data
global text
# Parse the given arguments
parser = ArgumentParser(description='Create a vrt from JSON and txt')
parser.add_argument('text', help='Path to txt file')
parser.add_argument('stand_off_data', help='Path to JSON file')
parser.add_argument('output', help='Path to vrt output file')
args = parser.parse_args() args = parser.parse_args()
with open(args.stand_off_data) as stand_of_data_file: with open(args.stand_off_data_file) as stand_of_data_file:
stand_off_data = json.load(stand_of_data_file) stand_off_data = StandOffData(json.load(stand_of_data_file))
with open(args.text, "rb") as text_file: with open(args.text_file, "rb") as text_file:
text_md5 = hashlib.md5() text_md5 = hashlib.md5()
for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): # noqa for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
text_md5.update(chunk) text_md5.update(chunk)
if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']: if text_md5.hexdigest() != stand_off_data.meta['file']['md5']:
raise Exception('md5 not equal') raise Exception('md5 not equal')
with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file: # noqa with open(args.text_file, encoding=stand_off_data.meta['file']['encoding']) as text_file: # noqa
text = text_file.read() text = text_file.read()
vrt = '' with open(args.output_file, 'w') as vrt_file:
vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n' vrt_file.write(stand_off_data.to_vrt(text))
vrt += '<corpus>\n'
vrt += '<text>\n'
vrt += meta_to_string()
vrt += tags_to_string()
vrt += annotations_to_string()
vrt += '</text>\n'
vrt += '</corpus>'
with open(args.output, 'w') as vrt_file:
vrt_file.write(vrt)
if __name__ == '__main__':
main()

View File

@ -6,7 +6,7 @@ import os
import subprocess import subprocess
import sys import sys
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0b' CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0'
CONTAINER_INPUT_DIR = '/input' CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output' CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_LOG_DIR = '/logs' CONTAINER_LOG_DIR = '/logs'
@ -19,17 +19,17 @@ parser.add_argument('-o', '--output-dir')
parser.add_argument('--log-dir') parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args() args, remaining_args = parser.parse_known_args()
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
if args.input_dir is not None: if args.input_dir is not None:
mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
cmd += ['-v', mapping] cmd += ['-v', mapping]
remaining_args += ['-i', CONTAINER_INPUT_DIR] remaining_args += ['-i', CONTAINER_INPUT_DIR]
if args.output_dir is not None: if args.output_dir is not None:
mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', mapping] cmd += ['-v', mapping]
remaining_args += ['-o', CONTAINER_OUTPUT_DIR] remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.log_dir is not None: if args.log_dir is not None:
mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
cmd += ['-v', mapping] cmd += ['-v', mapping]
remaining_args += ['--log-dir', CONTAINER_LOG_DIR] remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE) cmd.append(CONTAINER_IMAGE)