Compare commits


64 Commits
1.0 ... master

Author SHA1 Message Date
Patrick Jentsch
6f0545e48b Mark required arguments in scripts as required 2022-02-03 10:40:25 +01:00
Patrick Jentsch
a2e8e72e54 Bump spaCy version, bugfixes, codestyle 2022-01-27 16:50:22 +01:00
Patrick Jentsch
29ccfac4f6 optimizations 2021-08-11 16:47:29 +02:00
Patrick Jentsch
0ba0c14b72 First attempt 2021-08-10 14:43:55 +02:00
Patrick Jentsch
66516eeb89 WIP use the new package 2021-08-06 16:50:22 +02:00
Patrick Jentsch
a4b2fc3a65 Create package for stand-off-data-py 2021-07-22 16:59:29 +02:00
Patrick Jentsch
4dea95a108 Preliminary work 2021-07-13 16:31:53 +02:00
Patrick Jentsch
5139fd9727 Fix problem where encoding is not set 2021-06-22 12:46:01 +02:00
Patrick Jentsch
fd39246e4b Update file handling. Now md5 is correct 2021-05-18 10:26:03 +02:00
Patrick Jentsch
bd5d8ddedb Fix problems caused by wrong textwrap.wrap usage 2021-04-30 09:44:35 +02:00
Patrick Jentsch
f7b7da2b1f restrict memory usage for nlp tasks 2021-04-22 08:46:28 +02:00
Patrick Jentsch
2813d1a222 Fix long text processing 2021-04-22 08:43:34 +02:00
Patrick Jentsch
cd976692d6 Don't process files in subdirectories 2021-04-12 13:24:31 +02:00
Patrick Jentsch
4e7669d009 Return the returncode 2021-04-12 09:26:21 +02:00
Patrick Jentsch
8105edfd1b Add missing argument to wrapper script 2021-04-12 09:20:28 +02:00
Patrick Jentsch
72409bd12d Fix race condition 2021-03-26 14:48:38 +01:00
Patrick Jentsch
54f336e620 Fix permissions 2021-03-26 10:09:45 +01:00
Patrick Jentsch
3b570e5df1 more pipeline help tweaks 2021-03-26 10:02:14 +01:00
Patrick Jentsch
dc62755d12 Update README and pipeline help 2021-03-26 10:01:51 +01:00
Patrick Jentsch
aa1bfa259d Use JSON files for stand-off annotations. 2021-03-26 09:46:17 +01:00
Patrick Jentsch
d620c29f27 Fix version 1.0.0 2021-02-25 11:26:11 +01:00
Patrick Jentsch
2ced38504c Use "buster" instead of "10" in FROM 2020-10-08 23:17:58 +02:00
Patrick Jentsch
f02c0953bf Use new Dockerfile structure 2020-10-08 23:08:49 +02:00
Patrick Jentsch
5329446277 Update CI script 2020-10-07 17:09:09 +02:00
Patrick Jentsch
15e373db58 fix gitlab ci 2020-09-23 16:53:16 +02:00
Patrick Jentsch
8afdfb13b2 Use smaller models 2020-09-23 15:46:43 +02:00
Patrick Jentsch
1ed42f68ad Remove clean stage from stages 2020-09-23 15:27:31 +02:00
Patrick Jentsch
42583fea46 Update to newer Version 2020-09-23 15:26:53 +02:00
Patrick Jentsch
5bd0feda5c fix pipeline 2020-06-23 15:19:39 +02:00
Patrick Jentsch
5980a995e5 Add missing newline 2020-06-10 14:23:43 +02:00
Patrick Jentsch
fe7ab93513 Update nlp software metadata represantation 2020-06-10 13:14:34 +02:00
Stephan Porada
91708308bc Add model version number 2020-05-20 15:35:45 +02:00
Stephan Porada
887e814020 Fix 2020-05-20 15:01:52 +02:00
Stephan Porada
3fc6ebff4c Add stand off varaiant and metadata 2020-05-20 14:55:52 +02:00
Patrick Jentsch
bef51b7d81 Keep uncompressed output files after zip jobs. 2020-05-13 09:07:31 +02:00
Patrick Jentsch
68e86338d4 Bump versions 2020-04-06 09:21:38 +02:00
Patrick Jentsch
30d127f3af Fix zip creation 2020-04-04 15:37:12 +02:00
Patrick Jentsch
e061a7426d Update NLP Pipeline 2020-04-03 17:35:05 +02:00
stephan
41910afb79 Add nlp to filename 2020-02-18 10:17:24 +01:00
stephan
5d2fee029e Some cosmetics 2020-02-17 14:58:18 +01:00
stephan
6e87e0decd Add filename argument for zip results 2020-02-17 11:57:55 +01:00
Stephan Porada
79043f3dd7 Fix last errors 2020-02-12 14:25:08 +01:00
Stephan Porada
1a3e4a0a02 Fix check_encoding functionality 2020-02-12 14:16:36 +01:00
Stephan Porada
504861ae07 Update Dockerfile 2020-02-12 13:48:30 +01:00
Stephan Porada
88d03d4360 Add function to check the encoding of input text files. 2020-02-12 13:46:43 +01:00
Patrick Jentsch
6769be049a Escape text and lemma 2020-02-04 13:12:31 +01:00
Patrick Jentsch
ec2cf1dcff Fix zip switch integration 2020-02-03 15:26:04 +01:00
Patrick Jentsch
e4ef4835e5 Add a switch for zip functionality 2020-02-03 15:02:26 +01:00
Patrick Jentsch
5f20f9be40 Remove id xml attribute from output file 2020-01-27 15:59:32 +01:00
Patrick Jentsch
b0a402b3ac Add zip creation 2020-01-20 15:09:38 +01:00
Patrick Jentsch
543a1ba29a Bump version 2020-01-07 11:24:11 +01:00
Patrick Jentsch
d5a2d38c17 fix 2019-11-04 15:18:52 +01:00
Patrick Jentsch
4af9d9c899 Update 2019-11-04 15:15:41 +01:00
Patrick Jentsch
de8160a5b6 Update .gitlab-ci.yml 2019-09-19 09:25:29 +02:00
Patrick Jentsch
d564ed0464 Update .gitlab-ci.yml 2019-09-19 09:24:04 +02:00
Patrick Jentsch
abf6c430c3 Update .gitlab-ci.yml 2019-09-16 15:52:23 +02:00
Patrick Jentsch
19426a4c78 Set charset again! 2019-09-12 11:42:42 +02:00
Patrick Jentsch
a32184db5c Codestyle changes. 2019-09-12 10:06:29 +02:00
Patrick Jentsch
a16b010bdc Install models via an alternative way. 2019-09-12 09:56:13 +02:00
Patrick Jentsch
af293d6141 Codestyle 2019-09-11 16:15:41 +02:00
Patrick Jentsch
43717de313 Use latest image tag for master. 2019-09-11 13:39:51 +02:00
Patrick Jentsch
48fb20ae6b Change the documentation style. 2019-09-11 13:34:01 +02:00
Patrick Jentsch
2f57b1a0af Use fix version numbers. 2019-09-11 13:20:07 +02:00
Patrick Jentsch
e68d5c39ee Update CI script and remove unused code. 2019-07-31 11:39:54 +02:00
12 changed files with 1129 additions and 312 deletions

@@ -1,42 +1,68 @@
image: docker:latest
image: docker:19.03.13
services:
- docker:dind
- docker:19.03.13-dind
stages:
- build
- push
before_script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
variables:
DOCKER_TLS_CERTDIR: "/certs"
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
Build:
.reg_setup:
before_script:
- apk add --no-cache curl
- curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
- echo "$REG_SHA256 /usr/local/bin/reg" | sha256sum -c -
- chmod a+x /usr/local/bin/reg
variables:
REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
REG_VERSION: 0.16.1
build_image:
script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker build --pull -t $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA .
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
- docker build -t $INTERMEDIATE_IMAGE_TAG .
- docker push $INTERMEDIATE_IMAGE_TAG
stage: build
tags:
- docker
- docker
Push latest:
push_master:
extends:
- .reg_setup
only:
- master
script:
- docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
- docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA $CI_REGISTRY_IMAGE:latest
- docker push $CI_REGISTRY_IMAGE:latest
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker pull $INTERMEDIATE_IMAGE_TAG
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
- docker push $IMAGE_TAG
stage: push
tags:
- docker
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
Push tag:
push_other:
extends:
- .reg_setup
except:
- master
only:
- branches
- tags
script:
- docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
- docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker pull $INTERMEDIATE_IMAGE_TAG
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
- docker push $IMAGE_TAG
stage: push
tags:
- docker
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME

@@ -1,43 +1,67 @@
FROM debian:9-slim
FROM debian:buster-slim
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
ENV DEBIAN_FRONTEND=noninteractive
ENV LANG=C.UTF-8
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
python2.7 \
python3.5 \
python3-dev \
python3-pip \
python3-setuptools \
wget
# Install pyFlow
ENV PYFLOW_VERSION 1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
cd pyflow-"$PYFLOW_VERSION" && \
python2.7 setup.py build install && \
cd .. && \
rm -r pyflow-"$PYFLOW_VERSION" pyflow-"$PYFLOW_VERSION".tar.gz
RUN apt-get update \
&& apt-get install --no-install-recommends --yes \
procps \
python3.7 \
python3-pip \
wget \
&& python3 -m pip install \
chardet \
setuptools \
wheel
# Install spaCy
RUN pip3 install wheel && pip3 install -U spacy && \
python3.5 -m spacy download de && \
python3.5 -m spacy download en && \
python3.5 -m spacy download es && \
python3.5 -m spacy download fr && \
python3.5 -m spacy download it && \
python3.5 -m spacy download pt
# Install the NLP pipeline and it's dependencies #
## Install pyFlow ##
ENV PYFLOW_VERSION=1.1.20
RUN wget --no-check-certificate --quiet \
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
&& python2.7 setup.py build install \
&& cd - > /dev/null \
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
COPY nlp /usr/local/bin
COPY spacy_nlp /usr/local/bin
RUN mkdir /input /output && \
chmod a+rw /input /output
## Install spaCy ##
ENV SPACY_VERSION=3.2.1
RUN apt-get install --no-install-recommends --yes \
python3.7 \
python3-pip \
&& pip3 install \
chardet \
setuptools \
wheel \
&& pip3 install --upgrade pip \
&& pip3 install "spacy==${SPACY_VERSION}"
ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,pl_core_news_md,zh_core_web_md"
ENV SPACY_MODELS_VERSION=3.2.0
RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done
COPY packages .
RUN cd stand-off-data-py \
&& python3 -m pip install . \
&& cd -
## Install Pipeline ##
COPY nlp spacy-nlp vrt-creator /usr/local/bin/
RUN rm -r /var/lib/apt/lists/*
ENTRYPOINT ["nlp"]
CMD ["--help"]

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

107
README.md

@@ -1,74 +1,41 @@
# Natural language processing
# NLP - Natural Language Processing
This repository provides all code that is needed to build a container image for natural language processing utilizing [spaCy](https://spacy.io).
This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service, but it can also be used standalone; a convenient wrapper script is provided for that purpose. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
## Build image
## Software used in this pipeline implementation
1. Clone this repository and navigate into it:
- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
- Software from Debian Buster's free repositories
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- spaCy (3.2.1): https://github.com/explosion/spaCy/releases/tag/v3.2.1
- spaCy medium sized models (3.2.0):
- https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.2.0
- https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.2.0
## Installation
1. Install Docker and Python 3.
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git`
3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0 nlp`
4. Add the wrapper script (`wrapper/nlp` relative to this README file) to your `${PATH}`.
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,output}`.
## Use the Pipeline
1. Place your plain text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
2. Clear your `/<my_data_location>/output` directory.
3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details.
```bash
cd /<my_data_location>
nlp \
--input-dir input \
--output-dir output \
-m <model_code> <optional_pipeline_arguments>
```
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git && cd nlp
```
2. Build image:
```
docker build -t sfb1288inf/nlp:latest .
```
Alternatively build from the GitLab repository without cloning:
1. Build image:
```
docker build -t sfb1288inf/nlp:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
```
## Download prebuilt image
The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
1. Download image:
```
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest
```
## Run
1. Create input and output directories for the NLP software:
```
mkdir -p /<mydatalocation>/files_for_nlp /<mydatalocation>/files_from_nlp
```
2. Place your text files inside the `/<mydatalocation>/files_for_nlp` directory. Files should all contain text of the same language.
3. Start the NLP process.
```
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v /<mydatalocation>/files_for_nlp:/input \
-v /<mydatalocation>/files_from_nlp:/output \
sfb1288inf/nlp:latest \
-i /input \
-l <languagecode> \
-o /output
```
The arguments below `sfb1288inf/nlp:latest` are described in the [NLP arguments](#nlp-arguments) part.
If you want to use the prebuilt image, replace `sfb1288inf/nlp:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest`.
4. Check your results in the `/<mydatalocation>/files_from_nlp` directory.
### NLP arguments
`-i path`
* Sets the input directory using the specified path.
* required = True
`-o path`
* Sets the output directory using the specified path.
* required = True
`-l languagecode`
* Tells spaCy which language will be used.
* options = de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese)
* required = True
4. Check your results in the `/<my_data_location>/output` directory.

343
nlp

@@ -1,133 +1,288 @@
#!/usr/bin/env python2.7
# coding=utf-8
"""
nlp
''' An NLP pipeline for text file processing. '''
__version__ = '0.1.0'
Usage: For usage instructions run with option --help
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
"""
import argparse
import multiprocessing
from argparse import ArgumentParser
from pyflow import WorkflowRunner
import json
import os
import sys
from pyflow import WorkflowRunner
def parse_arguments():
parser = argparse.ArgumentParser(
description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.'
)
SPACY_MODELS = {
'de': 'de_core_news_md',
'en': 'en_core_web_md',
'it': 'it_core_news_md',
'nl': 'nl_core_news_md',
'pl': 'pl_core_news_md',
'zh': 'zh_core_web_md'
}
parser.add_argument(
'-i',
dest='input_dir',
required=True
)
parser.add_argument(
'-l',
choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
dest='lang',
required=True
)
parser.add_argument(
'-o',
dest='output_dir',
required=True
)
parser.add_argument(
'--nCores',
default=min(4, multiprocessing.cpu_count()),
dest='n_cores',
help='total number of cores available',
required=False,
type=int
)
return parser.parse_args()
class PipelineJob:
'''
NLP pipeline job class.
Each plain text input file of the pipeline is represented as an NLP
pipeline job, which holds all necessary information for the pipeline to
process it.
Arguments:
file -- Path to the file
output_dir -- Path to a directory, where job results are stored
'''
def __init__(self, file, output_dir):
self.file = file
self.name = os.path.basename(file)[:-4]
self.output_dir = output_dir
class NLPWorkflow(WorkflowRunner):
def __init__(self, args):
self.jobs = analyze_jobs(args.input_dir, args.output_dir)
self.lang = args.lang
self.n_cores = args.n_cores
def __init__(self, job, model, check_encoding=False, id_prefix=''):
self.job = job
self.model = model
self.check_encoding = check_encoding
self.id_prefix = id_prefix
def workflow(self):
if len(self.jobs) == 0:
'''
' ##################################################
' # spacy #
' ##################################################
'''
n_cores = 1
mem_mb = min(1024, self.getMemMb())
cmd = 'spacy-nlp'
cmd += ' --input-file "{}"'.format(self.job.file)
cmd += ' --output-file "{}"'.format(
os.path.join(self.job.output_dir, '{}.json'.format(self.job.name))
)
cmd += ' -m "{}"'.format(self.model)
if self.check_encoding:
cmd += ' --check-encoding'
cmd += ' --id-prefix "{}"'.format(self.id_prefix)
self.addTask(
'spacy',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class CreateVrtWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
def workflow(self):
'''
' ##################################################
' # vrt-creator #
' ##################################################
'''
n_cores = 1
mem_mb = min(256, self.getMemMb())
cmd = 'vrt-creator'
cmd += ' --stand-off-data-file "{}"'.format(
os.path.join(self.job.output_dir, '{}.json'.format(self.job.name))
)
cmd += ' --text-file "{}"'.format(self.job.file)
cmd += ' --output-file "{}"'.format(
os.path.join(self.job.output_dir, '{}.vrt'.format(self.job.name))
)
self.addTask(
'vrt_creator',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class MainWorkflow(WorkflowRunner):
def __init__(
self,
input_dir,
model,
output_dir,
check_encoding=False,
id_prefix=''
):
self.input_dir = input_dir
self.model = model
self.output_dir = output_dir
self.check_encoding = check_encoding
self.id_prefix = id_prefix
self.jobs = []
def collect_jobs(self):
self.jobs = []
for file in os.listdir(self.input_dir):
if os.path.isdir(os.path.join(self.input_dir, file)):
continue
if not file.lower().endswith('.txt'):
continue
job = PipelineJob(
os.path.join(self.input_dir, file),
os.path.join(self.output_dir, file)
)
self.jobs.append(job)
def workflow(self):
if not self.jobs:
return
# Create output and temporary directories
for job in self.jobs:
os.mkdir(job.output_dir)
'''
' ##################################################
' # Create output directories #
' # nlp #
' ##################################################
'''
create_output_directories_jobs = []
for index, job in enumerate(self.jobs):
cmd = 'mkdir -p "%s"' % (job['output_dir'])
create_output_directories_jobs.append(
self.addTask(
command=cmd,
label='create_output_directories_job_-_%i' % (index)
nlp_tasks = []
for i, job in enumerate(self.jobs):
task = self.addWorkflowTask(
'nlp_-_{}'.format(i),
NLPWorkflow(
job,
self.model,
check_encoding=self.check_encoding,
id_prefix=self.id_prefix
)
)
nlp_tasks.append(task)
'''
' ##################################################
' # Natural language processing #
' # create vrt #
' ##################################################
'''
nlp_jobs = []
nlp_job_n_cores = min(
self.n_cores,
max(1, int(self.n_cores / len(self.jobs)))
)
for index, job in enumerate(self.jobs):
cmd = 'spacy_nlp -l "%s" "%s" "%s"' % (
self.lang,
job['path'],
os.path.join(job['output_dir'], job['name'] + '.vrt')
create_vrt_tasks = []
for i, job in enumerate(self.jobs):
task = self.addWorkflowTask(
'create_vrt_-_{}'.format(i),
CreateVrtWorkflow(job),
dependencies='nlp_-_{}'.format(i)
)
nlp_jobs.append(
self.addTask(
command=cmd,
dependencies='create_output_directories_job_-_%i' % (index),
label='nlp_job_-_%i' % (index),
nCores=nlp_job_n_cores
)
create_vrt_tasks.append(task)
self.waitForTasks()
outputs = []
for job in self.jobs:
# Track output files
relative_output_dir = os.path.relpath(
job.output_dir,
start=self.output_dir
)
def analyze_jobs(input_dir, output_dir):
jobs = []
for file in os.listdir(input_dir):
if os.path.isdir(os.path.join(input_dir, file)):
jobs += analyze_jobs(
os.path.join(input_dir, file),
os.path.join(output_dir, file),
)
elif file.endswith('.txt'):
jobs.append(
outputs.append(
{
'filename': file,
'name': file.rsplit('.', 1)[0],
'output_dir': os.path.join(output_dir, file),
'path': os.path.join(input_dir, file)
'description': 'JSON stand off data',
'file': os.path.join(
relative_output_dir,
'{}.json'.format(job.name)
),
'mimetype': 'application/json'
}
)
outputs.append(
{
'description': 'CWB vrt file',
'file': os.path.join(
relative_output_dir,
'{}.vrt'.format(job.name)
),
'mimetype': 'application/vrt+xml'
}
)
with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
json.dump(outputs, f, indent=4)
return jobs
def parse_args():
parser = ArgumentParser(
description='NLP pipeline for plain text file processing'
)
parser.add_argument(
'-i', '--input-dir',
help='Input directory',
required=True
)
parser.add_argument(
'-o', '--output-dir',
help='Output directory',
required=True
)
parser.add_argument(
'-m', '--model',
choices=SPACY_MODELS.keys(),
help='The model to be used',
required=True
)
parser.add_argument(
'--check-encoding',
action='store_true',
help='Check the encoding of the input file (if not set, UTF-8 is assumed)'
)
parser.add_argument(
'--id-prefix',
default='',
help='A prefix for all the ids within the stand off annotations'
)
parser.add_argument(
'--log-dir',
help='Logging directory (Default: --output-dir)'
)
parser.add_argument(
'--mem-mb',
help='Amount of system memory to be used '
'(Default: min(--n-cores * 1024, available system memory))',
type=int
)
parser.add_argument(
'--n-cores',
default=1,
help='Number of CPU threads to be used',
type=int
)
parser.add_argument(
'-v', '--version',
action='version',
help='Returns the current version of the NLP pipeline',
version='%(prog)s {}'.format(__version__)
)
args = parser.parse_args()
# Set some tricky default values and check for insufficient input
if args.log_dir is None:
args.log_dir = args.output_dir
if args.n_cores < 1:
raise Exception('--n-cores must be greater or equal 1')
if args.mem_mb is None:
max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
args.mem_mb = min(args.n_cores * 1024, max_mem_mb)
if args.mem_mb < 1024:
raise Exception('--mem-mb must be greater or equal 1024')
return args
def main():
args = parse_arguments()
wflow = NLPWorkflow(args)
retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
args = parse_args()
main_workflow = MainWorkflow(
args.input_dir,
args.model,
args.output_dir,
check_encoding=args.check_encoding,
id_prefix=args.id_prefix
)
main_workflow.collect_jobs()
retval = main_workflow.run(
dataDirRoot=args.log_dir,
memMb=args.mem_mb,
nCores=args.n_cores
)
sys.exit(retval)
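
Besides the per-file results, `MainWorkflow.workflow` writes an `outputs.json` manifest into the output directory. A small sketch of how that manifest could be consumed after a run; the `output` path is just a placeholder for whatever was passed via `--output-dir`:

```python
import json
import os

output_dir = 'output'  # placeholder for the directory passed via --output-dir

# outputs.json lists every produced file relative to the output directory,
# together with a description and a mimetype (see MainWorkflow.workflow above).
with open(os.path.join(output_dir, 'outputs.json')) as f:
    outputs = json.load(f)

for entry in outputs:
    print(entry['mimetype'], os.path.join(output_dir, entry['file']))
```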

@@ -0,0 +1,14 @@
import setuptools
setuptools.setup(
name='Stand off data',
author='Patrick Jentsch',
author_email='p.jentsch@uni-bielefeld.de',
description='A python library to handle stand off data.',
py_modules=['stand_off_data'],
classifiers=[
'Programming Language :: Python :: 3',
'Operating System :: OS Independent',
],
python_requires='>=3.5'
)

@@ -0,0 +1,282 @@
from xml.sax.saxutils import escape
class StandOffData:
def __init__(self, attrs={}):
self.meta = attrs.get('meta', {})
self.lookup = {}
for x in attrs.get('tags', []):
self.add_tag_definition(x)
self.annotations = [
TagAnnotation(x, self.lookup)
for x in attrs.get('annotations', [])
]
def add_tag_definition(self, attrs):
tag_definition = TagDefinition(attrs)
if tag_definition.id in self.lookup:
raise Exception(f'Tag id already in use: {tag_definition.to_dict()}')
self.lookup[tag_definition.id] = tag_definition
def to_dict(self):
return {
'meta': self.meta,
'lookup': {k: v.to_dict() for k, v in self.lookup.items()},
'annotations': [x.to_dict() for x in self.annotations]
}
def to_vrt(self, text):
# Divide annotations according to CWB's verticalized text format (.vrt) logic
p_attrs = [] # positional attributes
s_attrs = [] # structural attributes
for annotation in self.annotations:
if annotation.name == 'token':
p_attrs.append(annotation)
else:
s_attrs.append(annotation)
# Sort annotations, necessary for the next checks
p_attrs.sort()
s_attrs.sort()
# Check for p_attr<->p_attr overlap
for i, p_attr in enumerate(p_attrs[:-1]):
next_p_attr = p_attrs[i + 1]
# Check if first_p_attr starts/ends within second_p_attr
if ((p_attr.start >= next_p_attr.start) and (p_attr.start < next_p_attr.end) # noqa
or (p_attr.end > next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa
raise Exception(
'Positional attribute overlaps another: '
f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}'
)
# Check for s_attr<->p_attr overlap
for i, s_attr in enumerate(s_attrs):
for p_attr in p_attrs:
# Check if s_attr starts within p_attr
if s_attr.start > p_attr.start and s_attr.start < p_attr.end:
# Change s_attr start to p_attr's start
s_attrs[i].start = p_attr.start
# Check if s_attr ends within p_attr
if s_attr.end < p_attr.end and s_attr.end > p_attr.start:
# Change s_attr end to p_attr's end
s_attrs[i].end = p_attr.end
# Check if s_attr starts/ends before/after p_attr
if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start:
# No further Checking needed (because p_attrs are sorted)
break
p_attr_buffer = {}
for i, p_attr in enumerate(p_attrs):
p_attr_buffer[p_attr.start] = i
s_attr_start_buffer = {}
s_attr_end_buffer = {}
for i, s_attr in enumerate(s_attrs):
if s_attr.start in s_attr_start_buffer:
s_attr_start_buffer[s_attr.start].append(i)
else:
s_attr_start_buffer[s_attr.start] = [i]
if s_attr.end in s_attr_end_buffer:
s_attr_end_buffer[s_attr.end].insert(0, i)
else:
s_attr_end_buffer[s_attr.end] = [i]
vrt = ''
vrt += '<text>\n'
current_position = 0
text_len = len(text)
# As long as we have something in our buffers we process it
while current_position <= text_len:
# s_attr endings
# for k in {k: v for k, v in s_attr_end_buffer.items() if k <= current_position}: # noqa
if current_position in s_attr_end_buffer:
# s_attr_indexes = s_attr_end_buffer.pop(k)
s_attr_indexes = s_attr_end_buffer.pop(current_position)
for s_attr_index in s_attr_indexes:
s_attr = s_attrs[s_attr_index]
vrt += f'</{escape(s_attr.name)}>\n'
# s_attrs starts
# for k in {k: v for k, v in s_attr_start_buffer.items() if k <= current_position}: # noqa
if current_position in s_attr_start_buffer:
# s_attr_indexes = s_attr_start_buffer.pop(k)
s_attr_indexes = s_attr_start_buffer.pop(current_position)
for s_attr_index in s_attr_indexes:
s_attr = s_attrs[s_attr_index]
vrt += f'<{escape(s_attr.name)}'
for property in s_attr.properties:
vrt += f' {escape(property.name)}="{escape(str(property.value))}"' # noqa
vrt += '>\n'
# p_attrs
if current_position not in p_attr_buffer:
current_position += 1
continue
p_attr_index = p_attr_buffer.pop(current_position)
p_attr = p_attrs[p_attr_index]
if text[p_attr.start:p_attr.end].isspace():
current_position = p_attr.end
continue
_p_attr = {
'lemma': 'None',
'pos': 'None',
'simple_pos': 'None',
'word': 'None'
}
for property in p_attr.properties:
if property.name not in _p_attr:
continue
_p_attr[property.name] = escape(str(property.value))
_p_attr['word'] = escape(text[p_attr.start:p_attr.end])
vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**_p_attr)
current_position = p_attr.end
vrt += '</text>\n'
return vrt
class TagAnnotation:
def __init__(self, attrs, lookup):
self.lookup = lookup
self.tag_id = attrs['tag_id']
self.start = attrs['start']
self.end = attrs['end']
self.properties = [
PropertyAnnotation(x, self.lookup[self.tag_id].properties)
for x in attrs.get('properties', [])
]
''' Sanity checks '''
if self.tag_id not in self.lookup:
raise Exception(f'Unknown tag: {self.to_dict()}')
if self.end < self.start:
raise Exception(f'Annotation end less than start: {self.to_dict()}')  # noqa
# property_ids = [x.property_id for x in self.properties]
# for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items(): # noqa
# if required_property_id not in property_ids:
# raise Exception(
# f'Missing required property: {required_property.to_dict()}'
# )
@property
def name(self):
return self.lookup[self.tag_id].name
def to_dict(self):
return {
'tag_id': self.tag_id,
'start': self.start,
'end': self.end,
'properties': [x.to_dict() for x in self.properties]
}
def __lt__(self, other):
if self.start == other.start:
if self.name == 'token' and other.name != 'token':
return False
elif self.name != 'token' and other.name == 'token':
return True
else:
return self.end > other.end
else:
return self.start < other.start
def __le__(self, other):
if self.start == other.start:
if self.name == 'token' and other.name != 'token':
return False
elif self.name != 'token' and other.name == 'token':
return True
else:
return self.end >= other.end
else:
return self.start <= other.start
def __eq__(self, other):
if self.start == other.start:
if self.name == 'token' and other.name != 'token':
return False
elif self.name != 'token' and other.name == 'token':
return False
else:
return self.end == other.end
else:
return False
def __ne__(self, other):
return not self == other
def __gt__(self, other):
return not self <= other
def __ge__(self, other):
return not self < other
class PropertyAnnotation:
def __init__(self, attrs, lookup):
self.lookup = lookup
self.property_id = attrs['property_id']
self.value = attrs['value']
# TODO: Process attrs['possibleValues'] as self.labels (no id?)
''' Sanity checks '''
if self.property_id not in self.lookup:
raise Exception(f'Unknown property: {self.to_dict()}')
@property
def name(self):
return self.lookup[self.property_id].name
def to_dict(self):
return {
'property_id': self.property_id,
'value': self.value
}
class TagDefinition:
def __init__(self, attrs):
self.id = attrs['id']
self.name = attrs['name']
self.description = attrs.get('description', '')
self.properties = {}
for x in attrs.get('properties', []):
self.add_property_definition(x)
def add_property_definition(self, attrs):
property_definition = PropertyDefinition(attrs)
if property_definition.id in self.properties:
raise Exception(
f'Property id already in use: {property_definition.to_dict()}')
self.properties[property_definition.id] = property_definition
# @property
# def required_properties(self):
# return {property.id: property for property in self.properties.values()
# if property.is_required}
def to_dict(self):
return {
'id': self.id,
'name': self.name,
'description': self.description,
'properties': {k: v.to_dict() for k, v in self.properties.items()}
}
class PropertyDefinition:
def __init__(self, attrs):
self.id = attrs['id']
self.name = attrs['name']
self.description = attrs.get('description', '')
self.flags = attrs.get('flags', [])
self.labels = attrs.get('labels', [])
# @property
# def is_required(self):
# return 'required' in self.flags
@property
def has_multiple_values(self):
return 'multiple' in self.flags
def to_dict(self):
return {
'id': self.id,
'name': self.name,
'description': self.description,
'flags': self.flags,
'labels': self.labels
}
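
To illustrate how these classes fit together, here is a minimal, self-contained sketch that feeds hand-written stand-off data through `StandOffData.to_vrt`. The tag and property ids (`t1`, `p1`, ...) are made up for the example; in the pipeline they are generated by `spacy-nlp`:

```python
from stand_off_data import StandOffData

text = 'Hello world'

# Hand-written stand-off data; ids are hypothetical, real data comes from spacy-nlp.
data = StandOffData({
    'meta': {},
    'tags': [
        {'id': 't1', 'name': 'token', 'properties': [
            {'id': 'p1', 'name': 'pos'},
            {'id': 'p2', 'name': 'lemma'}
        ]},
        {'id': 't2', 'name': 's', 'properties': []}
    ],
    'annotations': [
        {'tag_id': 't2', 'start': 0, 'end': 11, 'properties': []},
        {'tag_id': 't1', 'start': 0, 'end': 5, 'properties': [
            {'property_id': 'p1', 'value': 'INTJ'},
            {'property_id': 'p2', 'value': 'hello'}
        ]},
        {'tag_id': 't1', 'start': 6, 'end': 11, 'properties': [
            {'property_id': 'p1', 'value': 'NOUN'},
            {'property_id': 'p2', 'value': 'world'}
        ]}
    ]
})

# Prints a <text> element containing one <s> element and two token lines
# in word/pos/lemma/simple_pos order.
print(data.to_vrt(text))
```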

@@ -0,0 +1,2 @@
# flake8: noqa
from .models import StandOffData

353
spacy-nlp Executable file

@@ -0,0 +1,353 @@
#!/usr/bin/env python3.7
# coding=utf-8
from argparse import ArgumentParser
import chardet
import hashlib
import json
import os
import spacy
import textwrap
import uuid
spacy_models = {
spacy.info(pipeline)['lang']: pipeline
for pipeline in spacy.info()['pipelines']
}
# Parse the given arguments
parser = ArgumentParser(
description='Create annotations for a given plain txt file'
)
parser.add_argument(
'-i', '--input-file',
help='Input file',
required=True
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
parser.add_argument(
'-m', '--model',
choices=spacy_models.keys(),
help='The model to be used',
required=True
)
parser.add_argument(
'-c', '--check-encoding',
action='store_true',
help='Check the encoding of the input file (if not set, UTF-8 is assumed)'
)
parser.add_argument(
'--id-prefix',
default='',
help='A prefix for all the ids within the stand off annotations'
)
args = parser.parse_args()
def generate_id(name):
return f'{args.id_prefix}{uuid.uuid3(uuid.NAMESPACE_DNS, name)}'
with open(args.input_file, "rb") as input_file:
if args.check_encoding:
encoding = chardet.detect(input_file.read())['encoding']
else:
encoding = 'utf-8'
input_file.seek(0)
text_md5 = hashlib.md5()
for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
text_md5.update(chunk)
# Load the text contents from the input file
with open(args.input_file, encoding=encoding) as input_file:
# spaCy NLP is limited to strings with a maximum of 1 million characters at
# once. So we split it into suitable chunks.
text_chunks = textwrap.wrap(
input_file.read(),
1000000,
break_long_words=False,
break_on_hyphens=False,
drop_whitespace=False,
expand_tabs=False,
replace_whitespace=False
)
model_name = spacy_models[args.model]
nlp = spacy.load(model_name)
meta = {
'generator': {
'name': 'nopaque spacy NLP',
'version': '0.1.0',
'arguments': {
'check_encoding': args.check_encoding,
'model': args.model
}
},
'file': {
'encoding': encoding,
'md5': text_md5.hexdigest(),
'name': os.path.basename(args.input_file)
}
}
tags = []
token = {
'id': generate_id('token'),
'name': 'token',
'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.', # noqa
'properties': []
}
# TODO: Check if all languages support token.sentiment
token['properties'].append(
{
'id': generate_id('token.sentiment'),
'name': 'sentiment',
'description': 'A scalar value indicating the positivity or negativity of the token.' # noqa
}
)
if nlp.has_pipe('lemmatizer'):
token['properties'].append(
{
'id': generate_id('token.lemma'),
'name': 'lemma',
'description': 'The base form of the word'
}
)
if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
token['properties'].append(
{
'id': generate_id('token.simple_pos'),
'name': 'simple_pos',
'description': 'The simple UPOS part-of-speech tag',
'labels': [
{
'id': generate_id('token.simple_pos=ADJ'),
'name': 'ADJ',
'description': 'adjective'
},
{
'id': generate_id('token.simple_pos=ADP'),
'name': 'ADP',
'description': 'adposition'
},
{
'id': generate_id('token.simple_pos=ADV'),
'name': 'ADV',
'description': 'adverb'
},
{
'id': generate_id('token.simple_pos=AUX'),
'name': 'AUX',
'description': 'auxiliary verb'
},
{
'id': generate_id('token.simple_pos=CONJ'),
'name': 'CONJ',
'description': 'coordinating conjunction'
},
{
'id': generate_id('token.simple_pos=DET'),
'name': 'DET',
'description': 'determiner'
},
{
'id': generate_id('token.simple_pos=INTJ'),
'name': 'INTJ',
'description': 'interjection'
},
{
'id': generate_id('token.simple_pos=NOUN'),
'name': 'NOUN',
'description': 'noun'
},
{
'id': generate_id('token.simple_pos=NUM'),
'name': 'NUM',
'description': 'numeral'
},
{
'id': generate_id('token.simple_pos=PART'),
'name': 'PART',
'description': 'particle'
},
{
'id': generate_id('token.simple_pos=PRON'),
'name': 'PRON',
'description': 'pronoun'
},
{
'id': generate_id('token.simple_pos=PROPN'),
'name': 'PROPN',
'description': 'proper noun'
},
{
'id': generate_id('token.simple_pos=PUNCT'),
'name': 'PUNCT',
'description': 'punctuation'
},
{
'id': generate_id('token.simple_pos=SCONJ'),
'name': 'SCONJ',
'description': 'subordinating conjunction'
},
{
'id': generate_id('token.simple_pos=SYM'),
'name': 'SYM',
'description': 'symbol'
},
{
'id': generate_id('token.simple_pos=VERB'),
'name': 'VERB',
'description': 'verb'
},
{
'id': generate_id('token.simple_pos=X'),
'name': 'X',
'description': 'other'
}
]
}
)
if nlp.has_pipe('tagger'):
token['properties'].append(
{
'id': generate_id('token.pos'),
'name': 'pos',
'description': 'The detailed part-of-speech tag',
'labels': [
{
'id': generate_id(f'token.pos={label}'),
'name': label,
'description': spacy.explain(label) or ''
} for label in spacy.info(model_name)['labels']['tagger']
]
}
)
if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
tags.append(
{
'id': generate_id('ent'),
'name': 'ent',
'description': 'Encodes the start and end of a named entity',
'properties': [
{
'id': generate_id('ent.type'),
'name': 'type',
'description': 'Label indicating the type of the entity',
'labels': [
{
'id': generate_id('ent.type={}'.format(label)),
'name': label,
'description': spacy.explain(label) or ''
} for label in spacy.info(model_name)['labels']['ner']
]
}
]
}
)
if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'): # noqa
# TODO: Check if all languages support sent.sentiment
tags.append(
{
'id': generate_id('s'),
'name': 's',
'description': 'Encodes the start and end of a sentence',
'properties': [
{
'id': generate_id('s.sentiment'),
'name': 'sentiment',
'description': 'A scalar value indicating the positivity or negativity of the sentence.' # noqa
}
]
}
)
tags.append(token)
annotations = []
chunk_offset = 0
while text_chunks:
text_chunk = text_chunks.pop(0)
doc = nlp(text_chunk)
if hasattr(doc, 'ents'):
for ent in doc.ents:
annotation = {
'start': ent.start_char + chunk_offset,
'end': ent.end_char + chunk_offset,
'tag_id': generate_id('ent'),
'properties': [
{
'property_id': generate_id('ent.type'),
'value': ent.label_
}
]
}
annotations.append(annotation)
if hasattr(doc, 'sents'):
for sent in doc.sents:
annotation = {
'start': sent.start_char + chunk_offset,
'end': sent.end_char + chunk_offset,
'tag_id': generate_id('s'),
'properties': []
}
if hasattr(sent, 'sentiment'):
annotation['properties'].append(
{
'property_id': generate_id('s.sentiment'),
'value': sent.sentiment
}
)
annotations.append(annotation)
for token in doc:
annotation = {
'start': token.idx + chunk_offset,
'end': token.idx + len(token.text) + chunk_offset,
'tag_id': generate_id('token'),
'properties': []
}
if hasattr(token, 'lemma_'):
annotation['properties'].append(
{
'property_id': generate_id('token.lemma'),
'value': token.lemma_
}
)
if hasattr(token, 'pos_'):
annotation['properties'].append(
{
'property_id': generate_id('token.simple_pos'),
'value': token.pos_
}
)
if hasattr(token, 'sentiment'):
annotation['properties'].append(
{
'property_id': generate_id('token.sentiment'),
'value': token.sentiment
}
)
if hasattr(token, 'tag_'):
annotation['properties'].append(
{
'property_id': generate_id('token.pos'),
'value': token.tag_
}
)
annotations.append(annotation)
chunk_offset += len(text_chunk)
text_chunk = None
with open(args.output_file, 'w') as output_file:
json.dump(
{'meta': meta, 'tags': tags, 'annotations': annotations},
output_file,
indent=4
)
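
The resulting JSON file is plain stand-off data with the three top-level keys written above (`meta`, `tags`, `annotations`). A short sketch of inspecting such a file; the path `output/example.txt/example.json` only illustrates where the pipeline would place it:

```python
import json

# Hypothetical location of a stand-off data file produced by spacy-nlp.
with open('output/example.txt/example.json') as f:
    data = json.load(f)

print(data['meta']['file'])      # name, encoding and md5 of the input file
print(len(data['tags']))         # tag definitions (token, s, ent, ...)
for annotation in data['annotations'][:5]:
    print(annotation['start'], annotation['end'], annotation['tag_id'])
```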

@@ -1,71 +0,0 @@
#!/usr/bin/env python3.5
# coding=utf-8
import argparse
import os
import spacy
import textwrap
parser = argparse.ArgumentParser(
description='Tag a text file with spaCy and save it as a verticalized text file.'
)
parser.add_argument(
'i',
metavar='txt-sourcefile',
)
parser.add_argument(
'-l',
choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
dest='lang',
required=True
)
parser.add_argument(
'o',
metavar='vrt-destfile',
)
args = parser.parse_args()
SPACY_MODELS = {
'de': 'de_core_news_sm', 'el': 'el_core_news_sm', 'en': 'en_core_web_sm',
'es': 'es_core_news_sm', 'fr': 'fr_core_news_sm', 'it': 'it_core_news_sm',
'nl': 'nl_core_news_sm', 'pt': 'pt_core_news_sm'
}
# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])
# Read text from the input file and if neccessary split it into parts with a
# length of less than 1 million characters.
with open(args.i) as input_file:
text = input_file.read()
texts = textwrap.wrap(text, 1000000, break_long_words=False)
text = None
# Create and open the output file
output_file = open(args.o, 'w+')
output_file.write(
'<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % (
os.path.basename(args.i).rsplit(".", 1)[0]
)
)
for text in texts:
# Run spacy nlp over the text (partial string if above 1 million chars)
doc = nlp(text)
for sent in doc.sents:
output_file.write('<s>\n')
for token in sent:
# Skip whitespace tokens like "\n" or "\t"
if token.text.isspace():
continue
# Write all information in .vrt style to the output file
# text, lemma, simple_pos, pos, ner
output_file.write(
token.text + '\t' + token.lemma_ + '\t'
+ token.pos_ + '\t' + token.tag_ + '\t'
+ (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n'
)
output_file.write('</s>\n')
output_file.write('</text>\n</corpus>')
output_file.close()

45
vrt-creator Executable file

@@ -0,0 +1,45 @@
#!/usr/bin/env python3.7
# coding=utf-8
from argparse import ArgumentParser
from stand_off_data import StandOffData
import hashlib
import json
parser = ArgumentParser(
description='Convert plain text and JSON stand off to a CWB vrt file'
)
parser.add_argument(
'-s', '--stand-off-data-file',
help='JSON stand off data input file',
required=True
)
parser.add_argument(
'-t', '--text-file',
help='Plain text input file',
required=True
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args()
with open(args.stand_off_data_file) as stand_of_data_file:
stand_off_data = StandOffData(json.load(stand_of_data_file))
with open(args.text_file, "rb") as text_file:
text_md5 = hashlib.md5()
for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
text_md5.update(chunk)
if text_md5.hexdigest() != stand_off_data.meta['file']['md5']:
raise Exception('md5 not equal')
with open(args.text_file, encoding=stand_off_data.meta['file']['encoding']) as text_file: # noqa
text = text_file.read()
with open(args.output_file, 'w') as vrt_file:
vrt_file.write(stand_off_data.to_vrt(text))

@@ -1,39 +1,38 @@
#!/usr/bin/env python3
# coding=utf-8
import argparse
from argparse import ArgumentParser
import os
import subprocess
import sys
container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:latest'
container_input_dir = '/input'
container_output_dir = '/output'
uid = str(os.getuid())
gid = str(os.getgid())
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0'
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_LOG_DIR = '/logs'
UID = str(os.getuid())
GID = str(os.getgid())
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
'-i',
dest='input_dir',
required=False
)
parser.add_argument(
'-o',
dest='output_dir',
required=False
)
parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args()
cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid]
cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
if args.input_dir is not None:
host_input_dir = os.path.abspath(args.input_dir)
cmd += ['-v', host_input_dir + ':' + container_input_dir]
remaining_args += ['-i', container_input_dir]
mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-i', CONTAINER_INPUT_DIR]
if args.output_dir is not None:
host_output_dir = os.path.abspath(args.output_dir)
cmd += ['-v', host_output_dir + ':' + container_output_dir]
remaining_args += ['-o', container_output_dir]
cmd.append(container_image)
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.log_dir is not None:
mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
cmd += ['-v', mapping]
remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args
subprocess.run(cmd)
sys.exit(subprocess.run(cmd).returncode)