13 Commits

7 changed files with 417 additions and 246 deletions

.gitlab-ci.yml

@@ -1,8 +1,5 @@
 image: docker:19.03.13
-variables:
-  DOCKER_TLS_CERTDIR: "/certs"
 services:
   - docker:19.03.13-dind
@@ -10,6 +7,10 @@ stages:
   - build
   - push
+variables:
+  DOCKER_TLS_CERTDIR: "/certs"
+  INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
 .reg_setup:
   before_script:
     - apk add --no-cache curl
@@ -28,8 +29,6 @@ build_image:
   stage: build
   tags:
     - docker
-  variables:
-    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
 push_master:
   extends:
@@ -47,7 +46,6 @@ push_master:
     - docker
   variables:
     IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
-    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
 push_other:
   extends:
@@ -68,4 +66,3 @@ push_other:
     - docker
   variables:
     IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
-    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

Dockerfile

@@ -7,28 +7,29 @@ LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <por
 ENV LANG=C.UTF-8
-RUN apt-get update
+RUN apt-get update \
+ && apt-get install --no-install-recommends --yes \
+      wget
-# Install the NLP pipeline and it's dependencies #
+# Install pipeline dependencies #
 ## Install pyFlow ##
-ENV PYFLOW_RELEASE=1.1.20
-ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
-RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
- && cd "pyflow-${PYFLOW_RELEASE}" \
+ENV PYFLOW_VERSION=1.1.20
+RUN wget --no-check-certificate --quiet \
+      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
+ && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
+ && cd "pyflow-${PYFLOW_VERSION}" \
  && apt-get install --no-install-recommends --yes \
       python2.7 \
  && python2.7 setup.py build install \
  && cd .. \
- && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
+ && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
 ## Install spaCy ##
-ENV SPACY_VERSION=3.0.3
+ENV SPACY_VERSION=3.0.5
 RUN apt-get install --no-install-recommends --yes \
       python3.7 \
       python3-pip \
-      zip \
  && pip3 install \
       chardet \
       setuptools \
@@ -36,22 +37,22 @@ RUN apt-get install --no-install-recommends --yes \
  && pip3 install --upgrade pip \
  && pip3 install "spacy==${SPACY_VERSION}"
+# Only models that include the following components are compatibel:
+# lemmatizer, ner, parser, senter, tagger,
+ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,nl_core_news_md,pl_core_news_md,zh_core_web_md"
 ENV SPACY_MODELS_VERSION=3.0.0
-RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "es_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct
+RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done
+## Further dependencies ##
+RUN apt-get install --no-install-recommends --yes \
+      procps \
+      zip
 ## Install Pipeline ##
-COPY nlp spacy-nlp /usr/local/bin/
+COPY nlp spacy-nlp vrt-creator /usr/local/bin/
 RUN rm -r /var/lib/apt/lists/*
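Note: the new SPACY_MODELS list is restricted to models that ship the lemmatizer, ner, parser, senter and tagger components. A minimal sketch (not part of the commit) of how that compatibility could be checked against the installed packages, using the same spacy.info() lookups the new spacy-nlp script relies on; the 'components' key of the model meta is an assumption here:

``` python
import spacy

# Components the pipeline relies on, per the Dockerfile comment above.
REQUIRED_COMPONENTS = {'lemmatizer', 'ner', 'parser', 'senter', 'tagger'}

# spacy.info() lists the installed pipeline packages; spacy.info(<package>)
# returns that package's meta data (assumed to include a 'components' list).
for package in spacy.info()['pipelines']:
    meta = spacy.info(package)
    missing = REQUIRED_COMPONENTS - set(meta.get('components', []))
    status = 'compatible' if not missing else 'missing: {}'.format(', '.join(sorted(missing)))
    print('{} ({}): {}'.format(package, meta.get('lang', '?'), status))
```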

README.md

@@ -5,18 +5,13 @@ This software implements a heavily parallelized pipeline for Natural Language Pr
 ## Software used in this pipeline implementation
 - Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
-- spaCy (3.0.3): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
+- spaCy (3.0.5): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
 - spaCy medium sized models (3.0.0):
-  - https://github.com/explosion/spacy-models/releases/tag/da_core_news_md-3.0.0
   - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0
-  - https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.0.0
   - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0
-  - https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.0.0
-  - https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.0.0
   - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0
   - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0
-  - https://github.com/explosion/spacy-models/releases/tag/pt_core_news_md-3.0.0
-  - https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.0.0
+  - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.0.0
   - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0
@@ -29,7 +24,7 @@ mkdir -p /<my_data_location>/input /<my_data_location>/output
 2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language.
-3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
+3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details.
 ```
 # Option one: Use the wrapper script
 ## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executeable and add it to your ${PATH}
@@ -51,38 +46,3 @@ docker run \
 ```
 4. Check your results in the `/<my_data_location>/output` directory.
-```
-
-### Pipeline arguments
-
-`--check-encoding`
-* If set, the pipeline tries to automatically determine the right encoding for
-your texts. Only use it if you are not sure that your input is provided in UTF-8.
-* default = False
-* required = False
-
-`-l languagecode`
-* Tells spaCy which language will be used.
-* options = da (Danish), de (German), el (Greek), en (English), es (Spanish), fr (French), it (Italian), nl (Dutch), pt (Portuguese), ru (Russian), zh (Chinese)
-* required = True
-
-`--nCores corenumber`
-* Sets the number of CPU cores being used during the NLP process.
-* default = min(4, multiprocessing.cpu_count())
-* required = False
-
-``` bash
-# Example with all arguments used
-docker run \
-    --rm \
-    -it \
-    -u $(id -u $USER):$(id -g $USER) \
-    -v "$HOME"/ocr/input:/input \
-    -v "$HOME"/ocr/output:/output \
-    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \
-        -i /input \
-        -l en \
-        -o /output \
-        --check-encoding \
-        --nCores 8 \
-```

nlp

@@ -14,39 +14,14 @@ import os
 import sys
-SPACY_MODELS = {'da': 'da_core_news_md',
-                'de': 'de_core_news_md',
-                'el': 'el_core_news_md',
+SPACY_MODELS = {'de': 'de_core_news_md',
                 'en': 'en_core_web_md',
-                'es': 'es_core_news_md',
-                'fr': 'fr_core_news_md',
                 'it': 'it_core_news_md',
                 'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md',
-                'ru': 'ru_core_news_md',
+                'pl': 'pl_core_news_md',
                 'zh': 'zh_core_web_md'}
-def parse_args():
-    parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.')
-    parser.add_argument('-i', '--input-directory',
-                        help='Input directory (only txt files get processed)',
-                        required=True)
-    parser.add_argument('-o', '--output-directory',
-                        help='Output directory',
-                        required=True)
-    parser.add_argument('-l', '--language',
-                        choices=SPACY_MODELS.keys(),
-                        required=True)
-    parser.add_argument('--check-encoding', action='store_true')
-    parser.add_argument('--log-dir')
-    parser.add_argument('--n-cores',
-                        default=min(4, multiprocessing.cpu_count()),
-                        help='total number of cores available', type=int)
-    parser.add_argument('--zip', help='Zips everything into one archive.')
-    return parser.parse_args()
 class NLPPipelineJob:
     """An NLP pipeline job class
@@ -56,8 +31,6 @@ class NLPPipelineJob:
     Arguments:
     file -- Path to the file
     output_dir -- Path to a directory, where job results a stored
-    intermediate_dir -- Path to a directory, where intermediate files are
-                        stored.
     """

     def __init__(self, file, output_dir):
@@ -67,21 +40,11 @@
 class NLPPipeline(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, check_encoding, n_cores, zip):
+    def __init__(self, input_dir, output_dir, check_encoding, lang, zip):
         self.input_dir = input_dir
-        self.lang = lang
         self.output_dir = output_dir
         self.check_encoding = check_encoding
-        self.n_cores = n_cores
-        if zip is None:
-            self.zip = zip
-        else:
-            if zip.lower().endswith('.zip'):
-                # Remove .zip file extension if provided
-                self.zip = zip[:-4]
-                self.zip = self.zip if self.zip else 'output'
-            else:
-                self.zip = zip
+        self.lang = lang
+        self.output_dir = output_dir
+        self.zip = zip
         self.jobs = collect_jobs(self.input_dir, self.output_dir)
@@ -96,9 +59,7 @@
         '''
         setup_output_directory_tasks = []
         for i, job in enumerate(self.jobs):
-            cmd = 'mkdir'
-            cmd += ' -p'
-            cmd += ' "{}"'.format(job.output_dir)
+            cmd = 'mkdir -p "{}"'.format(job.output_dir)
             lbl = 'setup_output_directory_-_{}'.format(i)
             task = self.addTask(command=cmd, label=lbl)
             setup_output_directory_tasks.append(task)
@@ -109,20 +70,39 @@
         ' ##################################################
         '''
         nlp_tasks = []
-        n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
+        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+        mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
-            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
+            output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
             cmd = 'spacy-nlp'
-            cmd += ' -i "{}"'.format(job.file)
             cmd += ' -l "{}"'.format(self.lang)
-            cmd += ' -o "{}"'.format(output_file)
-            if self.check_encoding:
-                cmd += ' --check-encoding'
+            cmd += ' --check-encoding' if self.check_encoding else ''
+            cmd += ' "{}"'.format(job.file)
+            cmd += ' "{}"'.format(output_file)
             deps = 'setup_output_directory_-_{}'.format(i)
             lbl = 'nlp_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                memMb=mem_mb, nCores=n_cores)
             nlp_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # vrt creation                                   #
+        ' ##################################################
+        '''
+        vrt_creation_tasks = []
+        for i, job in enumerate(self.jobs):
+            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
+            nlp_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
+            cmd = 'vrt-creator'
+            cmd += ' "{}"'.format(job.file)
+            cmd += ' "{}"'.format(nlp_file)
+            cmd += ' "{}"'.format(output_file)
+            deps = 'nlp_-_{}'.format(i)
+            lbl = 'vrt_creation_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            vrt_creation_tasks.append(task)
         '''
         ' ##################################################
         ' # zip creation                                   #
@@ -136,10 +116,10 @@
             cmd += ' -r'
             cmd += ' "{}.zip" .'.format(self.zip)
             cmd += ' -x "pyflow.data*"'
-            cmd += ' -i "*.vrt"'
+            cmd += ' -i "*.vrt" "*.json"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = nlp_tasks
+            deps = vrt_creation_tasks
             lbl = 'zip_creation'
             task = self.addTask(command=cmd, dependencies=deps, label=lbl)
             zip_creation_tasks.append(task)
@@ -149,23 +129,68 @@ def collect_jobs(input_dir, output_dir):
     jobs = []
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
-            jobs += collect_jobs(os.path.join(input_dir, file),
-                                 os.path.join(output_dir, file))
-        elif file.lower().endswith('.txt'):
-            jobs.append(NLPPipelineJob(os.path.join(input_dir, file),
-                                       os.path.join(output_dir, file)))
+            continue
+        if file.lower().endswith('.txt'):
+            job = NLPPipelineJob(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file))
+            jobs.append(job)
     return jobs
+
+
+def parse_args():
+    parser = ArgumentParser(description='NLP pipeline for TXT file processing',
+                            prog='NLP pipeline')
+    parser.add_argument('-i', '--input-dir',
+                        help='Input directory',
+                        required=True)
+    parser.add_argument('-o', '--output-dir',
+                        help='Output directory',
+                        required=True)
+    parser.add_argument('-l', '--language',
+                        choices=SPACY_MODELS.keys(),
+                        help='Language of the input (2-character ISO 639-1 language codes)',  # noqa
+                        required=True)
+    parser.add_argument('--check-encoding',
+                        action='store_true',
+                        help='Check encoding of the input file, UTF-8 is used instead')  # noqa
+    parser.add_argument('--log-dir',
+                        help='Logging directory')
+    parser.add_argument('--mem-mb',
+                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
+                        type=int)
+    parser.add_argument('--n-cores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa
+                        type=int)
+    parser.add_argument('--zip',
+                        help='Create one zip file per filetype')
+    parser.add_argument('-v', '--version',
+                        action='version',
+                        help='Returns the current version of the NLP pipeline',
+                        version='%(prog)s {}'.format(__version__))
+    args = parser.parse_args()
+
+    # Set some tricky default values and check for insufficient input
+    if args.log_dir is None:
+        args.log_dir = args.output_dir
+    if args.n_cores < 1:
+        raise Exception('--n-cores must be greater or equal 1')
+    if args.mem_mb is None:
+        max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
+        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
+    if args.mem_mb < 2048:
+        raise Exception('--mem-mb must be greater or equal 2048')
+    if args.zip is not None and args.zip.lower().endswith('.zip'):
+        # Remove .zip file extension if provided
+        args.zip = args.zip[:-4]
+        args.zip = args.zip if args.zip else 'output'
+    return args


 def main():
     args = parse_args()
-    nlp_pipeline = NLPPipeline(args.input_directory, args.language,
-                               args.output_directory, args.check_encoding,
-                               args.n_cores, args.zip)
-    retval = nlp_pipeline.run(
-        dataDirRoot=(args.log_dir or args.output_directory),
-        nCores=args.n_cores
-    )
+    nlp_pipeline = NLPPipeline(args.input_dir, args.output_dir, args.check_encoding, args.language, args.zip)  # noqa
+    retval = nlp_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
     sys.exit(retval)
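Note: the reworked workflow() divides the resource pool that pyFlow reports (getNCores(), getMemMb()) evenly across all jobs, and the new parse_args() derives a default memory budget from `free -t -m`. A small sketch (not part of the commit; the helper names are made up) of the same arithmetic in isolation:

``` python
import multiprocessing
import os


def default_mem_mb(n_cores):
    # Same default as the new parse_args(): min(n_cores * 2048 MB, total
    # system memory), read from the last line of `free -t -m`.
    total_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
    return min(n_cores * 2048, total_mem_mb)


def per_job_budget(n_jobs, total_cores, total_mem_mb):
    # Same split as NLPPipeline.workflow(): each spacy-nlp task gets an equal
    # share of the pool, but never less than one core.
    n_cores = max(1, int(total_cores / n_jobs))
    mem_mb = min(n_cores * 2048, int(total_mem_mb / n_jobs))
    return n_cores, mem_mb


if __name__ == '__main__':
    total_cores = min(4, multiprocessing.cpu_count())
    total_mem_mb = default_mem_mb(total_cores)
    print(per_job_budget(3, total_cores, total_mem_mb))
```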

spacy-nlp

@@ -2,118 +2,171 @@
 # coding=utf-8

 from argparse import ArgumentParser
-from xml.sax.saxutils import escape
 import chardet
 import hashlib
+import json
 import os
 import spacy
 import textwrap

-SPACY_MODELS = {'da': 'da_core_news_md',
-                'de': 'de_core_news_md',
-                'el': 'el_core_news_md',
-                'en': 'en_core_web_md',
-                'es': 'es_core_news_md',
-                'fr': 'fr_core_news_md',
-                'it': 'it_core_news_md',
-                'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md',
-                'ru': 'ru_core_news_md',
-                'zh': 'zh_core_web_md'}
-SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
-SPACY_VERSION = os.environ.get('SPACY_VERSION')
+spacy_models = {spacy.info(pipeline)['lang']: pipeline
+                for pipeline in spacy.info()['pipelines']}

 # Parse the given arguments
-parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
-                                     'as a verticalized text file.'))
-parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
-parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
-parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), required=True)  # noqa
-parser.add_argument('--check-encoding', action='store_true')
+parser = ArgumentParser(description='Create annotations for a given txt file')
+parser.add_argument('input', help='Path to txt input file')
+parser.add_argument('output', help='Path to JSON output file')
+parser.add_argument('-l', '--language',
+                    choices=spacy_models.keys(),
+                    help='Language of the input (2-character ISO 639-1 language codes)',  # noqa
+                    required=True)
+parser.add_argument('-c', '--check-encoding',
+                    action='store_true',
+                    help='Check encoding of the input file, UTF-8 is used instead')  # noqa
 args = parser.parse_args()

-# If requested: Check the encoding of the text contents from the input file
-# Else: Use utf-8
-if args.check_encoding:
-    with open(args.input, "rb") as input_file:
-        bytes = input_file.read()
-        encoding = chardet.detect(bytes)['encoding']
-else:
-    encoding = 'utf-8'
-
-# hashing in chunks to avoid full RAM with huge files.
-with open(args.input, 'rb') as input_file:
-    source_md5 = hashlib.md5()
-    for chunk in iter(lambda: input_file.read(128 * source_md5.block_size), b''):
-        source_md5.update(chunk)
-    source_md5 = source_md5.hexdigest()
+with open(args.input, "rb") as text_file:
+    if args.check_encoding:
+        encoding = chardet.detect(text_file.read())['encoding']
+    else:
+        encoding = 'utf-8'
+    text_file.seek(0)
+    text_md5 = hashlib.md5()
+    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
+        text_md5.update(chunk)

 # Load the text contents from the input file
-with open(args.input, encoding=encoding) as input_file:
-    text = input_file.read()
-    # spaCys NLP is limited to strings with maximum 1 million characters at
-    # once. So we split it into suitable chunks.
-    text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)
-    # the text variable potentially occupies a lot of system memory and is no
-    # longer needed...
-    del text
+with open(args.input, encoding=encoding) as text_file:
+    # spaCy NLP is limited to strings with a maximum of 1 million characters at
+    # once. So we split it into suitable chunks.
+    text_chunks = textwrap.wrap(
+        text_file.read(),
+        1000000,
+        break_long_words=False,
+        break_on_hyphens=False,
+        drop_whitespace=False,
+        expand_tabs=False,
+        replace_whitespace=False
+    )

-# Setup the spaCy toolkit by loading the chosen language model
-model = SPACY_MODELS[args.language]
+model = spacy_models[args.language]
 nlp = spacy.load(model)
+meta = {
+    'generator': {
+        'name': 'nopaque NLP service',
+        'version': '1.0.0',
+        'arguments': {
+            'check_encoding': args.check_encoding,
+            'language': args.language
+        }
+    },
+    'file': {
+        'encoding': encoding,
+        'md5': text_md5.hexdigest(),
+        'name': os.path.basename(args.input)
+    }
+}
+
-# Create the output file in verticalized text format
-# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
-output_file_original_filename = args.output
-output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
-common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
-              + '<corpus>\n'
-              + '<text>\n'
-              + '<nlp name="spaCy:{}"\n'.format(SPACY_VERSION)
-              + '     model="{}:{}"\n'.format(model, SPACY_MODELS_VERSION)
-              + '     source-md5="{}" />\n'.format(source_md5))
-
-with open(output_file_original_filename, 'w+') as output_file_original, \
-        open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
-    output_file_original.write(common_xml)
-    output_file_stand_off.write(common_xml)
-    text_offset = 0
-    for text_chunk in text_chunks:
-        doc = nlp(text_chunk)
-        for sent in doc.sents:
-            output_file_original.write('<s>\n')
-            output_file_stand_off.write('<s>\n')
-            space_flag = False
-            # Skip whitespace tokens
-            sent_no_space = [token for token in sent
-                             if not token.text.isspace()]
-            # No space variant for cwb original .vrt file input.
-            for token in sent_no_space:
-                output_file_original.write('{}'.format(escape(token.text))
-                                           + '\t{}'.format(escape(token.lemma_))
-                                           + '\t{}'.format(token.pos_)
-                                           + '\t{}'.format(token.tag_)
-                                           + '\t{}\n'.format(token.ent_type_ or 'NULL'))
-            # Stand off variant with spaces.
-            for token in sent:
-                token_start = token.idx + text_offset
-                token_end = token.idx + len(token.text) + text_offset
-                output_file_stand_off.write('{}:{}'.format(token_start,
-                                                           token_end)
-                                            + '\t{}'.format(escape(token.lemma_))
-                                            + '\t{}'.format(token.pos_)
-                                            + '\t{}'.format(token.tag_)
-                                            + '\t{}\n'.format(token.ent_type_ or 'NULL'))
-            output_file_original.write('</s>\n')
-            output_file_stand_off.write('</s>\n')
-            text_offset = token_end + 1
-    output_file_original.write('</text>\n</corpus>')
-    output_file_stand_off.write('</text>\n</corpus>')
+tags = {
+    'token': {
+        'description': '',
+        'properties': {
+            'lemma': {
+                'description': 'The base form of the word',
+                'flags': ['required'],
+                'tagset': None
+            },
+            'pos': {
+                'description': 'The detailed part-of-speech tag',
+                'flags': ['required'],
+                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']}  # noqa
+            },
+            'simple_pos': {
+                'description': 'The simple UPOS part-of-speech tag',
+                'flags': ['required'],
+                'tagset': {
+                    'ADJ': 'adjective',
+                    'ADP': 'adposition',
+                    'ADV': 'adverb',
+                    'AUX': 'auxiliary verb',
+                    'CONJ': 'coordinating conjunction',
+                    'DET': 'determiner',
+                    'INTJ': 'interjection',
+                    'NOUN': 'noun',
+                    'NUM': 'numeral',
+                    'PART': 'particle',
+                    'PRON': 'pronoun',
+                    'PROPN': 'proper noun',
+                    'PUNCT': 'punctuation',
+                    'SCONJ': 'subordinating conjunction',
+                    'SYM': 'symbol',
+                    'VERB': 'verb',
+                    'X': 'other'
+                }
+            },
+            'ner': {
+                'description': 'Label indicating the type of the entity',
+                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
+            }
+        }
+    },
+    's': {
+        'description': 'Encodes the start and end of a sentence',
+        'properties': None
+    },
+    'ent': {
+        'description': 'Encodes the start and end of a named entity',
+        'properties': {
+            'type': {
+                'description': 'Label indicating the type of the entity',
+                'flags': ['required'],
+                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
+            }
+        }
+    }
+}
+
+annotations = []
+
+chunk_offset = 0
+while text_chunks:
+    text_chunk = text_chunks.pop(0)
+    doc = nlp(text_chunk)
+    for token in doc:
+        if token.is_space:
+            continue
+        if token.is_sent_start:
+            annotation = {'start': token.sent.start_char + chunk_offset,
+                          'end': token.sent.end_char + chunk_offset,
+                          'tag': 's'}
+            annotations.append(annotation)
+        # Check if the token is the start of an entity
+        if token.ent_iob == 3:
+            for ent_candidate in token.sent.ents:
+                if ent_candidate.start_char == token.idx:
+                    ent = ent_candidate
+                    annotation = {'start': ent.start_char + chunk_offset,
+                                  'end': ent.end_char + chunk_offset,
+                                  'tag': 'ent',
+                                  'properties': {'type': token.ent_type_}}
+                    annotations.append(annotation)
+                    break
+        annotation = {'start': token.idx + chunk_offset,
+                      'end': token.idx + len(token.text) + chunk_offset,
+                      'tag': 'token',
+                      'properties': {'pos': token.tag_,
+                                     'lemma': token.lemma_,
+                                     'simple_pos': token.pos_}}
+        if token.ent_type_:
+            annotation['properties']['ner'] = token.ent_type_
+        annotations.append(annotation)
+    chunk_offset += len(text_chunk)
+    text_chunk = None
+
+with open(args.output, 'w') as output_file:
+    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
+              output_file, indent=4)
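Note: spacy-nlp now emits a `*.nopaque-stand-off.json` file whose annotations carry character offsets into the original text. A short consumer sketch (not part of the commit; the file names are placeholders) showing how those offsets map back to the surface strings, which is essentially what vrt-creator does next:

``` python
import json

# Placeholder paths: any input txt and the matching stand-off JSON written by
# spacy-nlp will do.
TEXT_FILE = 'example.txt'
STAND_OFF_FILE = 'example.nopaque-stand-off.json'

with open(STAND_OFF_FILE) as stand_off_file:
    stand_off_data = json.load(stand_off_file)

# Decode the text with the encoding spacy-nlp recorded in the meta block.
with open(TEXT_FILE, encoding=stand_off_data['meta']['file']['encoding']) as text_file:
    text = text_file.read()

# Every annotation is {'start': ..., 'end': ..., 'tag': ..., 'properties': ...};
# start and end are offsets into the decoded text.
for annotation in stand_off_data['annotations'][:10]:
    surface = text[annotation['start']:annotation['end']]
    properties = annotation.get('properties') or {}
    print(annotation['tag'], repr(surface), properties.get('lemma', ''))
```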

vrt-creator (new executable file)

@@ -0,0 +1,130 @@
+#!/usr/bin/env python3.7
+# coding=utf-8
+
+from argparse import ArgumentParser
+from xml.sax.saxutils import escape
+import hashlib
+import json
+
+# Two global ressources - Not very elegant but it works for now
+stand_off_data = None
+text = None
+
+
+def meta_to_string():
+    string = ''
+    string += '<generator software="{} ({})" arguments="check_encoding: {}; language: {}"/>\n'.format(  # noqa
+        stand_off_data['meta']['generator']['name'],
+        stand_off_data['meta']['generator']['version'],
+        stand_off_data['meta']['generator']['arguments']['check_encoding'],
+        stand_off_data['meta']['generator']['arguments']['language']
+    )
+    string += '<file encoding="{}" name="{}" md5="{}"/>\n'.format(
+        stand_off_data['meta']['file']['encoding'],
+        stand_off_data['meta']['file']['name'],
+        stand_off_data['meta']['file']['md5']
+    )
+    return string
+
+
+def tags_to_string():
+    return ''
+
+
+def annotations_to_string(end=float('inf')):
+    string = ''
+    while stand_off_data['annotations']:
+        if stand_off_data['annotations'][0]['start'] >= end:
+            break
+        annotation = stand_off_data['annotations'].pop(0)
+        #######################################################################
+        # Check for malformed annotations                                     #
+        #######################################################################
+        if 'tag' not in annotation:
+            raise Exception('Annotation tag is missing')
+        if annotation['tag'] not in stand_off_data['tags']:
+            raise Exception('Unknown annotation tag: ' + annotation['tag'])
+        tag_model = stand_off_data['tags'][annotation['tag']]
+        if 'properties' in tag_model:
+            properties_model = tag_model['properties']
+            if properties_model is not None:
+                required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model)  # noqa
+                if required_properties and annotation['properties'] is None:
+                    raise Exception('There are required properties but the "Properties" attribute is missing')  # noqa
+                for property in required_properties:
+                    if property not in annotation['properties']:
+                        raise Exception('Required property is missing: ' + property)  # noqa
+        #######################################################################
+        # Process tokens ~ cwb's positional attributes                        #
+        #######################################################################
+        if annotation['tag'] == 'token':
+            string += '{}\t{}\t{}\t{}\t{}\n'.format(
+                escape(text[annotation['start']:annotation['end']]),
+                escape(annotation['properties']['pos']),
+                escape(annotation['properties']['lemma']),
+                escape(annotation['properties']['simple_pos']),
+                escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None')  # noqa
+            )
+        #######################################################################
+        # Process other tags ~ cwb's structural attributes                    #
+        #######################################################################
+        else:
+            properties = ''
+            if 'properties' in annotation and annotation['properties'] is not None:  # noqa
+                for property, value in annotation['properties'].items():
+                    if not value:
+                        continue
+                    if properties_model and property in properties_model:
+                        if 'flags' in properties_model and 'multiple' in properties_model['flags']:  # noqa
+                            properties += ' {}="|{}|"'.format(property, '|'.join(value))  # noqa
+                        else:
+                            properties += ' {}="{}"'.format(property, value)
+            string += '<' + annotation['tag'] + properties + '>\n'
+            string += annotations_to_string(end=min(annotation['end'], end))
+            string += '</' + annotation['tag'] + '>\n'
+    return string
+
+
+def main():
+    global stand_off_data
+    global text
+
+    # Parse the given arguments
+    parser = ArgumentParser(description='Create a vrt from JSON and txt')
+    parser.add_argument('text', help='Path to txt file')
+    parser.add_argument('stand_off_data', help='Path to JSON file')
+    parser.add_argument('output', help='Path to vrt output file')
+    args = parser.parse_args()
+
+    with open(args.stand_off_data) as stand_of_data_file:
+        stand_off_data = json.load(stand_of_data_file)
+
+    with open(args.text, "rb") as text_file:
+        text_md5 = hashlib.md5()
+        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
+            text_md5.update(chunk)
+    if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
+        raise Exception('md5 not equal')
+
+    with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa
+        text = text_file.read()
+
+    vrt = ''
+    vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
+    vrt += '<corpus>\n'
+    vrt += '<text>\n'
+    vrt += meta_to_string()
+    vrt += tags_to_string()
+    vrt += annotations_to_string()
+    vrt += '</text>\n'
+    vrt += '</corpus>'
+
+    with open(args.output, 'w') as vrt_file:
+        vrt_file.write(vrt)
+
+
+if __name__ == '__main__':
+    main()
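Note: for the `token` tag, vrt-creator renders one positional-attribute line per annotation (word form, pos, lemma, simple_pos, ner). A tiny sketch (not part of the commit; the annotation values are invented) of that mapping in isolation:

``` python
from xml.sax.saxutils import escape

# An invented token annotation in the shape spacy-nlp writes.
text = 'Barack Obama visited Bielefeld.'
annotation = {
    'start': 0,
    'end': 6,
    'tag': 'token',
    'properties': {'pos': 'NNP', 'lemma': 'Barack',
                   'simple_pos': 'PROPN', 'ner': 'PERSON'}
}

# Mirrors the format string used in annotations_to_string() above: the token's
# surface form followed by its pos, lemma, simple_pos and ner properties.
line = '{}\t{}\t{}\t{}\t{}\n'.format(
    escape(text[annotation['start']:annotation['end']]),
    escape(annotation['properties']['pos']),
    escape(annotation['properties']['lemma']),
    escape(annotation['properties']['simple_pos']),
    escape(annotation['properties'].get('ner', 'None'))
)
print(line, end='')
```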

wrapper/nlp

@@ -4,30 +4,35 @@
 from argparse import ArgumentParser
 import os
 import subprocess
+import sys

-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0b'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
+CONTAINER_LOG_DIR = '/logs'
 UID = str(os.getuid())
 GID = str(os.getgid())

 parser = ArgumentParser(add_help=False)
-parser.add_argument('-i', '--input-directory')
-parser.add_argument('-o', '--output-directory')
+parser.add_argument('-i', '--input-dir')
+parser.add_argument('-o', '--output-dir')
+parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()

 cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
-if args.output_directory is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
-                                 CONTAINER_OUTPUT_DIR)]
-    remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
-    remaining_args.insert(0, '-o')
-if args.input_directory is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
-                                 CONTAINER_INPUT_DIR)]
-    remaining_args.insert(0, CONTAINER_INPUT_DIR)
-    remaining_args.insert(0, '-i')
+if args.input_dir is not None:
+    mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
+    cmd += ['-v', mapping]
+    remaining_args += ['-i', CONTAINER_INPUT_DIR]
+if args.output_dir is not None:
+    mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
+    cmd += ['-v', mapping]
+    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
+if args.log_dir is not None:
+    mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
+    cmd += ['-v', mapping]
+    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args
-subprocess.run(cmd)
+sys.exit(subprocess.run(cmd).returncode)