Use JSON files for stand-off annotations.

Patrick Jentsch 2021-03-26 09:46:17 +01:00
parent d620c29f27
commit aa1bfa259d
5 changed files with 347 additions and 176 deletions
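For orientation, the stand-off JSON written by the reworked spacy-nlp script (see below) bundles three top-level keys: meta, tags and annotations. A minimal sketch of such a <name>.nopaque-stand-off.json file might look as follows; the file name, offsets and tag values are purely illustrative and the lengthy tags section is abridged:

{
    "meta": {
        "generator": {
            "name": "nopaque NLP service",
            "version": "1.0.0",
            "arguments": {"check_encoding": false, "language": "en"}
        },
        "file": {"name": "example.txt", "md5": "<md5 of example.txt>"}
    },
    "tags": {"token": {"...": "..."}, "s": {"...": "..."}, "ent": {"...": "..."}},
    "annotations": [
        {"start": 0, "end": 15, "tag": "s"},
        {"start": 0, "end": 6, "tag": "ent", "properties": {"type": "GPE"}},
        {"start": 0, "end": 6, "tag": "token",
         "properties": {"pos": "NNP", "lemma": "Berlin", "simple_pos": "PROPN", "ner": "GPE"}}
    ]
}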

.gitlab-ci.yml

@@ -1,8 +1,5 @@
 image: docker:19.03.13

-variables:
-  DOCKER_TLS_CERTDIR: "/certs"
-
 services:
   - docker:19.03.13-dind
@@ -10,6 +7,10 @@ stages:
   - build
   - push

+variables:
+  DOCKER_TLS_CERTDIR: "/certs"
+  INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
+
 .reg_setup:
   before_script:
     - apk add --no-cache curl
@@ -28,8 +29,6 @@ build_image:
   stage: build
   tags:
     - docker
-  variables:
-    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

 push_master:
   extends:
@@ -47,7 +46,6 @@ push_master:
     - docker
   variables:
     IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
-    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

 push_other:
   extends:
@@ -68,4 +66,3 @@ push_other:
     - docker
   variables:
     IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
-    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
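Note: besides consolidating the per-job variables blocks into one global variables section, the intermediate tag now also embeds $CI_COMMIT_REF_NAME, presumably to keep intermediate images built from different branches distinct in the registry.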

Dockerfile

@@ -7,28 +7,29 @@ LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <por
 ENV LANG=C.UTF-8

-RUN apt-get update
+RUN apt-get update \
+ && apt-get install --no-install-recommends --yes \
+      wget

-# Install the NLP pipeline and it's dependencies #
+# Install pipeline dependencies #
 ## Install pyFlow ##
-ENV PYFLOW_RELEASE=1.1.20
-ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
-RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
- && cd "pyflow-${PYFLOW_RELEASE}" \
+ENV PYFLOW_VERSION=1.1.20
+RUN wget --no-check-certificate --quiet \
+      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
+ && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
+ && cd "pyflow-${PYFLOW_VERSION}" \
  && apt-get install --no-install-recommends --yes \
       python2.7 \
  && python2.7 setup.py build install \
  && cd .. \
- && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
+ && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"

 ## Install spaCy ##
-ENV SPACY_VERSION=3.0.3
+ENV SPACY_VERSION=3.0.5
 RUN apt-get install --no-install-recommends --yes \
       python3.7 \
       python3-pip \
-      zip \
  && pip3 install \
       chardet \
       setuptools \
@@ -36,22 +37,22 @@ RUN apt-get install --no-install-recommends --yes \
  && pip3 install --upgrade pip \
  && pip3 install "spacy==${SPACY_VERSION}"

+# Only models that include the following components are compatibel:
+# lemmatizer, ner, parser, senter, tagger,
+ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,nl_core_news_md,pl_core_news_md,zh_core_web_md"
 ENV SPACY_MODELS_VERSION=3.0.0
-RUN python3 -m spacy download "da_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "de_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "el_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "en_core_web_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "es_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "fr_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "it_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "nl_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "pt_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "ru_core_news_md-${SPACY_MODELS_VERSION}" --direct \
- && python3 -m spacy download "zh_core_web_md-${SPACY_MODELS_VERSION}" --direct
+RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done

+## Further dependencies ##
+RUN apt-get install --no-install-recommends --yes \
+      procps \
+      zip

 ## Install Pipeline ##
-COPY nlp spacy-nlp /usr/local/bin/
+COPY nlp spacy-nlp vrt-creator /usr/local/bin/

 RUN rm -r /var/lib/apt/lists/*
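Note: the set of bundled models is now driven by the SPACY_MODELS build variable (the de, en, it, nl, pl and zh *_md pipelines), which mirrors the SPACY_MODELS dict in the nlp wrapper below, while spacy-nlp now derives its --language choices from the installed pipelines via spacy.info(). wget replaces the earlier ADD for fetching the pyFlow release, and procps and zip move into a separate "Further dependencies" layer.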

nlp

@@ -14,39 +14,14 @@ import os
 import sys

-SPACY_MODELS = {'da': 'da_core_news_md',
-                'de': 'de_core_news_md',
-                'el': 'el_core_news_md',
+SPACY_MODELS = {'de': 'de_core_news_md',
                 'en': 'en_core_web_md',
-                'es': 'es_core_news_md',
-                'fr': 'fr_core_news_md',
                 'it': 'it_core_news_md',
                 'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md',
-                'ru': 'ru_core_news_md',
+                'pl': 'pl_core_news_md',
                 'zh': 'zh_core_web_md'}

-def parse_args():
-    parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.')
-    parser.add_argument('-i', '--input-directory',
-                        help='Input directory (only txt files get processed)',
-                        required=True)
-    parser.add_argument('-o', '--output-directory',
-                        help='Output directory',
-                        required=True)
-    parser.add_argument('-l', '--language',
-                        choices=SPACY_MODELS.keys(),
-                        required=True)
-    parser.add_argument('--check-encoding', action='store_true')
-    parser.add_argument('--log-dir')
-    parser.add_argument('--n-cores',
-                        default=min(4, multiprocessing.cpu_count()),
-                        help='total number of cores available', type=int)
-    parser.add_argument('--zip', help='Zips everything into one archive.')
-    return parser.parse_args()

 class NLPPipelineJob:
     """An NLP pipeline job class
@@ -56,8 +31,6 @@ class NLPPipelineJob:
     Arguments:
     file -- Path to the file
     output_dir -- Path to a directory, where job results a stored
-    intermediate_dir -- Path to a directory, where intermediate files are
-                        stored.
     """

     def __init__(self, file, output_dir):
@@ -67,21 +40,11 @@
 class NLPPipeline(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, check_encoding, n_cores, zip):
+    def __init__(self, input_dir, output_dir, check_encoding, lang, zip):
         self.input_dir = input_dir
-        self.lang = lang
         self.output_dir = output_dir
         self.check_encoding = check_encoding
-        self.n_cores = n_cores
-        self.output_dir = output_dir
-        if zip is None:
-            self.zip = zip
-        else:
-            if zip.lower().endswith('.zip'):
-                # Remove .zip file extension if provided
-                self.zip = zip[:-4]
-                self.zip = self.zip if self.zip else 'output'
-            else:
-                self.zip = zip
+        self.lang = lang
+        self.zip = zip
         self.jobs = collect_jobs(self.input_dir, self.output_dir)
@@ -96,9 +59,7 @@ class NLPPipeline(WorkflowRunner):
         '''
         setup_output_directory_tasks = []
         for i, job in enumerate(self.jobs):
-            cmd = 'mkdir'
-            cmd += ' -p'
-            cmd += ' "{}"'.format(job.output_dir)
+            cmd = 'mkdir -p "{}"'.format(job.output_dir)
             lbl = 'setup_output_directory_-_{}'.format(i)
             task = self.addTask(command=cmd, label=lbl)
             setup_output_directory_tasks.append(task)
@@ -109,20 +70,36 @@ class NLPPipeline(WorkflowRunner):
         ' ##################################################
         '''
         nlp_tasks = []
-        n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
+        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
-            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
+            output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
             cmd = 'spacy-nlp'
-            cmd += ' -i "{}"'.format(job.file)
             cmd += ' -l "{}"'.format(self.lang)
-            cmd += ' -o "{}"'.format(output_file)
-            if self.check_encoding:
-                cmd += ' --check-encoding'
+            cmd += ' --check-encoding' if self.check_encoding else ''
+            cmd += ' "{}"'.format(job.file)
+            cmd += ' "{}"'.format(output_file)
             deps = 'setup_output_directory_-_{}'.format(i)
             lbl = 'nlp_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                nCores=n_cores)
             nlp_tasks.append(task)

+        '''
+        ' ##################################################
+        ' # vrt creation #
+        ' ##################################################
+        '''
+        for i, job in enumerate(self.jobs):
+            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
+            nlp_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
+            cmd = 'vrt-creator'
+            cmd += ' "{}"'.format(job.file)
+            cmd += ' "{}"'.format(nlp_file)
+            cmd += ' "{}"'.format(output_file)
+            deps = 'nlp_-_{}'.format(i)
+            lbl = 'vrt_creation_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+
         '''
         ' ##################################################
         ' # zip creation #
@@ -136,7 +113,7 @@ class NLPPipeline(WorkflowRunner):
             cmd += ' -r'
             cmd += ' "{}.zip" .'.format(self.zip)
             cmd += ' -x "pyflow.data*"'
-            cmd += ' -i "*.vrt"'
+            cmd += ' -i "*.vrt" "*.json"'
             cmd += ' && '
             cmd += 'cd -'
             deps = nlp_tasks
@@ -152,20 +129,64 @@ def collect_jobs(input_dir, output_dir):
             jobs += collect_jobs(os.path.join(input_dir, file),
                                  os.path.join(output_dir, file))
         elif file.lower().endswith('.txt'):
-            jobs.append(NLPPipelineJob(os.path.join(input_dir, file),
-                                       os.path.join(output_dir, file)))
+            job = NLPPipelineJob(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file))
+            jobs.append(job)
     return jobs

+def parse_args():
+    parser = ArgumentParser(description='NLP pipeline for TXT file processing',
+                            prog='NLP pipeline')
+    parser.add_argument('-i', '--input-dir',
+                        help='Input directory',
+                        required=True)
+    parser.add_argument('-o', '--output-dir',
+                        help='Output directory',
+                        required=True)
+    parser.add_argument('-l', '--language',
+                        choices=SPACY_MODELS.keys(),
+                        required=True)
+    parser.add_argument('--check-encoding',
+                        action='store_true')
+    parser.add_argument('--log-dir',
+                        help='Logging directory')
+    parser.add_argument('--mem-mb',
+                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
+                        type=int)
+    parser.add_argument('--n-cores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        help='Number of CPU threads to be used',
+                        type=int)
+    parser.add_argument('--zip',
+                        help='Create one zip file per filetype')
+    parser.add_argument('-v', '--version',
+                        action='version',
+                        help='Returns the current version of the NLP pipeline',
+                        version='%(prog)s {}'.format(__version__))
+    args = parser.parse_args()
+
+    # Set some tricky default values and check for insufficient input
+    if args.log_dir is None:
+        args.log_dir = args.output_dir
+    if args.n_cores < 1:
+        raise Exception('--n-cores must be greater or equal 1')
+    if args.mem_mb is None:
+        max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
+        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
+    if args.mem_mb < 2048:
+        raise Exception('--mem-mb must be greater or equal 2048')
+    if args.zip is not None and args.zip.lower().endswith('.zip'):
+        # Remove .zip file extension if provided
+        args.zip = args.zip[:-4]
+        args.zip = args.zip if args.zip else 'output'
+    return args

 def main():
     args = parse_args()
-    nlp_pipeline = NLPPipeline(args.input_directory, args.language,
-                               args.output_directory, args.check_encoding,
-                               args.n_cores, args.zip)
-    retval = nlp_pipeline.run(
-        dataDirRoot=(args.log_dir or args.output_directory),
-        nCores=args.n_cores
-    )
+    nlp_pipeline = NLPPipeline(args.input_dir, args.output_dir, args.check_encoding, args.language, args.zip)  # noqa
+    retval = nlp_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
     sys.exit(retval)
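With the new argument names, a run of the wrapper might look like nlp -i /input -o /output -l en --check-encoding --n-cores 4 --zip output (paths illustrative). For each txt file it now schedules two tasks: spacy-nlp -l <lang> <file>.txt <file>.nopaque-stand-off.json, followed by vrt-creator <file>.txt <file>.nopaque-stand-off.json <file>.vrt, so the JSON stand-off annotations are produced first and the VRT file is derived from them; both file types end up in the optional zip archive.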

spacy-nlp

@@ -2,56 +2,39 @@
 # coding=utf-8

 from argparse import ArgumentParser
-from xml.sax.saxutils import escape
 import chardet
 import hashlib
+import json
 import os
 import spacy
 import textwrap

-SPACY_MODELS = {'da': 'da_core_news_md',
-                'de': 'de_core_news_md',
-                'el': 'el_core_news_md',
-                'en': 'en_core_web_md',
-                'es': 'es_core_news_md',
-                'fr': 'fr_core_news_md',
-                'it': 'it_core_news_md',
-                'nl': 'nl_core_news_md',
-                'pt': 'pt_core_news_md',
-                'ru': 'ru_core_news_md',
-                'zh': 'zh_core_web_md'}
-SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
-SPACY_VERSION = os.environ.get('SPACY_VERSION')
+spacy_models = {spacy.info(pipeline)['lang']: pipeline
+                for pipeline in spacy.info()['pipelines']}

 # Parse the given arguments
-parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
-                                     'as a verticalized text file.'))
-parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
-parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
-parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), required=True)  # noqa
-parser.add_argument('--check-encoding', action='store_true')
+parser = ArgumentParser(description='Create annotations for a given txt file')
+parser.add_argument('input', metavar='Path to txt input file')
+parser.add_argument('output', metavar='Path to JSON output file')
+parser.add_argument('-l', '--language',
+                    choices=spacy_models.keys(),
+                    required=True)
+parser.add_argument('-c', '--check-encoding', action='store_true')
 args = parser.parse_args()

 # If requested: Check the encoding of the text contents from the input file
 # Else: Use utf-8
-if args.check_encoding:
-    with open(args.input, "rb") as input_file:
-        bytes = input_file.read()
-        encoding = chardet.detect(bytes)['encoding']
-else:
-    encoding = 'utf-8'
+with open(args.input, "rb") as input_file:
+    if args.check_encoding:
+        encoding = chardet.detect(input_file.read())['encoding']
+    else:
+        encoding = 'utf-8'
+    text_md5 = hashlib.md5()
+    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
+        text_md5.update(chunk)

-# hashing in chunks to avoid full RAM with huge files.
-with open(args.input, 'rb') as input_file:
-    source_md5 = hashlib.md5()
-    for chunk in iter(lambda: input_file.read(128 * source_md5.block_size), b''):
-        source_md5.update(chunk)
-    source_md5 = source_md5.hexdigest()

 # Load the text contents from the input file
 with open(args.input, encoding=encoding) as input_file:
@@ -63,57 +46,119 @@ with open(args.input, encoding=encoding) as input_file:
 # longer needed...
 del text

-# Setup the spaCy toolkit by loading the chosen language model
-model = SPACY_MODELS[args.language]
+model = spacy_models[args.language]
 nlp = spacy.load(model)

+meta = {
+    'generator': {
+        'name': 'nopaque NLP service',
+        'version': '1.0.0',
+        'arguments': {
+            'check_encoding': args.check_encoding,
+            'language': args.language
+        }
+    },
+    'file': {
+        'md5': text_md5.hexdigest(),
+        'name': os.path.basename(args.input)
+    }
+}

-# Create the output file in verticalized text format
-# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
-output_file_original_filename = args.output
-output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
-common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
-              + '<corpus>\n'
-              + '<text>\n'
-              + '<nlp name="spaCy:{}"\n'.format(SPACY_VERSION)
-              + ' model="{}:{}"\n'.format(model, SPACY_MODELS_VERSION)
-              + ' source-md5="{}" />\n'.format(source_md5))
-
-with open(output_file_original_filename, 'w+') as output_file_original, \
-        open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
+tags = {
+    'token': {
+        'description': '',
+        'properties': {
+            'lemma': {
+                'description': 'The base form of the word',
+                'flags': ['required'],
+                'tagset': None
+            },
+            'pos': {
+                'description': 'The detailed part-of-speech tag',
+                'flags': ['required'],
+                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']}  # noqa
+            },
+            'simple_pos': {
+                'description': 'The simple UPOS part-of-speech tag',
+                'flags': ['required'],
+                'tagset': {
+                    'ADJ': 'adjective',
+                    'ADP': 'adposition',
+                    'ADV': 'adverb',
+                    'AUX': 'auxiliary verb',
+                    'CONJ': 'coordinating conjunction',
+                    'DET': 'determiner',
+                    'INTJ': 'interjection',
+                    'NOUN': 'noun',
+                    'NUM': 'numeral',
+                    'PART': 'particle',
+                    'PRON': 'pronoun',
+                    'PROPN': 'proper noun',
+                    'PUNCT': 'punctuation',
+                    'SCONJ': 'subordinating conjunction',
+                    'SYM': 'symbol',
+                    'VERB': 'verb',
+                    'X': 'other'
+                }
+            },
+            'ner': {
+                'description': 'Label indicating the type of the entity',
+                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
+            }
+        }
+    },
+    's': {
+        'description': 'Encodes the start and end of a sentence',
+        'properties': None
+    },
+    'ent': {
+        'description': 'Encodes the start and end of a named entity',
+        'properties': {
+            'type': {
+                'description': 'Label indicating the type of the entity',
+                'flags': ['required'],
+                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
+            }
+        }
+    }
+}

-    output_file_original.write(common_xml)
-    output_file_stand_off.write(common_xml)
-    text_offset = 0
-    for text_chunk in text_chunks:
-        doc = nlp(text_chunk)
-        for sent in doc.sents:
-            output_file_original.write('<s>\n')
-            output_file_stand_off.write('<s>\n')
-            space_flag = False
-            # Skip whitespace tokens
-            sent_no_space = [token for token in sent
-                             if not token.text.isspace()]
-            # No space variant for cwb original .vrt file input.
-            for token in sent_no_space:
-                output_file_original.write('{}'.format(escape(token.text))
-                                           + '\t{}'.format(escape(token.lemma_))
-                                           + '\t{}'.format(token.pos_)
-                                           + '\t{}'.format(token.tag_)
-                                           + '\t{}\n'.format(token.ent_type_ or 'NULL'))
-            # Stand off variant with spaces.
-            for token in sent:
-                token_start = token.idx + text_offset
-                token_end = token.idx + len(token.text) + text_offset
-                output_file_stand_off.write('{}:{}'.format(token_start,
-                                                           token_end)
-                                            + '\t{}'.format(escape(token.lemma_))
-                                            + '\t{}'.format(token.pos_)
-                                            + '\t{}'.format(token.tag_)
-                                            + '\t{}\n'.format(token.ent_type_ or 'NULL'))
-            output_file_original.write('</s>\n')
-            output_file_stand_off.write('</s>\n')
-            text_offset = token_end + 1
-    output_file_original.write('</text>\n</corpus>')
-    output_file_stand_off.write('</text>\n</corpus>')
+annotations = []
+
+chunk_offset = 0
+for text_chunk in text_chunks:
+    doc = nlp(text_chunk)
+    for token in doc:
+        if token.is_space:
+            continue
+        if token.is_sent_start:
+            annotation = {'start': token.sent.start_char + chunk_offset,
+                          'end': token.sent.end_char + chunk_offset,
+                          'tag': 's'}
+            annotations.append(annotation)
+        # Check if the token is the start of an entity
+        if token.ent_iob == 3:
+            for ent_candidate in token.sent.ents:
+                if ent_candidate.start_char == token.idx:
+                    ent = ent_candidate
+                    break
+            annotation = {'start': ent.start_char + chunk_offset,
+                          'end': ent.end_char + chunk_offset,
+                          'tag': 'ent',
+                          'properties': {'type': token.ent_type_}}
+            annotations.append(annotation)
+        annotation = {'start': token.idx + chunk_offset,
+                      'end': token.idx + len(token.text) + chunk_offset,
+                      'tag': 'token',
+                      'properties': {'pos': token.tag_,
+                                     'lemma': token.lemma_,
+                                     'simple_pos': token.pos_}}
+        if token.ent_type_:
+            annotation['properties']['ner'] = token.ent_type_
+        annotations.append(annotation)
+    chunk_offset = len(text_chunk)
+
+with open(args.output, 'w') as output_file:
+    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
+              output_file, indent=4)
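Since the annotations only carry character offsets into the input text, a consumer resolves them by slicing that text, the same way vrt-creator does with text[annotation['start']:annotation['end']]. A minimal sketch of such a consumer (not part of the commit; file names are hypothetical, matching the pipeline's naming scheme):

import json

# Hypothetical paths: the txt file fed to spacy-nlp and the JSON it produced
with open('example.txt', encoding='utf-8') as f:
    text = f.read()
with open('example.nopaque-stand-off.json') as f:
    stand_off = json.load(f)

for annotation in stand_off['annotations']:
    if annotation['tag'] != 'token':
        continue
    # The surface form is not stored in the JSON; recover it via the offsets
    surface = text[annotation['start']:annotation['end']]
    print(surface, annotation['properties']['lemma'], annotation['properties']['simple_pos'])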

vrt-creator (new file)

@@ -0,0 +1,107 @@
#!/usr/bin/env python3.7
# coding=utf-8

from argparse import ArgumentParser
from xml.sax.saxutils import escape
import json

# Parse the given arguments
parser = ArgumentParser(description='Create annotations for a given txt file')
parser.add_argument('input', metavar='Path to txt input file')
parser.add_argument('annotations', metavar='Path to JSON annotation file')
parser.add_argument('output', metavar='Path to vrt output file')
args = parser.parse_args()

with open(args.input) as text_file, \
     open(args.annotations) as data_file:
    text = text_file.read()
    stand_off_data = json.load(data_file)


def meta_to_string():
    string = ''
    string += '<generator software="{} ({})" arguments="check_encoding: {}; language: {}"/>\n'.format(  # noqa
        stand_off_data['meta']['generator']['name'],
        stand_off_data['meta']['generator']['version'],
        stand_off_data['meta']['generator']['arguments']['check_encoding'],
        stand_off_data['meta']['generator']['arguments']['language']
    )
    string += '<file name="{}" md5="{}"/>\n'.format(
        stand_off_data['meta']['file']['name'],
        stand_off_data['meta']['file']['md5']
    )
    return string


def tags_to_string():
    return ''


def annotations_to_string(end=float('inf')):
    string = ''
    while stand_off_data['annotations']:
        if stand_off_data['annotations'][0]['start'] >= end:
            break
        annotation = stand_off_data['annotations'].pop(0)
        #######################################################################
        # Check for malformed annotations #
        #######################################################################
        if 'tag' not in annotation:
            raise Exception('Annotation tag is missing')
        if annotation['tag'] not in stand_off_data['tags']:
            raise Exception('Unknown annotation tag: ' + annotation['tag'])
        tag_model = stand_off_data['tags'][annotation['tag']]
        if 'properties' in tag_model:
            properties_model = tag_model['properties']
            if properties_model is not None:
                required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model)  # noqa
                if required_properties and annotation['properties'] is None:
                    raise Exception('There are required properties but the "Properties" attribute is missing')  # noqa
                for property in required_properties:
                    if property not in annotation['properties']:
                        raise Exception('Required property is missing: ' + property)  # noqa
        #######################################################################
        # Process tokens ~ cwb's positional attributes #
        #######################################################################
        if annotation['tag'] == 'token':
            string += '{}\t{}\t{}\t{}\t{}\n'.format(
                escape(text[annotation['start']:annotation['end']]),
                escape(annotation['properties']['pos']),
                escape(annotation['properties']['lemma']),
                escape(annotation['properties']['simple_pos']),
                escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None')  # noqa
            )
        #######################################################################
        # Process other tags ~ cwb's structural attributes #
        #######################################################################
        else:
            properties = ''
            if 'properties' in annotation and annotation['properties'] is not None:  # noqa
                for property, value in annotation['properties'].items():
                    if not value:
                        continue
                    if properties_model and property in properties_model:
                        if 'flags' in properties_model and 'multiple' in properties_model['flags']:  # noqa
                            properties += ' {}="|{}|"'.format(property, '|'.join(value))  # noqa
                        else:
                            properties += ' {}="{}"'.format(property, value)
            string += '<' + annotation['tag'] + properties + '>\n'
            string += annotations_to_string(end=min(annotation['end'], end))
            string += '</' + annotation['tag'] + '>\n'
    return string


vrt = ''
vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
vrt += '<corpus>\n'
vrt += '<text>\n'
vrt += meta_to_string()
vrt += tags_to_string()
vrt += annotations_to_string()
vrt += '</text>\n'
vrt += '</corpus>'

with open(args.output, 'w') as vrt_file:
    vrt_file.write(vrt)
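For illustration only (the md5 and all tag values are made up): for a short input such as "Berlin is nice." the VRT produced by this script might look roughly as follows, with one line per token in the column order word, pos, lemma, simple_pos, ner (columns are tab-separated in the real output) and sentences/entities encoded as structural elements:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<corpus>
<text>
<generator software="nopaque NLP service (1.0.0)" arguments="check_encoding: False; language: en"/>
<file name="example.txt" md5="..."/>
<s>
<ent type="GPE">
Berlin  NNP  Berlin  PROPN  GPE
</ent>
is      VBZ  be      AUX    None
nice    JJ   nice    ADJ    None
.       .    .       PUNCT  None
</s>
</text>
</corpus>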