diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3d97cac..9ee7b2d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,8 +1,5 @@
image: docker:19.03.13
-variables:
- DOCKER_TLS_CERTDIR: "/certs"
-
services:
- docker:19.03.13-dind
@@ -10,6 +7,10 @@ stages:
- build
- push
+variables:
+  DOCKER_TLS_CERTDIR: "/certs"
+  # Use the slugified ref name: raw branch names may contain characters
+  # (for example "/") that are not allowed in an image tag.
+  INTERMEDIATE_IMAGE_TAG: "$CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG-$CI_COMMIT_SHA"
+
.reg_setup:
before_script:
- apk add --no-cache curl
@@ -28,8 +29,6 @@ build_image:
stage: build
tags:
- docker
- variables:
- INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
push_master:
extends:
@@ -47,7 +46,6 @@ push_master:
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
- INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
push_other:
extends:
@@ -68,4 +66,3 @@ push_other:
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
- INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
diff --git a/Dockerfile b/Dockerfile
index 6c9f483..bdfddb6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,28 +7,29 @@ LABEL authors="Patrick Jentsch
, Stephan Porada \n'
- + '\n'
- + '\n'
- + '\n'.format(source_md5))
-with open(output_file_original_filename, 'w+') as output_file_original, \
- open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
+tags = {
+ 'token': {
+ 'description': '',
+ 'properties': {
+ 'lemma': {
+ 'description': 'The base form of the word',
+ 'flags': ['required'],
+ 'tagset': None
+ },
+ 'pos': {
+ 'description': 'The detailed part-of-speech tag',
+ 'flags': ['required'],
+ 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']} # noqa
+ },
+ 'simple_pos': {
+ 'description': 'The simple UPOS part-of-speech tag',
+ 'flags': ['required'],
+ 'tagset': {
+ 'ADJ': 'adjective',
+ 'ADP': 'adposition',
+ 'ADV': 'adverb',
+ 'AUX': 'auxiliary verb',
+ 'CONJ': 'coordinating conjunction',
+ 'DET': 'determiner',
+ 'INTJ': 'interjection',
+ 'NOUN': 'noun',
+ 'NUM': 'numeral',
+ 'PART': 'particle',
+ 'PRON': 'pronoun',
+ 'PROPN': 'proper noun',
+ 'PUNCT': 'punctuation',
+ 'SCONJ': 'subordinating conjunction',
+ 'SYM': 'symbol',
+ 'VERB': 'verb',
+ 'X': 'other'
+ }
+ },
+ 'ner': {
+ 'description': 'Label indicating the type of the entity',
+ 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']} # noqa
+ }
+ }
+ },
+ 's': {
+ 'description': 'Encodes the start and end of a sentence',
+ 'properties': None
+ },
+ 'ent': {
+ 'description': 'Encodes the start and end of a named entity',
+ 'properties': {
+ 'type': {
+ 'description': 'Label indicating the type of the entity',
+ 'flags': ['required'],
+ 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']} # noqa
+ }
+ }
+ }
+}
- output_file_original.write(common_xml)
- output_file_stand_off.write(common_xml)
- text_offset = 0
- for text_chunk in text_chunks:
- doc = nlp(text_chunk)
- for sent in doc.sents:
- output_file_original.write('\n')
- output_file_stand_off.write('\n')
- space_flag = False
- # Skip whitespace tokens
- sent_no_space = [token for token in sent
- if not token.text.isspace()]
- # No space variant for cwb original .vrt file input.
- for token in sent_no_space:
- output_file_original.write('{}'.format(escape(token.text))
- + '\t{}'.format(escape(token.lemma_))
- + '\t{}'.format(token.pos_)
- + '\t{}'.format(token.tag_)
- + '\t{}\n'.format(token.ent_type_ or 'NULL'))
- # Stand off variant with spaces.
- for token in sent:
- token_start = token.idx + text_offset
- token_end = token.idx + len(token.text) + text_offset
- output_file_stand_off.write('{}:{}'.format(token_start,
- token_end)
- + '\t{}'.format(escape(token.lemma_))
- + '\t{}'.format(token.pos_)
- + '\t{}'.format(token.tag_)
- + '\t{}\n'.format(token.ent_type_ or 'NULL'))
- output_file_original.write('\n')
- output_file_stand_off.write('\n')
- text_offset = token_end + 1
- output_file_original.write('\n')
- output_file_stand_off.write('\n')
+annotations = []
+
+chunk_offset = 0
+for text_chunk in text_chunks:
+ doc = nlp(text_chunk)
+ for token in doc:
+ if token.is_space:
+ continue
+ if token.is_sent_start:
+ annotation = {'start': token.sent.start_char + chunk_offset,
+ 'end': token.sent.end_char + chunk_offset,
+ 'tag': 's'}
+ annotations.append(annotation)
+ # Check if the token is the start of an entity
+ if token.ent_iob == 3:
+ for ent_candidate in token.sent.ents:
+ if ent_candidate.start_char == token.idx:
+ ent = ent_candidate
+ break
+ annotation = {'start': ent.start_char + chunk_offset,
+ 'end': ent.end_char + chunk_offset,
+ 'tag': 'ent',
+ 'properties': {'type': token.ent_type_}}
+ annotations.append(annotation)
+ annotation = {'start': token.idx + chunk_offset,
+ 'end': token.idx + len(token.text) + chunk_offset,
+ 'tag': 'token',
+ 'properties': {'pos': token.tag_,
+ 'lemma': token.lemma_,
+ 'simple_pos': token.pos_}}
+ if token.ent_type_:
+ annotation['properties']['ner'] = token.ent_type_
+ annotations.append(annotation)
+ chunk_offset = len(text_chunk)
+
+with open(args.output, 'w') as output_file:
+ json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
+ output_file, indent=4)
diff --git a/vrt-creator b/vrt-creator
new file mode 100644
index 0000000..48902f1
--- /dev/null
+++ b/vrt-creator
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3.7
+# coding=utf-8
+
+from argparse import ArgumentParser
+from xml.sax.saxutils import escape
+import json
+
+# Parse the given arguments
+parser = ArgumentParser(description='Create annotations for a given txt file')
+parser.add_argument('input', metavar='Path to txt input file')
+parser.add_argument('annotations', metavar='Path to JSON annotation file')
+parser.add_argument('output', metavar='Path to vrt output file')
+args = parser.parse_args()
+
+with open(args.input) as text_file, \
+ open(args.annotations) as data_file:
+ text = text_file.read()
+ stand_off_data = json.load(data_file)
+
+
+def meta_to_string():
+ """Build the metadata part of the output from stand_off_data['meta'].
+
+ Looks up the generator name, version and arguments (check_encoding,
+ language) as well as the file name and md5 in the module-level
+ stand_off_data and returns the formatted result as a str. A missing key
+ raises KeyError.
+
+ NOTE(review): as written, both format strings consist of a newline only
+ and contain no replacement fields, so the looked-up values do not appear
+ in the result -- presumably the markup templates were lost; confirm
+ against the intended vrt header.
+ """
+ string = ''
+ # Generator line: name, version, check_encoding, language
+ string += '\n'.format( # noqa
+ stand_off_data['meta']['generator']['name'],
+ stand_off_data['meta']['generator']['version'],
+ stand_off_data['meta']['generator']['arguments']['check_encoding'],
+ stand_off_data['meta']['generator']['arguments']['language']
+ )
+ # File line: name, md5
+ string += '\n'.format(
+ stand_off_data['meta']['file']['name'],
+ stand_off_data['meta']['file']['md5']
+ )
+ return string
+
+
+def tags_to_string():
+ return ''
+
+
+def annotations_to_string(end=float('inf')):
+ string = ''
+ while stand_off_data['annotations']:
+ if stand_off_data['annotations'][0]['start'] >= end:
+ break
+ annotation = stand_off_data['annotations'].pop(0)
+ #######################################################################
+ # Check for malformed annotations #
+ #######################################################################
+ if 'tag' not in annotation:
+ raise Exception('Annotation tag is missing')
+
+ if annotation['tag'] not in stand_off_data['tags']:
+ raise Exception('Unknown annotation tag: ' + annotation['tag'])
+
+ tag_model = stand_off_data['tags'][annotation['tag']]
+ if 'properties' in tag_model:
+ properties_model = tag_model['properties']
+ if properties_model is not None:
+ required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model) # noqa
+ if required_properties and annotation['properties'] is None:
+ raise Exception('There are required properties but the "Properties" attribute is missing') # noqa
+ for property in required_properties:
+ if property not in annotation['properties']:
+ raise Exception('Required property is missing: ' + property) # noqa
+ #######################################################################
+ # Process tokens ~ cwb's positional attributes #
+ #######################################################################
+ if annotation['tag'] == 'token':
+ string += '{}\t{}\t{}\t{}\t{}\n'.format(
+ escape(text[annotation['start']:annotation['end']]),
+ escape(annotation['properties']['pos']),
+ escape(annotation['properties']['lemma']),
+ escape(annotation['properties']['simple_pos']),
+ escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None') # noqa
+ )
+ #######################################################################
+ # Process other tags ~ cwb's structural attributes #
+ #######################################################################
+ else:
+ properties = ''
+ if 'properties' in annotation and annotation['properties'] is not None: # noqa
+ for property, value in annotation['properties'].items():
+ if not value:
+ continue
+ if properties_model and property in properties_model:
+ if 'flags' in properties_model and 'multiple' in properties_model['flags']: # noqa
+ properties += ' {}="|{}|"'.format(property, '|'.join(value)) # noqa
+ else:
+ properties += ' {}="{}"'.format(property, value)
+ string += '<' + annotation['tag'] + properties + '>\n'
+ string += annotations_to_string(end=min(annotation['end'], end))
+ string += '' + annotation['tag'] + '>\n'
+ return string
+
+
+# Assemble the output document: leading literal lines, then the metadata,
+# tag and annotation sections, then the trailing literals.
+# NOTE(review): the literal pieces below are only newlines / an empty string
+# as written -- presumably the enclosing markup was lost; confirm.
+vrt = ''
+vrt += '\n'
+vrt += '\n'
+vrt += '\n'
+vrt += meta_to_string()
+vrt += tags_to_string()
+# annotations_to_string() pops entries off stand_off_data['annotations'],
+# so the list is consumed by this call.
+vrt += annotations_to_string()
+vrt += '\n'
+vrt += ''
+
+# Write the assembled document to the output path given on the command line.
+with open(args.output, 'w') as vrt_file:
+ vrt_file.write(vrt)