diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3d97cac..9ee7b2d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,8 +1,5 @@ image: docker:19.03.13 -variables: - DOCKER_TLS_CERTDIR: "/certs" - services: - docker:19.03.13-dind @@ -10,6 +7,10 @@ stages: - build - push +variables: + DOCKER_TLS_CERTDIR: "/certs" + INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA + .reg_setup: before_script: - apk add --no-cache curl @@ -28,8 +29,6 @@ build_image: stage: build tags: - docker - variables: - INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA push_master: extends: @@ -47,7 +46,6 @@ push_master: - docker variables: IMAGE_TAG: $CI_REGISTRY_IMAGE:latest - INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA push_other: extends: @@ -68,4 +66,3 @@ push_other: - docker variables: IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME - INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA diff --git a/Dockerfile b/Dockerfile index 6c9f483..bdfddb6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,28 +7,29 @@ LABEL authors="Patrick Jentsch , Stephan Porada \n' - + '\n' - + '\n' - + '\n'.format(source_md5)) -with open(output_file_original_filename, 'w+') as output_file_original, \ - open(output_file_stand_off_filename, 'w+') as output_file_stand_off: +tags = { + 'token': { + 'description': '', + 'properties': { + 'lemma': { + 'description': 'The base form of the word', + 'flags': ['required'], + 'tagset': None + }, + 'pos': { + 'description': 'The detailed part-of-speech tag', + 'flags': ['required'], + 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']} # noqa + }, + 'simple_pos': { + 'description': 'The simple UPOS part-of-speech tag', + 'flags': ['required'], + 'tagset': { + 'ADJ': 'adjective', + 'ADP': 'adposition', + 'ADV': 'adverb', + 'AUX': 'auxiliary verb', + 'CONJ': 'coordinating conjunction', + 'DET': 'determiner', + 'INTJ': 'interjection', + 'NOUN': 'noun', + 'NUM': 'numeral', + 'PART': 
'particle', + 'PRON': 'pronoun', + 'PROPN': 'proper noun', + 'PUNCT': 'punctuation', + 'SCONJ': 'subordinating conjunction', + 'SYM': 'symbol', + 'VERB': 'verb', + 'X': 'other' + } + }, + 'ner': { + 'description': 'Label indicating the type of the entity', + 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']} # noqa + } + } + }, + 's': { + 'description': 'Encodes the start and end of a sentence', + 'properties': None + }, + 'ent': { + 'description': 'Encodes the start and end of a named entity', + 'properties': { + 'type': { + 'description': 'Label indicating the type of the entity', + 'flags': ['required'], + 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']} # noqa + } + } + } +} - output_file_original.write(common_xml) - output_file_stand_off.write(common_xml) - text_offset = 0 - for text_chunk in text_chunks: - doc = nlp(text_chunk) - for sent in doc.sents: - output_file_original.write('\n') - output_file_stand_off.write('\n') - space_flag = False - # Skip whitespace tokens - sent_no_space = [token for token in sent - if not token.text.isspace()] - # No space variant for cwb original .vrt file input. - for token in sent_no_space: - output_file_original.write('{}'.format(escape(token.text)) - + '\t{}'.format(escape(token.lemma_)) - + '\t{}'.format(token.pos_) - + '\t{}'.format(token.tag_) - + '\t{}\n'.format(token.ent_type_ or 'NULL')) - # Stand off variant with spaces. 
- for token in sent: - token_start = token.idx + text_offset - token_end = token.idx + len(token.text) + text_offset - output_file_stand_off.write('{}:{}'.format(token_start, - token_end) - + '\t{}'.format(escape(token.lemma_)) - + '\t{}'.format(token.pos_) - + '\t{}'.format(token.tag_) - + '\t{}\n'.format(token.ent_type_ or 'NULL')) - output_file_original.write('\n') - output_file_stand_off.write('\n') - text_offset = token_end + 1 - output_file_original.write('\n') - output_file_stand_off.write('\n') +annotations = [] + +chunk_offset = 0 +for text_chunk in text_chunks: + doc = nlp(text_chunk) + for token in doc: + if token.is_space: + continue + if token.is_sent_start: + annotation = {'start': token.sent.start_char + chunk_offset, + 'end': token.sent.end_char + chunk_offset, + 'tag': 's'} + annotations.append(annotation) + # Check if the token is the start of an entity + if token.ent_iob == 3: + for ent_candidate in token.sent.ents: + if ent_candidate.start_char == token.idx: + ent = ent_candidate + break + annotation = {'start': ent.start_char + chunk_offset, + 'end': ent.end_char + chunk_offset, + 'tag': 'ent', + 'properties': {'type': token.ent_type_}} + annotations.append(annotation) + annotation = {'start': token.idx + chunk_offset, + 'end': token.idx + len(token.text) + chunk_offset, + 'tag': 'token', + 'properties': {'pos': token.tag_, + 'lemma': token.lemma_, + 'simple_pos': token.pos_}} + if token.ent_type_: + annotation['properties']['ner'] = token.ent_type_ + annotations.append(annotation) + chunk_offset = len(text_chunk) + +with open(args.output, 'w') as output_file: + json.dump({'meta': meta, 'tags': tags, 'annotations': annotations}, + output_file, indent=4) diff --git a/vrt-creator b/vrt-creator new file mode 100644 index 0000000..48902f1 --- /dev/null +++ b/vrt-creator @@ -0,0 +1,107 @@ +#!/usr/bin/env python3.7 +# coding=utf-8 + +from argparse import ArgumentParser +from xml.sax.saxutils import escape +import json + +# Parse the given arguments 
def _parse_arguments(argv=None):
    """Parse the command line.

    ``argv`` is the list of argument strings; ``None`` lets argparse fall
    back to ``sys.argv[1:]``. Returns the namespace with ``input``,
    ``annotations`` and ``output``.
    """
    parser = ArgumentParser(description='Create annotations for a given txt file')  # noqa
    parser.add_argument('input', metavar='Path to txt input file')
    parser.add_argument('annotations', metavar='Path to JSON annotation file')
    parser.add_argument('output', metavar='Path to vrt output file')
    return parser.parse_args(argv)


# Module-level state read by the *_to_string functions below. main() fills
# both from the files named on the command line.
text = ''
stand_off_data = {}

# Extra entity applied to attribute values, which are wrapped in double quotes.
_ATTRIBUTE_ENTITIES = {'"': '&quot;'}


def meta_to_string():
    """Return the header lines built from ``stand_off_data['meta']``.

    Raises KeyError when one of the generator/file entries is missing.
    """
    # NOTE(review): both format strings contain no placeholders, so the
    # arguments are looked up but never rendered. This looks like markup that
    # was lost from the literals; restore it from the upstream file.
    string = ''
    string += '\n'.format(  # noqa
        stand_off_data['meta']['generator']['name'],
        stand_off_data['meta']['generator']['version'],
        stand_off_data['meta']['generator']['arguments']['check_encoding'],
        stand_off_data['meta']['generator']['arguments']['language']
    )
    string += '\n'.format(
        stand_off_data['meta']['file']['name'],
        stand_off_data['meta']['file']['md5']
    )
    return string


def tags_to_string():
    """Return the serialized tag definitions (currently always empty)."""
    return ''


def _annotation_queue():
    """Return the shared annotation queue, converting it to a deque once.

    Annotations are consumed from the front; a deque makes that O(1) instead
    of the O(n) cost of ``list.pop(0)``.
    """
    # Local import: the file's import block lies outside this section.
    from collections import deque
    queue = stand_off_data['annotations']
    if not isinstance(queue, deque):
        queue = stand_off_data['annotations'] = deque(queue)
    return queue


def _validate_annotation(annotation):
    """Check ``annotation`` against its tag model.

    Returns the tag's properties model as a dict (empty when the tag declares
    no properties). Raises ValueError for a missing or unknown tag and for
    missing required properties.
    """
    if 'tag' not in annotation:
        raise ValueError('Annotation tag is missing')
    if annotation['tag'] not in stand_off_data['tags']:
        raise ValueError('Unknown annotation tag: ' + annotation['tag'])
    tag_model = stand_off_data['tags'][annotation['tag']]
    # .get() covers both a missing 'properties' key and an explicit None, so
    # no state leaks over from a previously processed annotation.
    properties_model = tag_model.get('properties') or {}
    # A property is required when its own model entry carries the flag.
    required_properties = [
        name for name, model in properties_model.items()
        if 'required' in ((model or {}).get('flags') or ())
    ]
    properties = annotation.get('properties')
    if required_properties and properties is None:
        raise ValueError('There are required properties but the "Properties" attribute is missing')  # noqa
    for name in required_properties:
        if name not in properties:
            raise ValueError('Required property is missing: ' + name)
    return properties_model


def _attributes_to_string(annotation, properties_model):
    """Render the annotation's properties as ``name="value"`` attributes.

    Empty values and properties unknown to the tag model are skipped. Values
    of properties flagged ``multiple`` are joined as ``|a|b|``.
    """
    rendered = []
    for name, value in (annotation.get('properties') or {}).items():
        if not value or name not in properties_model:
            continue
        flags = (properties_model[name] or {}).get('flags') or ()
        if 'multiple' in flags:
            value = '|{}|'.format('|'.join(value))
        rendered.append(' {}="{}"'.format(
            name, escape(str(value), _ATTRIBUTE_ENTITIES)))
    return ''.join(rendered)


def annotations_to_string(end=float('inf')):
    """Consume queued annotations that start before ``end`` and render them.

    Token annotations become one tab separated line (text, pos, lemma,
    simple_pos, ner). Every other tag becomes an opening tag, the recursively
    rendered annotations that start before its end, and a closing tag.

    The annotations are removed from ``stand_off_data['annotations']`` as they
    are processed. Raises ValueError for malformed annotations.
    """
    queue = _annotation_queue()
    parts = []
    while queue and queue[0]['start'] < end:
        annotation = queue.popleft()
        properties_model = _validate_annotation(annotation)
        #######################################################################
        # Process tokens ~ cwb's positional attributes                        #
        #######################################################################
        if annotation['tag'] == 'token':
            properties = annotation['properties']
            parts.append('{}\t{}\t{}\t{}\t{}\n'.format(
                escape(text[annotation['start']:annotation['end']]),
                escape(properties['pos']),
                escape(properties['lemma']),
                escape(properties['simple_pos']),
                escape(properties['ner'] if 'ner' in properties else 'None')
            ))
        #######################################################################
        # Process other tags ~ cwb's structural attributes                    #
        #######################################################################
        else:
            attributes = _attributes_to_string(annotation, properties_model)
            parts.append('<' + annotation['tag'] + attributes + '>\n')
            # Children may not run past this element or the enclosing one.
            parts.append(
                annotations_to_string(end=min(annotation['end'], end)))
            # Close the element that was opened above.
            parts.append('</' + annotation['tag'] + '>\n')
    return ''.join(parts)


def main(argv=None):
    """Read the text and its annotations, then write the vrt output file."""
    global text, stand_off_data
    args = _parse_arguments(argv)

    with open(args.input) as text_file, \
            open(args.annotations) as data_file:
        text = text_file.read()
        stand_off_data = json.load(data_file)

    # NOTE(review): the bare '\n' and '' literals look like header/footer
    # markup that was lost; they are kept exactly as found.
    vrt = ''.join([
        '\n',
        '\n',
        '\n',
        meta_to_string(),
        tags_to_string(),
        annotations_to_string(),
        '\n',
        '',
    ])

    with open(args.output, 'w') as vrt_file:
        vrt_file.write(vrt)


if __name__ == '__main__':
    main()