From 0ba0c14b7268c24c1feb787f8d5f3751e2f74da0 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 10 Aug 2021 14:43:55 +0200 Subject: [PATCH] First attempt --- .../stand_off_data/export.py | 80 ----------- .../stand_off_data/models.py | 97 ++++++++++++- vrt-creator | 6 + vrt-creator.bak | 130 ------------------ 4 files changed, 99 insertions(+), 214 deletions(-) delete mode 100644 packages/stand-off-data-py/stand_off_data/export.py delete mode 100644 vrt-creator.bak diff --git a/packages/stand-off-data-py/stand_off_data/export.py b/packages/stand-off-data-py/stand_off_data/export.py deleted file mode 100644 index afb49f6..0000000 --- a/packages/stand-off-data-py/stand_off_data/export.py +++ /dev/null @@ -1,80 +0,0 @@ -from xml.sax.saxutils import escape - - -class ExportMixin: - def to_vrt(self, text): - # Devide annotations into CWB's verticalized text format (.vrt) logic - p_attrs = [] # positional attributes - s_attrs = [] # structural attributes - for annotation in self.annotations: - if annotation.name == 'token': - p_attrs.append(annotation) - else: - s_attrs.append(annotation) - # Sort annotations, necessary for the next checks - p_attrs.sort() - s_attrs.sort() - # Check for p_attr<->p_attr overlap - for i, p_attr in enumerate(p_attrs[:-1]): - next_p_attr = p_attrs[i + 1] - # Check if first_p_attr starts/ends within second_p_attr - if ((p_attr.start >= next_p_attr.start) and (p_attr.start <= next_p_attr.end) # noqa - or (p_attr.end >= next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa - raise Exception('Positional attribute overlaps another') - # Check for s_attr<->p_attr overlap - for i, s_attr in enumerate(s_attrs): - for p_attr in p_attrs: - # Check if s_attr starts within p_attr - if s_attr.start > p_attr.start and s_attr.start < p_attr.end: - # Change s_attr start to p_attr's start - s_attrs[i].start = p_attr.start - # Check if s_attr ends within p_attr - if s_attr.end < p_attr.end and s_attr.end > p_attr.start: - # Change s_attr end to p_attr's end - s_attrs[i].end = p_attr.end - # Check if s_attr starts/ends before/after p_attr - if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start: - # No further Checking needed (just because p_attrs are sorted) - break - s_attr_start_buffer = {} - s_attr_end_buffer = {} - for i, s_attr in enumerate(s_attrs): - if s_attr_start_buffer[s_attr.start]: - s_attr_start_buffer[s_attr.start].append(i) - else: - s_attr_start_buffer[s_attr.start] = [i] - if s_attr_end_buffer[s_attr.end]: - s_attr_end_buffer[s_attr.end].append(i) - else: - s_attr_end_buffer[s_attr.end] = [i] - vrt = '' - vrt += '\n' - for p_attr in p_attrs: - # s_attr_starts - for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}: # noqa - s_attrs = s_attr_start_buffer.pop(k) - for s_attr in s_attrs: - foo = '' - for property in s_attr.properties: - foo += ' {}="{}"'.format(escape(property.name), - escape(property.value)) - vrt += '<{}{}>\n'.format(escape(s_attr.name), foo) - for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}: # noqa - s_attrs = s_attr_end_buffer.pop(k) - for s_attr in s_attrs: - vrt += '\n'.format(escape(s_attr.name)) - # s_attr_ends - foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None} # noqa - for property in p_attrs.properties: - if property.name == 'lemma': - foo['lemma'] = escape(property.value) - elif property.name == 'ner': - foo['ner'] = escape(property.value) - elif property.name == 'pos': - foo['pos'] = escape(property.value) - elif property.name == 'simple_pos': - foo['simple_pos'] = escape(property.value) - foo['word'] = escape(text[p_attr.start:p_attr.end]) - vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format( - **foo) - vrt += '\n' diff --git a/packages/stand-off-data-py/stand_off_data/models.py b/packages/stand-off-data-py/stand_off_data/models.py index e447fa1..7d6efc3 100644 --- a/packages/stand-off-data-py/stand_off_data/models.py +++ b/packages/stand-off-data-py/stand_off_data/models.py @@ -1,7 +1,7 @@ -from .export import ExportMixin +from xml.sax.saxutils import escape -class StandOffData(ExportMixin): +class StandOffData: def __init__(self, attrs): self.meta = attrs.get('meta', {}) self.lookup = {tag_definition.id: tag_definition for tag_definition in @@ -9,6 +9,80 @@ class StandOffData(ExportMixin): self.annotations = [TagAnnotation(x, self.lookup) for x in attrs.get('annotations', [])] + def to_vrt(self, text): + # Devide annotations into CWB's verticalized text format (.vrt) logic + p_attrs = [] # positional attributes + s_attrs = [] # structural attributes + for annotation in self.annotations: + if annotation.name == 'token': + p_attrs.append(annotation) + else: + s_attrs.append(annotation) + # Sort annotations, necessary for the next checks + p_attrs.sort() + s_attrs.sort() + # Check for p_attr<->p_attr overlap + for i, p_attr in enumerate(p_attrs[:-1]): + next_p_attr = p_attrs[i + 1] + # Check if first_p_attr starts/ends within second_p_attr + if ((p_attr.start >= next_p_attr.start) and (p_attr.start < next_p_attr.end) # noqa + or (p_attr.end > next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa + raise Exception( + 'Positional attribute overlaps another: {}<->{}'.format(p_attr.to_dict(), next_p_attr.to_dict())) + # Check for s_attr<->p_attr overlap + for i, s_attr in enumerate(s_attrs): + for p_attr in p_attrs: + # Check if s_attr starts within p_attr + if s_attr.start > p_attr.start and s_attr.start < p_attr.end: + # Change s_attr start to p_attr's start + s_attrs[i].start = p_attr.start + # Check if s_attr ends within p_attr + if s_attr.end < p_attr.end and s_attr.end > p_attr.start: + # Change s_attr end to p_attr's end + s_attrs[i].end = p_attr.end + # Check if s_attr starts/ends before/after p_attr + if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start: + # No further Checking needed (just because p_attrs are sorted) + break + s_attr_start_buffer = {} + s_attr_end_buffer = {} + for i, s_attr in enumerate(s_attrs): + if s_attr.start in s_attr_start_buffer: + s_attr_start_buffer[s_attr.start].append(i) + else: + s_attr_start_buffer[s_attr.start] = [i] + if s_attr.end in s_attr_end_buffer: + s_attr_end_buffer[s_attr.end].append(i) + else: + s_attr_end_buffer[s_attr.end] = [i] + vrt = '' + vrt += '\n' + for p_attr in p_attrs: + # s_attr_ends + for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}: # noqa + s_attr_indexes = s_attr_end_buffer.pop(k) + for s_attr_index in s_attr_indexes: + s_attr = s_attrs[s_attr_index] + vrt += '\n'.format(escape(s_attr.name)) + # s_attr_starts + for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}: # noqa + s_attr_indexes = s_attr_start_buffer.pop(k) + for s_attr_index in s_attr_indexes: + s_attr = s_attrs[s_attr_index] + foo = '' + for property in s_attr.properties: + foo += ' {}="{}"'.format(escape(property.name), + escape(property.value)) + vrt += '<{}{}>\n'.format(escape(s_attr.name), foo) + foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None} # noqa + for property in p_attr.properties: + foo[property.name] = escape(property.value) + foo['word'] = escape(text[p_attr.start:p_attr.end]) + vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format( + **foo) + vrt += '\n' + return vrt + class TagAnnotation: def __init__(self, attrs, lookup): @@ -20,13 +94,13 @@ class TagAnnotation: self.end = attrs['end'] if self.start >= self.end: raise Exception('start must be lower then end') - self.description = attrs.get('description', '') self.properties = [ PropertyAnnotation({**x, 'tag_id': self.tag_id}, self.lookup) for x in attrs.get('properties', []) ] + property_ids = [x.property_id for x in self.properties] for required_property_id in self.lookup[self.tag_id].required_properties: - if required_property_id not in self.properties: + if required_property_id not in property_ids: raise Exception( 'Missing required property: {}'.format(required_property_id)) @@ -34,6 +108,14 @@ class TagAnnotation: def name(self): return self.lookup[self.tag_id].name + def to_dict(self): + return { + 'tag_id': self.tag_id, + 'start': self.start, + 'end': self.end, + 'properties': [x.to_dict() for x in self.properties] + } + def __lt__(self, other): if self.start == other.start: return self.name == 'token' and other.name != 'token' @@ -79,6 +161,13 @@ class PropertyAnnotation: def name(self): return self.lookup[self.tag_id].properties[self.property_id].name + def to_dict(self): + return { + 'property_id': self.property_id, + 'tag_id': self.tag_id, + 'value': self.value + } + class TagDefinition: def __init__(self, attrs): diff --git a/vrt-creator b/vrt-creator index 4624c64..3443168 100755 --- a/vrt-creator +++ b/vrt-creator @@ -25,6 +25,12 @@ def main(): if text_md5.hexdigest() != stand_off_data.meta['file']['md5']: raise Exception('md5 not equal') + with open(args.text, encoding=stand_off_data.meta['file']['encoding']) as text_file: + text = text_file.read() + + with open(args.output, 'w') as vrt_file: + vrt_file.write(stand_off_data.to_vrt(text)) + if __name__ == '__main__': main() diff --git a/vrt-creator.bak b/vrt-creator.bak deleted file mode 100644 index e998903..0000000 --- a/vrt-creator.bak +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python3.7 -# coding=utf-8 - -from argparse import ArgumentParser -from xml.sax.saxutils import escape -import hashlib -import json - - -# Two global ressources - Not very elegant but it works for now -stand_off_data = None -text = None - - -def meta_to_string(): - string = '' - string += '\n'.format( # noqa - stand_off_data['meta']['generator']['name'], - stand_off_data['meta']['generator']['version'], - stand_off_data['meta']['generator']['arguments']['check_encoding'], - stand_off_data['meta']['generator']['arguments']['language'] - ) - string += '\n'.format( - stand_off_data['meta']['file']['encoding'], - stand_off_data['meta']['file']['name'], - stand_off_data['meta']['file']['md5'] - ) - return string - - -def tags_to_string(): - return '' - - -def annotations_to_string(end=float('inf')): - string = '' - while stand_off_data['annotations']: - if stand_off_data['annotations'][0]['start'] >= end: - break - annotation = stand_off_data['annotations'].pop(0) - ####################################################################### - # Check for malformed annotations # - ####################################################################### - if 'tag' not in annotation: - raise Exception('Annotation tag is missing') - - if annotation['tag'] not in stand_off_data['tags']: - raise Exception('Unknown annotation tag: ' + annotation['tag']) - - tag_model = stand_off_data['tags'][annotation['tag']] - if 'properties' in tag_model: - properties_model = tag_model['properties'] - if properties_model is not None: - required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model) # noqa - if required_properties and annotation['properties'] is None: - raise Exception('There are required properties but the "Properties" attribute is missing') # noqa - for property in required_properties: - if property not in annotation['properties']: - raise Exception('Required property is missing: ' + property) # noqa - ####################################################################### - # Process tokens ~ cwb's positional attributes # - ####################################################################### - if annotation['tag'] == 'token': - string += '{}\t{}\t{}\t{}\t{}\n'.format( - escape(text[annotation['start']:annotation['end']]), - escape(annotation['properties']['pos']), - escape(annotation['properties']['lemma']), - escape(annotation['properties']['simple_pos']), - escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None') # noqa - ) - ####################################################################### - # Process other tags ~ cwb's structural attributes # - ####################################################################### - else: - properties = '' - if 'properties' in annotation and annotation['properties'] is not None: # noqa - for property, value in annotation['properties'].items(): - if not value: - continue - if properties_model and property in properties_model: - if 'flags' in properties_model and 'multiple' in properties_model['flags']: # noqa - properties += ' {}="|{}|"'.format(property, '|'.join(value)) # noqa - else: - properties += ' {}="{}"'.format(property, value) - string += '<' + annotation['tag'] + properties + '>\n' - string += annotations_to_string(end=min(annotation['end'], end)) - string += '\n' - return string - - -def main(): - global stand_off_data - global text - - # Parse the given arguments - parser = ArgumentParser(description='Create a vrt from JSON and txt') - parser.add_argument('text', help='Path to txt file') - parser.add_argument('stand_off_data', help='Path to JSON file') - parser.add_argument('output', help='Path to vrt output file') - args = parser.parse_args() - - with open(args.stand_off_data) as stand_of_data_file: - stand_off_data = json.load(stand_of_data_file) - - with open(args.text, "rb") as text_file: - text_md5 = hashlib.md5() - for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): # noqa - text_md5.update(chunk) - if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']: - raise Exception('md5 not equal') - - with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file: # noqa - text = text_file.read() - - vrt = '' - vrt += '\n' - vrt += '\n' - vrt += '\n' - vrt += meta_to_string() - vrt += tags_to_string() - vrt += annotations_to_string() - vrt += '\n' - vrt += '' - - with open(args.output, 'w') as vrt_file: - vrt_file.write(vrt) - - -if __name__ == '__main__': - main()