diff --git a/packages/stand-off-data-py/stand_off_data/export.py b/packages/stand-off-data-py/stand_off_data/export.py new file mode 100644 index 0000000..afb49f6 --- /dev/null +++ b/packages/stand-off-data-py/stand_off_data/export.py @@ -0,0 +1,80 @@ +from xml.sax.saxutils import escape + + +class ExportMixin: + def to_vrt(self, text): + # Devide annotations into CWB's verticalized text format (.vrt) logic + p_attrs = [] # positional attributes + s_attrs = [] # structural attributes + for annotation in self.annotations: + if annotation.name == 'token': + p_attrs.append(annotation) + else: + s_attrs.append(annotation) + # Sort annotations, necessary for the next checks + p_attrs.sort() + s_attrs.sort() + # Check for p_attr<->p_attr overlap + for i, p_attr in enumerate(p_attrs[:-1]): + next_p_attr = p_attrs[i + 1] + # Check if first_p_attr starts/ends within second_p_attr + if ((p_attr.start >= next_p_attr.start) and (p_attr.start <= next_p_attr.end) # noqa + or (p_attr.end >= next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa + raise Exception('Positional attribute overlaps another') + # Check for s_attr<->p_attr overlap + for i, s_attr in enumerate(s_attrs): + for p_attr in p_attrs: + # Check if s_attr starts within p_attr + if s_attr.start > p_attr.start and s_attr.start < p_attr.end: + # Change s_attr start to p_attr's start + s_attrs[i].start = p_attr.start + # Check if s_attr ends within p_attr + if s_attr.end < p_attr.end and s_attr.end > p_attr.start: + # Change s_attr end to p_attr's end + s_attrs[i].end = p_attr.end + # Check if s_attr starts/ends before/after p_attr + if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start: + # No further Checking needed (just because p_attrs are sorted) + break + s_attr_start_buffer = {} + s_attr_end_buffer = {} + for i, s_attr in enumerate(s_attrs): + if s_attr_start_buffer[s_attr.start]: + s_attr_start_buffer[s_attr.start].append(i) + else: + s_attr_start_buffer[s_attr.start] = [i] + if s_attr_end_buffer[s_attr.end]: + s_attr_end_buffer[s_attr.end].append(i) + else: + s_attr_end_buffer[s_attr.end] = [i] + vrt = '' + vrt += '\n' + for p_attr in p_attrs: + # s_attr_starts + for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}: # noqa + s_attrs = s_attr_start_buffer.pop(k) + for s_attr in s_attrs: + foo = '' + for property in s_attr.properties: + foo += ' {}="{}"'.format(escape(property.name), + escape(property.value)) + vrt += '<{}{}>\n'.format(escape(s_attr.name), foo) + for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}: # noqa + s_attrs = s_attr_end_buffer.pop(k) + for s_attr in s_attrs: + vrt += '\n'.format(escape(s_attr.name)) + # s_attr_ends + foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None} # noqa + for property in p_attrs.properties: + if property.name == 'lemma': + foo['lemma'] = escape(property.value) + elif property.name == 'ner': + foo['ner'] = escape(property.value) + elif property.name == 'pos': + foo['pos'] = escape(property.value) + elif property.name == 'simple_pos': + foo['simple_pos'] = escape(property.value) + foo['word'] = escape(text[p_attr.start:p_attr.end]) + vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format( + **foo) + vrt += '\n' diff --git a/packages/stand-off-data-py/stand_off_data/models.py b/packages/stand-off-data-py/stand_off_data/models.py index 0e426dd..e447fa1 100644 --- a/packages/stand-off-data-py/stand_off_data/models.py +++ b/packages/stand-off-data-py/stand_off_data/models.py @@ -1,4 +1,7 @@ -class StandOffData: +from .export import ExportMixin + + +class StandOffData(ExportMixin): def __init__(self, attrs): self.meta = attrs.get('meta', {}) self.lookup = {tag_definition.id: tag_definition for tag_definition in @@ -69,7 +72,7 @@ class PropertyAnnotation: self.tag_id = attrs['tag_id'] if self.property_id not in self.lookup[self.tag_id].properties: raise Exception('Unknown property id: {}'.format(self.property_id)) - self.value = property['value'] + self.value = attrs['value'] # TODO: Process attrs['possibleValues'] as self.labels (no id?) @property @@ -91,7 +94,7 @@ class TagDefinition: @property def required_properties(self): - return {property.id: property for property in self.properties + return {property.id: property for property in self.properties.values() if property.is_required} diff --git a/packages/stand-off-data-py/stand_off_data/utils.py b/packages/stand-off-data-py/stand_off_data/utils.py deleted file mode 100644 index 5a225eb..0000000 --- a/packages/stand-off-data-py/stand_off_data/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -from xml.sax.saxutils import escape - - -def create_vrt(text, stand_off_data): - # Devide annotations into CWB's verticalized text format (.vrt) logic - p_attrs = [] # positional attributes - s_attrs = [] # structural attributes - for annotation in stand_off_data.annotations: - if annotation.name == 'token': - p_attrs.append(annotation) - else: - s_attrs.append(annotation) - # Sort annotations, necessary for the next checks - p_attrs.sort() - s_attrs.sort() - # Check for p_attr<->p_attr overlap - for i, p_attr in enumerate(p_attrs[:-1]): - next_p_attr = p_attrs[i + 1] - # Check if first_p_attr starts/ends within second_p_attr - if ((p_attr.start >= next_p_attr.start) and (p_attr.start <= next_p_attr.end) # noqa - or (p_attr.end >= next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa - raise Exception('Positional attribute overlaps another') - # Check for s_attr<->p_attr overlap - for i, s_attr in enumerate(s_attrs): - for p_attr in p_attrs: - # Check if s_attr starts within p_attr - if s_attr.start > p_attr.start and s_attr.start < p_attr.end: - # Change s_attr start to p_attr's start - s_attrs[i].start = p_attr.start - # Check if s_attr ends within p_attr - if s_attr.end < p_attr.end and s_attr.end > p_attr.start: - # Change s_attr end to p_attr's end - s_attrs[i].end = p_attr.end - # Check if s_attr starts/ends before/after p_attr - if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start: - # No further Checking needed (just because p_attrs are sorted) - break - s_attr_start_buffer = {} - s_attr_end_buffer = {} - for i, s_attr in enumerate(s_attrs): - if s_attr_start_buffer[s_attr.start]: - s_attr_start_buffer[s_attr.start].append(i) - else: - s_attr_start_buffer[s_attr.start] = [i] - if s_attr_end_buffer[s_attr.end]: - s_attr_end_buffer[s_attr.end].append(i) - else: - s_attr_end_buffer[s_attr.end] = [i] - vrt = '' - vrt += '\n' - for p_attr in p_attrs: - # s_attr_starts - for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}: # noqa - s_attrs = s_attr_start_buffer.pop(k) - for s_attr in s_attrs: - foo = '' - for property in s_attr.properties: - foo += ' {}="{}"'.format(escape(property.name), - escape(property.value)) - vrt += '<{}{}>\n'.format(escape(s_attr.name), foo) - for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}: # noqa - s_attrs = s_attr_end_buffer.pop(k) - for s_attr in s_attrs: - vrt += '\n'.format(escape(s_attr.name)) - # s_attr_ends - foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None} # noqa - for property in p_attrs.properties: - if property.name == 'lemma': - foo['lemma'] = escape(property.value) - elif property.name == 'ner': - foo['ner'] = escape(property.value) - elif property.name == 'pos': - foo['pos'] = escape(property.value) - elif property.name == 'simple_pos': - foo['simple_pos'] = escape(property.value) - foo['word'] = escape(text[p_attr.start:p_attr.end]) - vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(**foo) - vrt += '\n' diff --git a/vrt-creator b/vrt-creator index e998903..4624c64 100755 --- a/vrt-creator +++ b/vrt-creator @@ -2,96 +2,12 @@ # coding=utf-8 from argparse import ArgumentParser -from xml.sax.saxutils import escape +from stand_off_data import StandOffData import hashlib import json -# Two global ressources - Not very elegant but it works for now -stand_off_data = None -text = None - - -def meta_to_string(): - string = '' - string += '\n'.format( # noqa - stand_off_data['meta']['generator']['name'], - stand_off_data['meta']['generator']['version'], - stand_off_data['meta']['generator']['arguments']['check_encoding'], - stand_off_data['meta']['generator']['arguments']['language'] - ) - string += '\n'.format( - stand_off_data['meta']['file']['encoding'], - stand_off_data['meta']['file']['name'], - stand_off_data['meta']['file']['md5'] - ) - return string - - -def tags_to_string(): - return '' - - -def annotations_to_string(end=float('inf')): - string = '' - while stand_off_data['annotations']: - if stand_off_data['annotations'][0]['start'] >= end: - break - annotation = stand_off_data['annotations'].pop(0) - ####################################################################### - # Check for malformed annotations # - ####################################################################### - if 'tag' not in annotation: - raise Exception('Annotation tag is missing') - - if annotation['tag'] not in stand_off_data['tags']: - raise Exception('Unknown annotation tag: ' + annotation['tag']) - - tag_model = stand_off_data['tags'][annotation['tag']] - if 'properties' in tag_model: - properties_model = tag_model['properties'] - if properties_model is not None: - required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model) # noqa - if required_properties and annotation['properties'] is None: - raise Exception('There are required properties but the "Properties" attribute is missing') # noqa - for property in required_properties: - if property not in annotation['properties']: - raise Exception('Required property is missing: ' + property) # noqa - ####################################################################### - # Process tokens ~ cwb's positional attributes # - ####################################################################### - if annotation['tag'] == 'token': - string += '{}\t{}\t{}\t{}\t{}\n'.format( - escape(text[annotation['start']:annotation['end']]), - escape(annotation['properties']['pos']), - escape(annotation['properties']['lemma']), - escape(annotation['properties']['simple_pos']), - escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None') # noqa - ) - ####################################################################### - # Process other tags ~ cwb's structural attributes # - ####################################################################### - else: - properties = '' - if 'properties' in annotation and annotation['properties'] is not None: # noqa - for property, value in annotation['properties'].items(): - if not value: - continue - if properties_model and property in properties_model: - if 'flags' in properties_model and 'multiple' in properties_model['flags']: # noqa - properties += ' {}="|{}|"'.format(property, '|'.join(value)) # noqa - else: - properties += ' {}="{}"'.format(property, value) - string += '<' + annotation['tag'] + properties + '>\n' - string += annotations_to_string(end=min(annotation['end'], end)) - string += '\n' - return string - - def main(): - global stand_off_data - global text - # Parse the given arguments parser = ArgumentParser(description='Create a vrt from JSON and txt') parser.add_argument('text', help='Path to txt file') @@ -100,31 +16,15 @@ def main(): args = parser.parse_args() with open(args.stand_off_data) as stand_of_data_file: - stand_off_data = json.load(stand_of_data_file) + stand_off_data = StandOffData(json.load(stand_of_data_file)) with open(args.text, "rb") as text_file: text_md5 = hashlib.md5() for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): # noqa text_md5.update(chunk) - if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']: + if text_md5.hexdigest() != stand_off_data.meta['file']['md5']: raise Exception('md5 not equal') - with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file: # noqa - text = text_file.read() - - vrt = '' - vrt += '\n' - vrt += '\n' - vrt += '\n' - vrt += meta_to_string() - vrt += tags_to_string() - vrt += annotations_to_string() - vrt += '\n' - vrt += '' - - with open(args.output, 'w') as vrt_file: - vrt_file.write(vrt) - if __name__ == '__main__': main() diff --git a/vrt-creator.bak b/vrt-creator.bak new file mode 100644 index 0000000..e998903 --- /dev/null +++ b/vrt-creator.bak @@ -0,0 +1,130 @@ +#!/usr/bin/env python3.7 +# coding=utf-8 + +from argparse import ArgumentParser +from xml.sax.saxutils import escape +import hashlib +import json + + +# Two global ressources - Not very elegant but it works for now +stand_off_data = None +text = None + + +def meta_to_string(): + string = '' + string += '\n'.format( # noqa + stand_off_data['meta']['generator']['name'], + stand_off_data['meta']['generator']['version'], + stand_off_data['meta']['generator']['arguments']['check_encoding'], + stand_off_data['meta']['generator']['arguments']['language'] + ) + string += '\n'.format( + stand_off_data['meta']['file']['encoding'], + stand_off_data['meta']['file']['name'], + stand_off_data['meta']['file']['md5'] + ) + return string + + +def tags_to_string(): + return '' + + +def annotations_to_string(end=float('inf')): + string = '' + while stand_off_data['annotations']: + if stand_off_data['annotations'][0]['start'] >= end: + break + annotation = stand_off_data['annotations'].pop(0) + ####################################################################### + # Check for malformed annotations # + ####################################################################### + if 'tag' not in annotation: + raise Exception('Annotation tag is missing') + + if annotation['tag'] not in stand_off_data['tags']: + raise Exception('Unknown annotation tag: ' + annotation['tag']) + + tag_model = stand_off_data['tags'][annotation['tag']] + if 'properties' in tag_model: + properties_model = tag_model['properties'] + if properties_model is not None: + required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model) # noqa + if required_properties and annotation['properties'] is None: + raise Exception('There are required properties but the "Properties" attribute is missing') # noqa + for property in required_properties: + if property not in annotation['properties']: + raise Exception('Required property is missing: ' + property) # noqa + ####################################################################### + # Process tokens ~ cwb's positional attributes # + ####################################################################### + if annotation['tag'] == 'token': + string += '{}\t{}\t{}\t{}\t{}\n'.format( + escape(text[annotation['start']:annotation['end']]), + escape(annotation['properties']['pos']), + escape(annotation['properties']['lemma']), + escape(annotation['properties']['simple_pos']), + escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None') # noqa + ) + ####################################################################### + # Process other tags ~ cwb's structural attributes # + ####################################################################### + else: + properties = '' + if 'properties' in annotation and annotation['properties'] is not None: # noqa + for property, value in annotation['properties'].items(): + if not value: + continue + if properties_model and property in properties_model: + if 'flags' in properties_model and 'multiple' in properties_model['flags']: # noqa + properties += ' {}="|{}|"'.format(property, '|'.join(value)) # noqa + else: + properties += ' {}="{}"'.format(property, value) + string += '<' + annotation['tag'] + properties + '>\n' + string += annotations_to_string(end=min(annotation['end'], end)) + string += '\n' + return string + + +def main(): + global stand_off_data + global text + + # Parse the given arguments + parser = ArgumentParser(description='Create a vrt from JSON and txt') + parser.add_argument('text', help='Path to txt file') + parser.add_argument('stand_off_data', help='Path to JSON file') + parser.add_argument('output', help='Path to vrt output file') + args = parser.parse_args() + + with open(args.stand_off_data) as stand_of_data_file: + stand_off_data = json.load(stand_of_data_file) + + with open(args.text, "rb") as text_file: + text_md5 = hashlib.md5() + for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): # noqa + text_md5.update(chunk) + if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']: + raise Exception('md5 not equal') + + with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file: # noqa + text = text_file.read() + + vrt = '' + vrt += '\n' + vrt += '\n' + vrt += '\n' + vrt += meta_to_string() + vrt += tags_to_string() + vrt += annotations_to_string() + vrt += '\n' + vrt += '' + + with open(args.output, 'w') as vrt_file: + vrt_file.write(vrt) + + +if __name__ == '__main__': + main()