From 0ba0c14b7268c24c1feb787f8d5f3751e2f74da0 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Tue, 10 Aug 2021 14:43:55 +0200
Subject: [PATCH] First attempt
---
.../stand_off_data/export.py | 80 -----------
.../stand_off_data/models.py | 97 ++++++++++++-
vrt-creator | 6 +
vrt-creator.bak | 130 ------------------
4 files changed, 99 insertions(+), 214 deletions(-)
delete mode 100644 packages/stand-off-data-py/stand_off_data/export.py
delete mode 100644 vrt-creator.bak
diff --git a/packages/stand-off-data-py/stand_off_data/export.py b/packages/stand-off-data-py/stand_off_data/export.py
deleted file mode 100644
index afb49f6..0000000
--- a/packages/stand-off-data-py/stand_off_data/export.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from xml.sax.saxutils import escape
-
-
-class ExportMixin:
- def to_vrt(self, text):
- # Devide annotations into CWB's verticalized text format (.vrt) logic
- p_attrs = [] # positional attributes
- s_attrs = [] # structural attributes
- for annotation in self.annotations:
- if annotation.name == 'token':
- p_attrs.append(annotation)
- else:
- s_attrs.append(annotation)
- # Sort annotations, necessary for the next checks
- p_attrs.sort()
- s_attrs.sort()
- # Check for p_attr<->p_attr overlap
- for i, p_attr in enumerate(p_attrs[:-1]):
- next_p_attr = p_attrs[i + 1]
- # Check if first_p_attr starts/ends within second_p_attr
- if ((p_attr.start >= next_p_attr.start) and (p_attr.start <= next_p_attr.end) # noqa
- or (p_attr.end >= next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa
- raise Exception('Positional attribute overlaps another')
- # Check for s_attr<->p_attr overlap
- for i, s_attr in enumerate(s_attrs):
- for p_attr in p_attrs:
- # Check if s_attr starts within p_attr
- if s_attr.start > p_attr.start and s_attr.start < p_attr.end:
- # Change s_attr start to p_attr's start
- s_attrs[i].start = p_attr.start
- # Check if s_attr ends within p_attr
- if s_attr.end < p_attr.end and s_attr.end > p_attr.start:
- # Change s_attr end to p_attr's end
- s_attrs[i].end = p_attr.end
- # Check if s_attr starts/ends before/after p_attr
- if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start:
- # No further Checking needed (just because p_attrs are sorted)
- break
- s_attr_start_buffer = {}
- s_attr_end_buffer = {}
- for i, s_attr in enumerate(s_attrs):
- if s_attr_start_buffer[s_attr.start]:
- s_attr_start_buffer[s_attr.start].append(i)
- else:
- s_attr_start_buffer[s_attr.start] = [i]
- if s_attr_end_buffer[s_attr.end]:
- s_attr_end_buffer[s_attr.end].append(i)
- else:
- s_attr_end_buffer[s_attr.end] = [i]
- vrt = ''
- vrt += '\n'
- for p_attr in p_attrs:
- # s_attr_starts
- for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}: # noqa
- s_attrs = s_attr_start_buffer.pop(k)
- for s_attr in s_attrs:
- foo = ''
- for property in s_attr.properties:
- foo += ' {}="{}"'.format(escape(property.name),
- escape(property.value))
- vrt += '<{}{}>\n'.format(escape(s_attr.name), foo)
- for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}: # noqa
- s_attrs = s_attr_end_buffer.pop(k)
- for s_attr in s_attrs:
- vrt += '{}>\n'.format(escape(s_attr.name))
- # s_attr_ends
- foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None} # noqa
- for property in p_attrs.properties:
- if property.name == 'lemma':
- foo['lemma'] = escape(property.value)
- elif property.name == 'ner':
- foo['ner'] = escape(property.value)
- elif property.name == 'pos':
- foo['pos'] = escape(property.value)
- elif property.name == 'simple_pos':
- foo['simple_pos'] = escape(property.value)
- foo['word'] = escape(text[p_attr.start:p_attr.end])
- vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(
- **foo)
- vrt += '\n'
diff --git a/packages/stand-off-data-py/stand_off_data/models.py b/packages/stand-off-data-py/stand_off_data/models.py
index e447fa1..7d6efc3 100644
--- a/packages/stand-off-data-py/stand_off_data/models.py
+++ b/packages/stand-off-data-py/stand_off_data/models.py
@@ -1,7 +1,7 @@
-from .export import ExportMixin
+from xml.sax.saxutils import escape
-class StandOffData(ExportMixin):
+class StandOffData:
def __init__(self, attrs):
self.meta = attrs.get('meta', {})
self.lookup = {tag_definition.id: tag_definition for tag_definition in
@@ -9,6 +9,80 @@ class StandOffData(ExportMixin):
self.annotations = [TagAnnotation(x, self.lookup) for x in
attrs.get('annotations', [])]
+ def to_vrt(self, text):
+ # Divide annotations into positional and structural attributes, following CWB's verticalized text format (.vrt) logic
+ p_attrs = [] # positional attributes
+ s_attrs = [] # structural attributes
+ for annotation in self.annotations:
+ if annotation.name == 'token':
+ p_attrs.append(annotation)
+ else:
+ s_attrs.append(annotation)
+ # Sort annotations, necessary for the next checks
+ p_attrs.sort()
+ s_attrs.sort()
+ # Check for p_attr<->p_attr overlap
+ for i, p_attr in enumerate(p_attrs[:-1]):
+ next_p_attr = p_attrs[i + 1]
+ # Check if p_attr starts/ends within next_p_attr
+ if ((p_attr.start >= next_p_attr.start) and (p_attr.start < next_p_attr.end) # noqa
+ or (p_attr.end > next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa
+ raise Exception(
+ 'Positional attribute overlaps another: {}<->{}'.format(p_attr.to_dict(), next_p_attr.to_dict()))
+ # Check for s_attr<->p_attr overlap
+ for i, s_attr in enumerate(s_attrs):
+ for p_attr in p_attrs:
+ # Check if s_attr starts within p_attr
+ if s_attr.start > p_attr.start and s_attr.start < p_attr.end:
+ # Change s_attr start to p_attr's start
+ s_attrs[i].start = p_attr.start
+ # Check if s_attr ends within p_attr
+ if s_attr.end < p_attr.end and s_attr.end > p_attr.start:
+ # Change s_attr end to p_attr's end
+ s_attrs[i].end = p_attr.end
+ # Check if s_attr starts/ends before/after p_attr
+ if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start:
+ # No further checking needed because p_attrs are sorted
+ break
+ s_attr_start_buffer = {}
+ s_attr_end_buffer = {}
+ for i, s_attr in enumerate(s_attrs):
+ if s_attr.start in s_attr_start_buffer:
+ s_attr_start_buffer[s_attr.start].append(i)
+ else:
+ s_attr_start_buffer[s_attr.start] = [i]
+ if s_attr.end in s_attr_end_buffer:
+ s_attr_end_buffer[s_attr.end].append(i)
+ else:
+ s_attr_end_buffer[s_attr.end] = [i]
+ vrt = ''
+ vrt += '\n'
+ for p_attr in p_attrs:
+ # Emit end tags of s_attrs that end at or before this token's start
+ for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}: # noqa
+ s_attr_indexes = s_attr_end_buffer.pop(k)
+ for s_attr_index in s_attr_indexes:
+ s_attr = s_attrs[s_attr_index]
+ vrt += '{}>\n'.format(escape(s_attr.name))
+ # Emit start tags (with properties) of s_attrs that start at or before this token's start
+ for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}: # noqa
+ s_attr_indexes = s_attr_start_buffer.pop(k)
+ for s_attr_index in s_attr_indexes:
+ s_attr = s_attrs[s_attr_index]
+ foo = ''
+ for property in s_attr.properties:
+ foo += ' {}="{}"'.format(escape(property.name),
+ escape(property.value))
+ vrt += '<{}{}>\n'.format(escape(s_attr.name), foo)
+ foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None} # noqa
+ for property in p_attr.properties:
+ foo[property.name] = escape(property.value)
+ foo['word'] = escape(text[p_attr.start:p_attr.end])
+ vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(
+ **foo)
+ vrt += '\n'
+ return vrt
+
class TagAnnotation:
def __init__(self, attrs, lookup):
@@ -20,13 +94,13 @@ class TagAnnotation:
self.end = attrs['end']
if self.start >= self.end:
raise Exception('start must be lower then end')
- self.description = attrs.get('description', '')
self.properties = [
PropertyAnnotation({**x, 'tag_id': self.tag_id}, self.lookup)
for x in attrs.get('properties', [])
]
+ property_ids = [x.property_id for x in self.properties]
for required_property_id in self.lookup[self.tag_id].required_properties:
- if required_property_id not in self.properties:
+ if required_property_id not in property_ids:
raise Exception(
'Missing required property: {}'.format(required_property_id))
@@ -34,6 +108,14 @@ class TagAnnotation:
def name(self):
return self.lookup[self.tag_id].name
+ def to_dict(self):
+ return {
+ 'tag_id': self.tag_id,
+ 'start': self.start,
+ 'end': self.end,
+ 'properties': [x.to_dict() for x in self.properties]
+ }
+
def __lt__(self, other):
if self.start == other.start:
return self.name == 'token' and other.name != 'token'
@@ -79,6 +161,13 @@ class PropertyAnnotation:
def name(self):
return self.lookup[self.tag_id].properties[self.property_id].name
+ def to_dict(self):
+ return {
+ 'property_id': self.property_id,
+ 'tag_id': self.tag_id,
+ 'value': self.value
+ }
+
class TagDefinition:
def __init__(self, attrs):
diff --git a/vrt-creator b/vrt-creator
index 4624c64..3443168 100755
--- a/vrt-creator
+++ b/vrt-creator
@@ -25,6 +25,12 @@ def main():
if text_md5.hexdigest() != stand_off_data.meta['file']['md5']:
raise Exception('md5 not equal')
+ with open(args.text, encoding=stand_off_data.meta['file']['encoding']) as text_file:
+ text = text_file.read()
+
+ with open(args.output, 'w') as vrt_file:
+ vrt_file.write(stand_off_data.to_vrt(text))
+
if __name__ == '__main__':
main()
diff --git a/vrt-creator.bak b/vrt-creator.bak
deleted file mode 100644
index e998903..0000000
--- a/vrt-creator.bak
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python3.7
-# coding=utf-8
-
-from argparse import ArgumentParser
-from xml.sax.saxutils import escape
-import hashlib
-import json
-
-
-# Two global ressources - Not very elegant but it works for now
-stand_off_data = None
-text = None
-
-
-def meta_to_string():
- string = ''
- string += '\n'.format( # noqa
- stand_off_data['meta']['generator']['name'],
- stand_off_data['meta']['generator']['version'],
- stand_off_data['meta']['generator']['arguments']['check_encoding'],
- stand_off_data['meta']['generator']['arguments']['language']
- )
- string += '\n'.format(
- stand_off_data['meta']['file']['encoding'],
- stand_off_data['meta']['file']['name'],
- stand_off_data['meta']['file']['md5']
- )
- return string
-
-
-def tags_to_string():
- return ''
-
-
-def annotations_to_string(end=float('inf')):
- string = ''
- while stand_off_data['annotations']:
- if stand_off_data['annotations'][0]['start'] >= end:
- break
- annotation = stand_off_data['annotations'].pop(0)
- #######################################################################
- # Check for malformed annotations #
- #######################################################################
- if 'tag' not in annotation:
- raise Exception('Annotation tag is missing')
-
- if annotation['tag'] not in stand_off_data['tags']:
- raise Exception('Unknown annotation tag: ' + annotation['tag'])
-
- tag_model = stand_off_data['tags'][annotation['tag']]
- if 'properties' in tag_model:
- properties_model = tag_model['properties']
- if properties_model is not None:
- required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model) # noqa
- if required_properties and annotation['properties'] is None:
- raise Exception('There are required properties but the "Properties" attribute is missing') # noqa
- for property in required_properties:
- if property not in annotation['properties']:
- raise Exception('Required property is missing: ' + property) # noqa
- #######################################################################
- # Process tokens ~ cwb's positional attributes #
- #######################################################################
- if annotation['tag'] == 'token':
- string += '{}\t{}\t{}\t{}\t{}\n'.format(
- escape(text[annotation['start']:annotation['end']]),
- escape(annotation['properties']['pos']),
- escape(annotation['properties']['lemma']),
- escape(annotation['properties']['simple_pos']),
- escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None') # noqa
- )
- #######################################################################
- # Process other tags ~ cwb's structural attributes #
- #######################################################################
- else:
- properties = ''
- if 'properties' in annotation and annotation['properties'] is not None: # noqa
- for property, value in annotation['properties'].items():
- if not value:
- continue
- if properties_model and property in properties_model:
- if 'flags' in properties_model and 'multiple' in properties_model['flags']: # noqa
- properties += ' {}="|{}|"'.format(property, '|'.join(value)) # noqa
- else:
- properties += ' {}="{}"'.format(property, value)
- string += '<' + annotation['tag'] + properties + '>\n'
- string += annotations_to_string(end=min(annotation['end'], end))
- string += '' + annotation['tag'] + '>\n'
- return string
-
-
-def main():
- global stand_off_data
- global text
-
- # Parse the given arguments
- parser = ArgumentParser(description='Create a vrt from JSON and txt')
- parser.add_argument('text', help='Path to txt file')
- parser.add_argument('stand_off_data', help='Path to JSON file')
- parser.add_argument('output', help='Path to vrt output file')
- args = parser.parse_args()
-
- with open(args.stand_off_data) as stand_of_data_file:
- stand_off_data = json.load(stand_of_data_file)
-
- with open(args.text, "rb") as text_file:
- text_md5 = hashlib.md5()
- for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): # noqa
- text_md5.update(chunk)
- if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
- raise Exception('md5 not equal')
-
- with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file: # noqa
- text = text_file.read()
-
- vrt = ''
- vrt += '\n'
- vrt += '\n'
- vrt += '\n'
- vrt += meta_to_string()
- vrt += tags_to_string()
- vrt += annotations_to_string()
- vrt += '\n'
- vrt += ''
-
- with open(args.output, 'w') as vrt_file:
- vrt_file.write(vrt)
-
-
-if __name__ == '__main__':
- main()