Preliminary work

2026-02-12 18:12:04 +00:00 · 2021-07-13 16:31:53 +02:00
parent 5139fd9727
commit 4dea95a108
6 changed files with 374 additions and 61 deletions
--- a/packages/stand-off-data-py/setup.py
+++ b/packages/stand-off-data-py/setup.py
--- a/packages/stand-off-data-py/stand_off_data/init.py
+++ b/packages/stand-off-data-py/stand_off_data/init.py
--- a/packages/stand-off-data-py/stand_off_data/models.py
+++ b/packages/stand-off-data-py/stand_off_data/models.py
@@ -0,0 +1,126 @@
+'''
+    'generator': {
+        'name': 'nopaque NLP service',
+        'version': '1.0.0',
+        'arguments': {
+            'check_encoding': args.check_encoding,
+            'language': args.language
+        }
+    },
+    'file': {
+        'encoding': encoding,
+        'md5': text_md5.hexdigest(),
+        'name': os.path.basename(args.input)
+    }
+'''
+
+
+class StandOffData:
+    '''Bundle tag definitions with the annotations that refer to them.
+
+    ``attrs`` is a mapping. Its optional ``'tags'`` entries are wrapped in
+    ``TagDefinition`` objects and stored in ``self.tags`` keyed by their
+    id; its optional ``'annotations'`` entries are wrapped in
+    ``TagAnnotation`` objects and stored, in input order, in
+    ``self.annotations``. Both keys default to an empty list.
+    '''
+
+    def __init__(self, attrs):
+        # Index every tag definition by id; a later duplicate id replaces
+        # an earlier one.
+        tags_by_id = {}
+        for raw_tag in attrs.get('tags', []):
+            definition = TagDefinition(raw_tag)
+            tags_by_id[definition.id] = definition
+        self.tags = tags_by_id
+        # Every annotation receives the lookup table so that it can
+        # resolve (and validate) its tag id.
+        annotations = []
+        for raw_annotation in attrs.get('annotations', []):
+            annotations.append(TagAnnotation(raw_annotation, self.tags))
+        self.annotations = annotations
+
+
+class TagAnnotation:
+    '''A text span annotated with one of the tags from ``tag_lookup``.
+
+    ``attrs`` must provide ``'tag_id'``, ``'start'`` and ``'end'``;
+    ``'description'`` (default ``''``) and ``'properties'`` (default
+    ``[]``) are optional. ``tag_lookup`` maps tag ids to tag definitions
+    that expose ``name``, ``properties`` and ``required_properties``.
+
+    Raises ``Exception`` when the tag id is unknown, when ``start`` is
+    not lower than ``end``, or when a property required by the tag
+    definition is not among the given properties.
+
+    Ordering: annotations sort by ``start``; among annotations with the
+    same ``start``, those named ``'token'`` sort before all others.
+    Equality: two annotations are equal when ``start`` and ``name`` match.
+    '''
+
+    # Defining __eq__ already makes instances unhashable; state it
+    # explicitly because start/end are plain, mutable attributes.
+    __hash__ = None
+
+    def __init__(self, attrs, tag_lookup):
+        self.tag_id = attrs['tag_id']
+        self.tag_lookup = tag_lookup
+        if self.tag_id not in self.tag_lookup:
+            raise Exception('Unknown tag id: {}'.format(self.tag_id))
+        self.start = attrs['start']
+        self.end = attrs['end']
+        if self.start >= self.end:
+            raise Exception('start must be lower then end')
+        self.description = attrs.get('description', '')
+        tag_definition = self.tag_lookup[self.tag_id]
+        self.properties = [
+            PropertyAnnotation(x, tag_definition.properties)
+            for x in attrs.get('properties', [])
+        ]
+        # Compare ids with ids: self.properties holds PropertyAnnotation
+        # objects, so testing an id against that list can never succeed.
+        present_property_ids = {x.property_id for x in self.properties}
+        for required_property_id in tag_definition.required_properties:
+            if required_property_id not in present_property_ids:
+                raise Exception(
+                    'Missing required property: {}'.format(
+                        required_property_id
+                    )
+                )
+
+    @property
+    def name(self):
+        '''Name of the tag definition this annotation refers to.'''
+        return self.tag_lookup[self.tag_id].name
+
+    def _sort_key(self):
+        # False sorts before True, so 'token' annotations come first among
+        # annotations that share the same start.
+        return (self.start, self.name != 'token')
+
+    def __lt__(self, other):
+        return self._sort_key() < other._sort_key()
+
+    def __le__(self, other):
+        return self._sort_key() <= other._sort_key()
+
+    def __eq__(self, other):
+        return self.start == other.start and self.name == other.name
+
+    def __ne__(self, other):
+        # Exact negation of __eq__ (the conjunction used before reported
+        # "not unequal" for annotations differing in only one field).
+        return not self == other
+
+    def __gt__(self, other):
+        return self._sort_key() > other._sort_key()
+
+    def __ge__(self, other):
+        return self._sort_key() >= other._sort_key()
+
+
+class PropertyAnnotation:
+    '''A property value attached to a tag annotation.
+
+    ``attrs`` must provide ``'property_id'`` and ``'value'``.
+    ``property_lookup`` maps property ids to property definitions that
+    expose ``name``.
+
+    Raises ``Exception`` when the property id is not in
+    ``property_lookup``.
+    '''
+
+    def __init__(self, attrs, property_lookup):
+        # Read from the ``attrs`` argument; the builtin ``property`` type
+        # is not subscriptable.
+        self.property_id = attrs['property_id']
+        self.property_lookup = property_lookup
+        if self.property_id not in self.property_lookup:
+            raise Exception('Unknown property id: {}'.format(self.property_id))
+        self.value = attrs['value']
+        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
+
+    @property
+    def name(self):
+        '''Name of the property definition this annotation refers to.'''
+        return self.property_lookup[self.property_id].name
+
+
+class TagDefinition:
+    '''Definition of a tag and of the properties its annotations may carry.
+
+    ``attrs`` must provide ``'id'`` and ``'name'``; ``'description'``
+    (default ``''``) and ``'properties'`` (default ``[]``) are optional.
+    Every entry of ``'properties'`` is wrapped in a ``PropertyDefinition``
+    and stored in ``self.properties`` keyed by its id.
+    '''
+
+    def __init__(self, attrs):
+        self.id = attrs['id']
+        self.name = attrs['name']
+        self.description = attrs.get('description', '')
+        self.properties = {
+            property_definition.id: property_definition
+            for property_definition in [
+                PropertyDefinition(x) for x in attrs.get('properties', [])
+            ]
+        }
+
+    @property
+    def required_properties(self):
+        '''Return the subset of ``self.properties`` flagged as required.
+
+        The result is a dict keyed by property id, like
+        ``self.properties``.
+        '''
+        # Iterate the definitions (the dict's values): iterating the dict
+        # itself yields the ids, which have no ``is_required`` attribute.
+        return {
+            property_definition.id: property_definition
+            for property_definition in self.properties.values()
+            if property_definition.is_required
+        }
+
+
+class PropertyDefinition:
+    '''Definition of one property that annotations of a tag may carry.
+
+    ``attrs`` must provide ``'id'`` and ``'name'``. ``'description'``
+    defaults to ``''``; ``'flags'`` and ``'labels'`` default to ``[]``.
+    '''
+
+    def __init__(self, attrs):
+        self.id = attrs['id']
+        self.name = attrs['name']
+        optional = attrs.get
+        self.description = optional('description', '')
+        self.flags = optional('flags', [])
+        self.labels = optional('labels', [])
+
+    def _has_flag(self, flag):
+        # Shared membership test behind the flag properties below.
+        return flag in self.flags
+
+    @property
+    def is_required(self):
+        '''True when ``'required'`` is among the flags.'''
+        return self._has_flag('required')
+
+    @property
+    def has_multiple_values(self):
+        '''True when ``'multiple'`` is among the flags.'''
+        return self._has_flag('multiple')
--- a/packages/stand-off-data-py/stand_off_data/utils.py
+++ b/packages/stand-off-data-py/stand_off_data/utils.py
@@ -0,0 +1,47 @@
+def create_vrt(text, stand_off_data):
+    '''Prepare stand-off annotations for CWB's verticalized text (.vrt).
+
+    Work in progress: the annotations are validated and aligned, but the
+    VRT text itself is not generated yet, so nothing is returned and
+    ``text`` is not used so far.
+
+    Annotations named ``'token'`` are treated as positional attributes,
+    all others as structural attributes. Structural attributes whose
+    start or end falls strictly inside a token are widened, in place, to
+    that token's boundary.
+
+    Spans are compared as half-open ranges (``end`` is exclusive), as the
+    strict comparisons below imply; NOTE(review): confirm against the
+    producer of the annotations.
+
+    Raises ``Exception`` when two positional attributes overlap.
+    '''
+    # Divide annotations following CWB's verticalized text format (.vrt)
+    # logic
+    p_attrs = []    # positional attributes
+    s_attrs = []    # structural attributes
+    for annotation in stand_off_data.annotations:
+        if annotation.name == 'token':
+            p_attrs.append(annotation)
+        else:
+            s_attrs.append(annotation)
+    # Sort annotations, necessary for the next checks
+    p_attrs.sort()
+    s_attrs.sort()
+    # Check for p_attr<->p_attr overlap. The tokens are sorted by start,
+    # so two neighbours overlap exactly when the first one ends after the
+    # second one starts; tokens that merely touch do not overlap.
+    for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
+        if p_attr.end > next_p_attr.start:
+            raise Exception('Positional attribute overlaps another')
+    # Check for s_attr<->p_attr overlap
+    for s_attr in s_attrs:
+        for p_attr in p_attrs:
+            # Tokens that end before the s_attr starts cannot affect it,
+            # but later tokens still can: skip instead of stopping.
+            if p_attr.end <= s_attr.start:
+                continue
+            # p_attrs are sorted, so no later token can touch this s_attr
+            if p_attr.start >= s_attr.end:
+                break
+            # s_attr starts within p_attr: move its start to p_attr's start
+            if p_attr.start < s_attr.start < p_attr.end:
+                s_attr.start = p_attr.start
+            # s_attr ends within p_attr: move its end to p_attr's end
+            if p_attr.start < s_attr.end < p_attr.end:
+                s_attr.end = p_attr.end
+    # Map every start/end position to the indices (into s_attrs) of the
+    # structural attributes opening/closing there.
+    s_attr_start_buffer = {}
+    s_attr_end_buffer = {}
+    for i, s_attr in enumerate(s_attrs):
+        s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
+        s_attr_end_buffer.setdefault(s_attr.end, []).append(i)
+    vrt = ''
+    # TODO do the work!