nlp/packages/stand-off-data-py/stand_off_data.py

from xml.sax.saxutils import escape


class StandOffData:
    """Stand-off annotations for a text plus the tag definitions they use.

    Attributes:
        meta: Value of ``attrs['meta']`` (empty dict when absent).
        lookup: Maps a tag id to its ``TagDefinition``.
        annotations: ``TagAnnotation`` objects built from
            ``attrs['annotations']``.
    """

    def __init__(self, attrs=None):
        """Build tag definitions and annotations from a plain mapping.

        Args:
            attrs: Optional mapping with the optional keys ``'meta'``,
                ``'tags'`` (tag definition dicts) and ``'annotations'``
                (annotation dicts). ``None`` behaves like an empty dict.

        Raises:
            Exception: If two tag definitions share an id, or if an
                annotation fails its own sanity checks.
        """
        # None instead of a shared mutable ``{}`` default.
        if attrs is None:
            attrs = {}
        self.meta = attrs.get('meta', {})
        self.lookup = {}
        for x in attrs.get('tags', []):
            self.add_tag_definition(x)
        self.annotations = [
            TagAnnotation(x, self.lookup)
            for x in attrs.get('annotations', [])
        ]

    def add_tag_definition(self, attrs):
        """Register a tag definition built from ``attrs``.

        Raises:
            Exception: If the definition's id is already registered.
        """
        tag_definition = TagDefinition(attrs)
        if tag_definition.id in self.lookup:
            # Report the offending definition. Serializing ``self`` here
            # would fail while __init__ is still running, because
            # ``self.annotations`` is only assigned after all tags are added.
            raise Exception(
                f'Tag id already in use: {tag_definition.to_dict()}')
        self.lookup[tag_definition.id] = tag_definition

    def to_dict(self):
        """Return the metadata, tag lookup and annotations as plain dicts."""
        return {
            'meta': self.meta,
            'lookup': {k: v.to_dict() for k, v in self.lookup.items()},
            'annotations': [x.to_dict() for x in self.annotations]
        }

    def to_vrt(self, text):
        """Render ``text`` and its annotations as CWB verticalized text.

        Annotations named ``'token'`` are positional attributes: each one
        becomes a line ``word<TAB>pos<TAB>lemma<TAB>simple_pos`` where
        ``word`` is ``text[start:end]`` and missing values are ``'None'``.
        Tokens covering only whitespace are skipped. Every other
        annotation is a structural attribute and becomes a start tag
        (with its properties as attributes) and an end tag. The output is
        wrapped in ``<text>`` ... ``</text>``.

        Structural annotations that start or end inside a token are
        widened to that token's boundaries. This changes ``start``/``end``
        of the affected annotation objects in place.

        Args:
            text: The string the annotation offsets refer to.

        Returns:
            The verticalized text as a single string.

        Raises:
            Exception: If two token annotations overlap.
        """
        # Divide annotations following CWB's verticalized text (.vrt) logic
        p_attrs = []    # positional attributes
        s_attrs = []    # structural attributes
        for annotation in self.annotations:
            if annotation.name == 'token':
                p_attrs.append(annotation)
            else:
                s_attrs.append(annotation)
        # Sort annotations, necessary for the next checks
        p_attrs.sort()
        s_attrs.sort()
        # Check for p_attr<->p_attr overlap. Comparing neighbours is enough
        # because the tokens are sorted; the interval test also catches a
        # token that is completely contained in its predecessor.
        for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
            if (p_attr.start < next_p_attr.end
                    and next_p_attr.start < p_attr.end):
                raise Exception(
                    'Positional attribute overlaps another: '
                    f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}'
                )
        # Snap s_attr boundaries that fall inside a token to that token.
        first_candidate = 0
        for s_attr in s_attrs:
            # Tokens ending at or before this start can neither touch this
            # s_attr nor a later one (s_attrs are visited by ascending
            # start), so they are skipped for good.
            while (first_candidate < len(p_attrs)
                   and p_attrs[first_candidate].end <= s_attr.start):
                first_candidate += 1
            for index in range(first_candidate, len(p_attrs)):
                p_attr = p_attrs[index]
                if p_attr.end <= s_attr.start:
                    # Token lies before the s_attr: a later token may
                    # still overlap, so keep looking.
                    continue
                if p_attr.start >= s_attr.end:
                    # Token lies after the s_attr: so do all later ones.
                    break
                if p_attr.start < s_attr.start < p_attr.end:
                    s_attr.start = p_attr.start
                if p_attr.start < s_attr.end < p_attr.end:
                    s_attr.end = p_attr.end
        # Snapping may have changed the relative order of the s_attrs.
        s_attrs.sort()
        p_attrs_by_start = {p_attr.start: p_attr for p_attr in p_attrs}
        s_attr_starts = {}
        s_attr_ends = {}
        for s_attr in s_attrs:
            s_attr_starts.setdefault(s_attr.start, []).append(s_attr)
            # Prepend so tags close in reverse order of opening.
            s_attr_ends.setdefault(s_attr.end, []).insert(0, s_attr)
        lines = ['<text>\n']
        current_position = 0
        text_len = len(text)
        # "<=" so that tags ending exactly at the end of the text are closed
        while current_position <= text_len:
            # Entries are popped so a zero-length token, which leaves the
            # position unchanged, cannot be processed twice.
            for s_attr in s_attr_ends.pop(current_position, ()):
                lines.append(f'</{escape(s_attr.name)}>\n')
            for s_attr in s_attr_starts.pop(current_position, ()):
                tag = f'<{escape(s_attr.name)}'
                for prop in s_attr.properties:
                    # Values sit inside double quotes, so quotes must be
                    # escaped as well.
                    value = escape(str(prop.value), {'"': '&quot;'})
                    tag += f' {escape(prop.name)}="{value}"'
                lines.append(tag + '>\n')
            p_attr = p_attrs_by_start.pop(current_position, None)
            if p_attr is None:
                current_position += 1
                continue
            word = text[p_attr.start:p_attr.end]
            if not word.isspace():
                fields = {
                    'lemma': 'None',
                    'pos': 'None',
                    'simple_pos': 'None',
                    'word': 'None'
                }
                for prop in p_attr.properties:
                    if prop.name in fields:
                        fields[prop.name] = escape(str(prop.value))
                # The surface form always comes from the text itself.
                fields['word'] = escape(word)
                lines.append(
                    '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**fields)
                )
            current_position = p_attr.end
        lines.append('</text>\n')
        return ''.join(lines)


class TagAnnotation:
    """One annotated span of a text, referring to a tag definition by id.

    Instances are ordered by ascending ``start``. For equal starts,
    annotations not named ``'token'`` come before ``'token'`` annotations,
    and within the same kind the one with the larger ``end`` comes first.

    Attributes:
        lookup: Mapping from tag id to tag definition (shared, not copied).
        tag_id: Id of the tag definition this annotation refers to.
        start: Value of ``attrs['start']``.
        end: Value of ``attrs['end']``.
        properties: ``PropertyAnnotation`` objects of this annotation.
    """

    def __init__(self, attrs, lookup):
        """Create the annotation and validate it against ``lookup``.

        Args:
            attrs: Mapping with the keys ``'tag_id'``, ``'start'``,
                ``'end'`` and optionally ``'properties'``.
            lookup: Mapping from tag id to tag definition.

        Raises:
            Exception: If the tag id is unknown, a property is unknown, or
                ``end`` is less than ``start``.
        """
        self.lookup = lookup
        self.tag_id = attrs['tag_id']
        self.start = attrs['start']
        self.end = attrs['end']
        # Assigned early so that to_dict() works inside the error below.
        self.properties = []
        # Sanity checks
        # The tag must be verified before its definition is dereferenced;
        # otherwise an unknown tag surfaces as a bare KeyError.
        if self.tag_id not in self.lookup:
            raise Exception(f'Unknown tag: {self.to_dict()}')
        property_lookup = self.lookup[self.tag_id].properties
        self.properties = [
            PropertyAnnotation(x, property_lookup)
            for x in attrs.get('properties', [])
        ]
        if self.end < self.start:
            raise Exception(
                f'Annotation end less then start: {self.to_dict()}')
        # property_ids = [x.property_id for x in self.properties]
        # for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items():  # noqa
        #     if required_property_id not in property_ids:
        #         raise Exception(
        #             f'Missing required property: {required_property.to_dict()}'
        #         )

    @property
    def name(self):
        """Name of the referenced tag definition."""
        return self.lookup[self.tag_id].name

    def to_dict(self):
        """Return the annotation as a plain dict."""
        return {
            'tag_id': self.tag_id,
            'start': self.start,
            'end': self.end,
            'properties': [x.to_dict() for x in self.properties]
        }

    def _sort_key(self):
        """Return the tuple all comparisons are based on."""
        # False sorts before True, so non-token annotations come first;
        # negating ``end`` puts the longer annotation first.
        return (self.start, self.name == 'token', -self.end)

    def __lt__(self, other):
        return self._sort_key() < other._sort_key()

    def __le__(self, other):
        return self._sort_key() <= other._sort_key()

    def __eq__(self, other):
        return self._sort_key() == other._sort_key()

    def __ne__(self, other):
        return not self == other

    def __gt__(self, other):
        return not self <= other

    def __ge__(self, other):
        return not self < other


class PropertyAnnotation:
    """The value an annotation assigns to one property of its tag.

    Attributes:
        lookup: Mapping from property id to property definition.
        property_id: Id of the property definition this value belongs to.
        value: Value of ``attrs['value']``.
    """

    def __init__(self, attrs, lookup):
        """Create the property value and validate its id against ``lookup``.

        Args:
            attrs: Mapping with the keys ``'property_id'`` and ``'value'``.
            lookup: Mapping from property id to property definition.

        Raises:
            Exception: If ``property_id`` is not a key of ``lookup``.
        """
        self.lookup = lookup
        self.property_id = attrs['property_id']
        self.value = attrs['value']
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
        # Sanity checks
        if self.property_id not in self.lookup:
            raise Exception(f'Unknown property: {self.to_dict()}')

    @property
    def name(self):
        """Name of the referenced property definition."""
        return self.lookup[self.property_id].name

    def to_dict(self):
        """Return the property value as a plain dict.

        ``'tag_id'`` is ``None`` unless a ``tag_id`` attribute has been set
        on the instance; this class never assigns one itself.
        """
        return {
            'property_id': self.property_id,
            # Reading self.tag_id directly raised AttributeError on every
            # call because the attribute is never assigned in this class.
            'tag_id': getattr(self, 'tag_id', None),
            'value': self.value
        }


class TagDefinition:
    """Definition of a tag: id, name, description and its properties.

    ``properties`` maps a property id to its ``PropertyDefinition``.
    """

    def __init__(self, attrs):
        """Read ``'id'`` and ``'name'`` (required) plus the optional
        ``'description'`` and ``'properties'`` entries of ``attrs``."""
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        self.properties = {}
        property_specs = attrs.get('properties', [])
        for spec in property_specs:
            self.add_property_definition(spec)

    def add_property_definition(self, attrs):
        """Register a property definition built from ``attrs``.

        Raises:
            Exception: If a property with the same id is already registered.
        """
        definition = PropertyDefinition(attrs)
        already_known = definition.id in self.properties
        if already_known:
            raise Exception(
                f'Property id already in use: {definition.to_dict()}')
        self.properties[definition.id] = definition

    # @property
    # def required_properties(self):
    #     return {property.id: property for property in self.properties.values()
    #             if property.is_required}

    def to_dict(self):
        """Return the definition, including its properties, as plain dicts."""
        serialized_properties = {}
        for property_id, definition in self.properties.items():
            serialized_properties[property_id] = definition.to_dict()
        return dict(
            id=self.id,
            name=self.name,
            description=self.description,
            properties=serialized_properties,
        )


class PropertyDefinition:
    """Definition of a single property that a tag can carry.

    Holds the property's ``id`` and ``name`` together with an optional
    ``description`` and the optional lists ``flags`` and ``labels``.
    """

    def __init__(self, attrs):
        """Copy the definition out of ``attrs``.

        ``'id'`` and ``'name'`` are required; ``'description'`` defaults to
        an empty string, ``'flags'`` and ``'labels'`` to empty lists.
        """
        self.id, self.name = attrs['id'], attrs['name']
        self.description = attrs.get('description', '')
        self.flags, self.labels = (
            attrs.get('flags', []),
            attrs.get('labels', []),
        )

    # @property
    # def is_required(self):
    #     return 'required' in self.flags

    @property
    def has_multiple_values(self):
        """Whether ``'multiple'`` is among the flags."""
        marker = 'multiple'
        return marker in self.flags

    def to_dict(self):
        """Return all fields of the definition as a plain dict."""
        field_names = ('id', 'name', 'description', 'flags', 'labels')
        return {field: getattr(self, field) for field in field_names}