from xml.sax.saxutils import escape


# Extra entity used when a value is placed inside a double-quoted attribute;
# ``escape`` alone only handles ``&``, ``<`` and ``>``.
_ATTRIBUTE_ENTITIES = {'"': '&quot;'}


class StandOffData:
    """Stand-off annotation data: tag definitions plus annotations on a text.

    ``attrs`` is a dict with the optional keys ``meta`` (kept as is),
    ``tags`` (a list of dicts accepted by ``TagDefinition``) and
    ``annotations`` (a list of dicts accepted by ``TagAnnotation``).
    """

    def __init__(self, attrs):
        self.meta = attrs.get('meta', {})
        # Tag definitions indexed by their id, used to resolve annotations.
        self.lookup = {}
        for raw_tag in attrs.get('tags', []):
            tag_definition = TagDefinition(raw_tag)
            self.lookup[tag_definition.id] = tag_definition
        self.annotations = [
            TagAnnotation(raw_annotation, self.lookup)
            for raw_annotation in attrs.get('annotations', [])
        ]

    def to_vrt(self, text):
        """Render the annotations of ``text`` in verticalized text format.

        Annotations named ``token`` become positional attributes (one
        tab-separated line per token: word, pos, lemma, simple_pos, ner);
        every other annotation becomes a structural attribute, written as
        an opening tag before its first token and a closing tag after its
        last one.

        Structural attributes that start or end inside a token are widened
        to that token's boundary. This modifies the annotation objects held
        in ``self.annotations``.

        Raises:
            Exception: if two token annotations overlap.
        """
        # Divide annotations into positional and structural attributes.
        p_attrs = []
        s_attrs = []
        for annotation in self.annotations:
            if annotation.name == 'token':
                p_attrs.append(annotation)
            else:
                s_attrs.append(annotation)
        # Sort by start offset; the checks below rely on this order.
        p_attrs.sort()
        s_attrs.sort()

        # Check for p_attr<->p_attr overlap. Because the tokens are sorted
        # by start, a token overlaps its successor exactly when it ends
        # after the successor starts (this also covers containment).
        for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
            if p_attr.end > next_p_attr.start:
                raise Exception(
                    'Positional attribute overlaps another: {}<->{}'.format(p_attr.to_dict(), next_p_attr.to_dict()))  # noqa

        # Align s_attr boundaries that fall inside a token.
        for s_attr in s_attrs:
            for p_attr in p_attrs:
                if p_attr.end <= s_attr.start:
                    # Token lies entirely before the s_attr: keep looking.
                    continue
                if p_attr.start >= s_attr.end:
                    # Token lies entirely after the s_attr; since tokens
                    # are sorted, no later token can touch it either.
                    break
                if p_attr.start < s_attr.start < p_attr.end:
                    s_attr.start = p_attr.start
                if p_attr.start < s_attr.end < p_attr.end:
                    s_attr.end = p_attr.end

        # Index the s_attrs by the offsets at which they open and close.
        s_attr_start_buffer = {}
        s_attr_end_buffer = {}
        for i, s_attr in enumerate(s_attrs):
            s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
            s_attr_end_buffer.setdefault(s_attr.end, []).append(i)

        # NOTE(review): the output is framed by a bare newline at the start
        # and at the end; confirm whether wrapper tags were intended here.
        parts = ['\n']
        open_order = []  # indexes of opened s_attrs, in opening order
        for p_attr in p_attrs:
            # Close every s_attr that ended at or before this token. Within
            # one offset the most recently started one is closed first.
            # NOTE(review): an s_attr covering no token at all is closed
            # here before it is opened below, as in the previous version.
            for k in [k for k in s_attr_end_buffer if k <= p_attr.start]:
                for s_attr_index in reversed(s_attr_end_buffer.pop(k)):
                    parts.append(
                        '</{}>\n'.format(escape(s_attrs[s_attr_index].name)))
            # Open every s_attr that starts at or before this token.
            for k in [k for k in s_attr_start_buffer if k <= p_attr.start]:
                for s_attr_index in s_attr_start_buffer.pop(k):
                    s_attr = s_attrs[s_attr_index]
                    open_order.append(s_attr_index)
                    tag_attributes = ''.join(
                        ' {}="{}"'.format(
                            escape(prop.name),
                            escape(prop.value, _ATTRIBUTE_ENTITIES))
                        for prop in s_attr.properties
                    )
                    parts.append(
                        '<{}{}>\n'.format(escape(s_attr.name), tag_attributes))
            # Columns without a value are rendered as the string 'None'.
            columns = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None}  # noqa
            for prop in p_attr.properties:
                columns[prop.name] = escape(prop.value)
            columns['word'] = escape(text[p_attr.start:p_attr.end])
            parts.append(
                '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(
                    **columns))

        # Close the s_attrs that are still open after the last token,
        # innermost (most recently opened) first.
        pending = {
            s_attr_index
            for s_attr_indexes in s_attr_end_buffer.values()
            for s_attr_index in s_attr_indexes
        }
        for s_attr_index in reversed(open_order):
            if s_attr_index in pending:
                parts.append(
                    '</{}>\n'.format(escape(s_attrs[s_attr_index].name)))
        parts.append('\n')
        return ''.join(parts)


class TagAnnotation:
    """One annotated span ``[start, end)`` carrying a tag and properties.

    ``attrs`` must provide ``tag_id``, ``start`` and ``end`` and may provide
    ``properties``; ``lookup`` maps tag ids to ``TagDefinition`` objects.

    Raises:
        Exception: if the tag id is unknown, if ``start >= end``, or if a
            property required by the tag definition is missing.
    """

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.tag_id = attrs['tag_id']
        if self.tag_id not in self.lookup:
            raise Exception('Unknown tag id: {}'.format(self.tag_id))
        self.start = attrs['start']
        self.end = attrs['end']
        if self.start >= self.end:
            raise Exception('start must be lower then end')
        self.properties = [
            PropertyAnnotation({**x, 'tag_id': self.tag_id}, self.lookup)
            for x in attrs.get('properties', [])
        ]
        present_property_ids = {x.property_id for x in self.properties}
        for required_property_id in self.lookup[self.tag_id].required_properties:  # noqa
            if required_property_id not in present_property_ids:
                raise Exception(
                    'Missing required property: {}'.format(required_property_id))  # noqa

    @property
    def name(self):
        """Name of the tag definition this annotation refers to."""
        return self.lookup[self.tag_id].name

    def to_dict(self):
        """Return the annotation as a plain dict."""
        return {
            'tag_id': self.tag_id,
            'start': self.start,
            'end': self.end,
            'properties': [x.to_dict() for x in self.properties]
        }

    # Ordering: by start offset; at equal start a 'token' annotation sorts
    # before any annotation that is not a token.

    def __lt__(self, other):
        if self.start == other.start:
            return self.name == 'token' and other.name != 'token'
        return self.start < other.start

    def __le__(self, other):
        if self.start == other.start:
            return self.name == 'token' or other.name != 'token'
        return self.start < other.start

    def __eq__(self, other):
        return self.start == other.start and self.name == other.name

    def __ne__(self, other):
        # Must be the exact negation of __eq__.
        return not self.__eq__(other)

    def __gt__(self, other):
        if self.start == other.start:
            return self.name != 'token' and other.name == 'token'
        return self.start > other.start

    def __ge__(self, other):
        if self.start == other.start:
            return self.name != 'token' or other.name == 'token'
        return self.start > other.start


class PropertyAnnotation:
    """A property value attached to a ``TagAnnotation``.

    ``attrs`` must provide ``property_id``, ``tag_id`` and ``value``;
    ``lookup`` maps tag ids to ``TagDefinition`` objects.

    Raises:
        Exception: if the tag definition has no property with this id.
    """

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.property_id = attrs['property_id']
        self.tag_id = attrs['tag_id']
        if self.property_id not in self.lookup[self.tag_id].properties:
            raise Exception('Unknown property id: {}'.format(self.property_id))
        self.value = attrs['value']
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)

    @property
    def name(self):
        """Name of the property definition this value belongs to."""
        return self.lookup[self.tag_id].properties[self.property_id].name

    def to_dict(self):
        """Return the property annotation as a plain dict."""
        return {
            'property_id': self.property_id,
            'tag_id': self.tag_id,
            'value': self.value
        }


class TagDefinition:
    """Definition of a tag: id, name, description and its properties.

    ``attrs`` must provide ``id`` and ``name`` and may provide
    ``description`` and ``properties`` (dicts accepted by
    ``PropertyDefinition``).
    """

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        # Property definitions indexed by their id.
        self.properties = {}
        for raw_property in attrs.get('properties', []):
            property_definition = PropertyDefinition(raw_property)
            self.properties[property_definition.id] = property_definition

    @property
    def required_properties(self):
        """Dict (id -> definition) of the properties flagged as required."""
        return {
            definition.id: definition
            for definition in self.properties.values()
            if definition.is_required
        }


class PropertyDefinition:
    """Definition of a tag property: id, name, description, flags, labels.

    ``attrs`` must provide ``id`` and ``name`` and may provide
    ``description``, ``flags`` and ``labels``.
    """

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        self.flags = attrs.get('flags', [])
        self.labels = attrs.get('labels', [])

    @property
    def is_required(self):
        """True when 'required' is among the flags."""
        return 'required' in self.flags

    @property
    def has_multiple_values(self):
        """True when 'multiple' is among the flags."""
        return 'multiple' in self.flags