from xml.sax.saxutils import escape


class StandOffData:
    """Stand-off annotations (tag definitions + annotations) for one text.

    ``attrs`` is a dict with the optional keys ``'meta'`` (free-form dict),
    ``'tags'`` (list of tag definition dicts, see ``TagDefinition``) and
    ``'annotations'`` (list of annotation dicts, see ``TagAnnotation``).
    """

    # Extra entity so attribute values can be safely wrapped in double quotes
    _ATTRIBUTE_ENTITIES = {'"': '&quot;'}

    def __init__(self, attrs=None):
        attrs = {} if attrs is None else attrs
        self.meta = attrs.get('meta', {})
        self.lookup = {}
        # Defined before tags are loaded so to_dict() never hits a missing
        # attribute while the object is still being constructed.
        self.annotations = []
        for tag_attrs in attrs.get('tags', []):
            self.add_tag_definition(tag_attrs)
        self.annotations = [
            TagAnnotation(x, self.lookup)
            for x in attrs.get('annotations', [])
        ]

    def add_tag_definition(self, attrs):
        """Register a tag definition built from ``attrs``.

        Raises:
            Exception: if a definition with the same id is already known.
        """
        tag_definition = TagDefinition(attrs)
        if tag_definition.id in self.lookup:
            raise Exception(
                f'Tag id already in use: {tag_definition.to_dict()}'
            )
        self.lookup[tag_definition.id] = tag_definition

    def to_dict(self):
        """Return meta, tag definitions and annotations as plain dicts."""
        return {
            'meta': self.meta,
            'lookup': {k: v.to_dict() for k, v in self.lookup.items()},
            'annotations': [x.to_dict() for x in self.annotations]
        }

    def to_vrt(self, text):
        """Render ``text`` with its annotations in CWB's .vrt format.

        Annotations named ``'token'`` become positional attribute lines
        (``word<TAB>pos<TAB>lemma<TAB>simple_pos``); every other annotation
        becomes a pair of structural start/end tags. Structural boundaries
        that fall inside a token are moved outwards to the token boundary;
        this updates ``start``/``end`` on the annotation objects themselves.

        Raises:
            Exception: if two neighbouring tokens overlap.
        """
        # Divide annotations into positional and structural attributes
        p_attrs = []  # positional attributes
        s_attrs = []  # structural attributes
        for annotation in self.annotations:
            if annotation.name == 'token':
                p_attrs.append(annotation)
            else:
                s_attrs.append(annotation)
        # Sort annotations, necessary for the next checks
        p_attrs.sort()
        s_attrs.sort()
        self._check_p_attr_overlap(p_attrs)
        self._snap_s_attrs_to_p_attrs(s_attrs, p_attrs)
        # Snapping may have changed start/end values, so restore the
        # outer-before-inner order the tag buffers below rely on.
        s_attrs.sort()

        p_attr_by_start = {p_attr.start: i for i, p_attr in enumerate(p_attrs)}
        s_attr_starts = {}
        s_attr_ends = {}
        for i, s_attr in enumerate(s_attrs):
            s_attr_starts.setdefault(s_attr.start, []).append(i)
            # Later (inner) structures must be closed first
            s_attr_ends.setdefault(s_attr.end, []).insert(0, i)

        # NOTE(review): the leading/trailing blank lines look like
        # placeholders for an enclosing element - confirm intended output.
        parts = ['\n']
        current_position = 0
        text_len = len(text)
        while current_position <= text_len:
            # s_attr endings
            for s_attr_index in s_attr_ends.get(current_position, ()):
                s_attr = s_attrs[s_attr_index]
                parts.append(f'</{escape(s_attr.name)}>\n')
            # s_attr starts
            for s_attr_index in s_attr_starts.get(current_position, ()):
                s_attr = s_attrs[s_attr_index]
                parts.append(f'<{escape(s_attr.name)}')
                for prop in s_attr.properties:
                    value = escape(str(prop.value), self._ATTRIBUTE_ENTITIES)
                    parts.append(f' {escape(prop.name)}="{value}"')
                parts.append('>\n')
            # p_attrs
            p_attr_index = p_attr_by_start.get(current_position)
            if p_attr_index is None:
                current_position += 1
                continue
            p_attr = p_attrs[p_attr_index]
            # Jump to the token end; boundaries inside were snapped outwards
            current_position = p_attr.end
            word = text[p_attr.start:p_attr.end]
            if word.isspace():
                continue
            fields = {
                'lemma': 'None',
                'pos': 'None',
                'simple_pos': 'None',
                'word': 'None'
            }
            for prop in p_attr.properties:
                if prop.name in fields:
                    fields[prop.name] = escape(str(prop.value))
            # The surface form always comes from the text itself
            fields['word'] = escape(word)
            parts.append(
                '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**fields)
            )
        parts.append('\n')
        return ''.join(parts)

    @staticmethod
    def _check_p_attr_overlap(p_attrs):
        """Raise if a token in sorted ``p_attrs`` overlaps its successor."""
        for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
            starts_within = (
                next_p_attr.start <= p_attr.start < next_p_attr.end
            )
            ends_within = next_p_attr.start < p_attr.end <= next_p_attr.end
            if starts_within or ends_within:
                raise Exception(
                    'Positional attribute overlaps another: '
                    f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}'
                )

    @staticmethod
    def _snap_s_attrs_to_p_attrs(s_attrs, p_attrs):
        """Move structural boundaries lying inside a token to its edges."""
        for s_attr in s_attrs:
            for p_attr in p_attrs:
                # s_attr starts within p_attr: widen to the token start
                if p_attr.start < s_attr.start < p_attr.end:
                    s_attr.start = p_attr.start
                # s_attr ends within p_attr: widen to the token end
                if p_attr.start < s_attr.end < p_attr.end:
                    s_attr.end = p_attr.end
                # p_attrs are sorted, so only tokens starting at or after
                # the end of s_attr rule out further overlaps. Tokens lying
                # before s_attr must be skipped, not treated as a stop.
                if p_attr.start >= s_attr.end:
                    break


class TagAnnotation:
    """A single annotation: a tag applied to the span ``start``..``end``.

    ``attrs`` needs the keys ``'tag_id'``, ``'start'`` and ``'end'`` and may
    carry a list of property dicts under ``'properties'``. ``lookup`` maps
    tag ids to ``TagDefinition`` objects.
    """

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.tag_id = attrs['tag_id']
        self.start = attrs['start']
        self.end = attrs['end']
        self.properties = []
        # Sanity check first: building the properties indexes the lookup
        if self.tag_id not in self.lookup:
            raise Exception(f'Unknown tag: {self.to_dict()}')
        property_lookup = self.lookup[self.tag_id].properties
        self.properties = [
            PropertyAnnotation(x, property_lookup)
            for x in attrs.get('properties', [])
        ]
        if self.end < self.start:
            raise Exception(f'Annotation end less then start: {self.to_dict()}')  # noqa
        # TODO: Check that all required properties of the tag are present

    @property
    def name(self):
        """Name of the tag definition this annotation refers to."""
        return self.lookup[self.tag_id].name

    def to_dict(self):
        """Return the annotation as a plain dict."""
        return {
            'tag_id': self.tag_id,
            'start': self.start,
            'end': self.end,
            'properties': [x.to_dict() for x in self.properties]
        }

    def _sort_key(self):
        # Order by start; at the same start non-tokens come before tokens
        # and, within the same kind, the longer annotation comes first.
        return (self.start, self.name == 'token', -self.end)

    def __lt__(self, other):
        if not isinstance(other, TagAnnotation):
            return NotImplemented
        return self._sort_key() < other._sort_key()

    def __le__(self, other):
        if not isinstance(other, TagAnnotation):
            return NotImplemented
        return self._sort_key() <= other._sort_key()

    def __eq__(self, other):
        if not isinstance(other, TagAnnotation):
            return NotImplemented
        return self._sort_key() == other._sort_key()

    def __ne__(self, other):
        return not self == other

    def __gt__(self, other):
        return not self <= other

    def __ge__(self, other):
        return not self < other


class PropertyAnnotation:
    """A property value attached to a ``TagAnnotation``.

    ``attrs`` needs the keys ``'property_id'`` and ``'value'``. ``lookup``
    maps property ids to ``PropertyDefinition`` objects.
    """

    def __init__(self, attrs, lookup):
        self.lookup = lookup
        self.property_id = attrs['property_id']
        self.value = attrs['value']
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
        # Sanity checks
        if self.property_id not in self.lookup:
            raise Exception(f'Unknown property: {self.to_dict()}')

    @property
    def name(self):
        """Name of the property definition this annotation refers to."""
        return self.lookup[self.property_id].name

    def to_dict(self):
        """Return the property annotation as a plain dict."""
        # No 'tag_id' entry: this object never stores one
        return {
            'property_id': self.property_id,
            'value': self.value
        }


class TagDefinition:
    """Definition of a tag and the properties it may carry.

    ``attrs`` needs the keys ``'id'`` and ``'name'``; ``'description'`` and
    ``'properties'`` (list of property definition dicts) are optional.
    """

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        self.properties = {}
        for property_attrs in attrs.get('properties', []):
            self.add_property_definition(property_attrs)

    def add_property_definition(self, attrs):
        """Register a property definition built from ``attrs``.

        Raises:
            Exception: if a property with the same id is already known.
        """
        property_definition = PropertyDefinition(attrs)
        if property_definition.id in self.properties:
            raise Exception(
                f'Property id already in use: {property_definition.to_dict()}')
        self.properties[property_definition.id] = property_definition

    # TODO: required_properties (properties flagged as required)

    def to_dict(self):
        """Return the tag definition as a plain dict."""
        return {
            'id': self.id,
            'name': self.name,
            'description': self.description,
            'properties': {k: v.to_dict() for k, v in self.properties.items()}
        }


class PropertyDefinition:
    """Definition of a property that can be attached to a tag.

    ``attrs`` needs the keys ``'id'`` and ``'name'``; ``'description'``,
    ``'flags'`` and ``'labels'`` are optional.
    """

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        self.flags = attrs.get('flags', [])
        self.labels = attrs.get('labels', [])

    # TODO: is_required ('required' in self.flags)

    @property
    def has_multiple_values(self):
        """True if the ``'multiple'`` flag is set."""
        return 'multiple' in self.flags

    def to_dict(self):
        """Return the property definition as a plain dict."""
        return {
            'id': self.id,
            'name': self.name,
            'description': self.description,
            'flags': self.flags,
            'labels': self.labels
        }