from xml.sax.saxutils import escape


def _start_tag(s_attr):
    """Return the opening tag line (with attributes) for a structural attribute."""
    attributes = ''.join(
        ' {}="{}"'.format(
            escape(prop.name),
            # The value is wrapped in double quotes, so a literal quote inside
            # it must be escaped as well or it would terminate the attribute.
            escape(prop.value, {'"': '&quot;'}),
        )
        for prop in s_attr.properties
    )
    return '<{}{}>\n'.format(escape(s_attr.name), attributes)


def _end_tag(s_attr):
    """Return the closing tag line for a structural attribute."""
    return '</{}>\n'.format(escape(s_attr.name))


def _token_line(text, p_attr):
    """Return the tab-separated line for one token (positional attribute)."""
    columns = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None}  # noqa
    for prop in p_attr.properties:
        if prop.name in ('lemma', 'ner', 'pos', 'simple_pos'):
            columns[prop.name] = escape(prop.value)
    columns['word'] = escape(text[p_attr.start:p_attr.end])
    # Columns without a matching property keep None and are rendered as the
    # string 'None' by str.format.
    return '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(**columns)


def create_vrt(text, stand_off_data):
    """Convert stand-off annotations of *text* into verticalized text (.vrt).

    Annotations named ``'token'`` are treated as positional attributes and
    produce one tab-separated line each (word, pos, lemma, simple_pos, ner).
    All other annotations are treated as structural attributes and produce an
    opening and a closing tag line around the tokens they cover.

    Args:
        text: The annotated string; token surface forms are taken from
            ``text[start:end]``, i.e. ``end`` is exclusive.
        stand_off_data: Object with an ``annotations`` iterable. Each
            annotation needs ``name``, ``start``, ``end`` and ``properties``
            (an iterable of objects with ``name`` and ``value``) and must be
            orderable, because the annotations are sorted with ``list.sort()``.

    Returns:
        The verticalized text as a single string.

    Raises:
        Exception: If two token annotations overlap.

    Note:
        Structural attributes whose boundaries fall inside a token are widened
        in place (their ``start``/``end`` are reassigned) to the token's
        boundaries.
    """
    # Divide annotations into CWB's verticalized text format (.vrt) logic
    p_attrs = []  # positional attributes
    s_attrs = []  # structural attributes
    for annotation in stand_off_data.annotations:
        if annotation.name == 'token':
            p_attrs.append(annotation)
        else:
            s_attrs.append(annotation)
    # Sort annotations, necessary for the next checks
    p_attrs.sort()
    s_attrs.sort()

    # Check for p_attr<->p_attr overlap. Intervals are half-open (see the
    # slicing in _token_line), so directly adjacent tokens do not overlap.
    for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
        if p_attr.start < next_p_attr.end and next_p_attr.start < p_attr.end:
            raise Exception('Positional attribute overlaps another')

    # Check for s_attr<->p_attr overlap and snap s_attr boundaries that fall
    # inside a token outwards to that token's boundaries.
    for s_attr in s_attrs:
        for p_attr in p_attrs:
            if p_attr.start >= s_attr.end:
                # p_attrs are sorted: every following token starts even later.
                break
            if p_attr.start < s_attr.start < p_attr.end:
                s_attr.start = p_attr.start
            if p_attr.start < s_attr.end < p_attr.end:
                s_attr.end = p_attr.end

    # Group the structural attributes by the position of their start/end.
    starts_at = {}
    ends_at = {}
    for s_attr in s_attrs:
        starts_at.setdefault(s_attr.start, []).append(s_attr)
        ends_at.setdefault(s_attr.end, []).append(s_attr)
    # Descending order so the smallest pending position is popped from the end.
    pending_starts = sorted(starts_at, reverse=True)
    pending_ends = sorted(ends_at, reverse=True)

    opened = set()  # ids of s_attrs whose start tag is written, end tag not
    empty = set()   # ids of s_attrs that end before they could be opened

    # NOTE(review): the first and last line are emitted as bare newlines,
    # exactly as found; if a wrapping root element was intended, restore it.
    lines = ['\n']

    def emit_s_attrs(limit):
        """Write all tags due at or before *limit* (None: everything left)."""
        # End tags first, so a region is closed before its successor opens.
        while pending_ends and (limit is None or pending_ends[-1] <= limit):
            # Reversed: attributes sorted later are closed first.
            for s_attr in reversed(ends_at[pending_ends.pop()]):
                if id(s_attr) in opened:
                    opened.discard(id(s_attr))
                    lines.append(_end_tag(s_attr))
                else:
                    # Covers no token; close it right after it is opened.
                    empty.add(id(s_attr))
        while pending_starts and (limit is None or pending_starts[-1] <= limit):  # noqa
            for s_attr in starts_at[pending_starts.pop()]:
                lines.append(_start_tag(s_attr))
                if id(s_attr) in empty:
                    lines.append(_end_tag(s_attr))
                else:
                    opened.add(id(s_attr))

    for p_attr in p_attrs:
        emit_s_attrs(p_attr.start)
        lines.append(_token_line(text, p_attr))
    # Close everything that is still open after the last token.
    emit_s_attrs(None)
    lines.append('\n')
    return ''.join(lines)