def create_vrt(text, stand_off_data):
    """Prepare stand-off annotations for CWB's verticalized text (.vrt) format.

    The annotations are divided into positional attributes (annotations named
    ``'token'``) and structural attributes (everything else), sorted, checked
    for overlapping tokens, and the structural attributes are widened in place
    so that they never start or end inside a token.

    NOTE: the actual .vrt serialization is not implemented yet (see the TODO
    at the end), so the function currently returns ``None`` and ``text`` is
    unused.

    Args:
        text: The annotated text. Currently unused.
        stand_off_data: Object exposing an iterable ``annotations`` attribute.
            Every annotation must provide ``name``, ``start`` and ``end`` and
            must be orderable, because the annotations are sorted with
            ``list.sort()``.

    Raises:
        Exception: If two neighbouring positional attributes overlap.
    """
    # Divide annotations into positional and structural attributes
    p_attrs = []  # positional attributes
    s_attrs = []  # structural attributes
    for annotation in stand_off_data.annotations:
        if annotation.name == 'token':
            p_attrs.append(annotation)
        else:
            s_attrs.append(annotation)

    # Sort annotations, necessary for the next checks.
    # NOTE(review): the checks below assume the annotations' ordering is by
    # text position - confirm against the annotation class.
    p_attrs.sort()
    s_attrs.sort()

    # Check for p_attr<->p_attr overlap
    for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
        # NOTE(review): the comparisons are inclusive, so a token whose end
        # equals the next token's start is reported as overlapping - confirm
        # that this matches the offset convention of the stand-off data.
        starts_within_next = (
            next_p_attr.start <= p_attr.start <= next_p_attr.end
        )
        ends_within_next = (
            next_p_attr.start <= p_attr.end <= next_p_attr.end
        )
        if starts_within_next or ends_within_next:
            raise Exception('Positional attribute overlaps another')

    # Check for s_attr<->p_attr overlap
    for s_attr in s_attrs:
        for p_attr in p_attrs:
            if p_attr.end <= s_attr.start:
                # Token lies completely before the s_attr: it cannot be cut
                # by it, but later tokens still can, so keep scanning.
                continue
            if p_attr.start >= s_attr.end:
                # Token lies completely after the s_attr. No further checking
                # needed (just because p_attrs are sorted).
                break
            # s_attr starts within p_attr: move its start to p_attr's start
            if p_attr.start < s_attr.start < p_attr.end:
                s_attr.start = p_attr.start
            # s_attr ends within p_attr: move its end to p_attr's end
            if p_attr.start < s_attr.end < p_attr.end:
                s_attr.end = p_attr.end

    # Map each offset to the indices (into s_attrs) of the structural
    # attributes that start/end there.
    s_attr_start_buffer = {}
    s_attr_end_buffer = {}
    for i, s_attr in enumerate(s_attrs):
        s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
        s_attr_end_buffer.setdefault(s_attr.end, []).append(i)

    vrt = ''
    # TODO do the work!