from flask import current_app


def normalize_vrt_file(input_file, output_file):
    """Rewrite a VRT file so its token lines read word, pos, lemma, simple_pos.

    The input is read completely, the order of its positional attributes is
    detected, and the result is written to ``output_file``:

    * ``<text ...>`` and ``<s ...>`` tags are written without attributes,
      ``<ent ...>``/``</ent>`` lines are copied verbatim, any other tag line
      is dropped.
    * Token lines are reduced to four tab separated columns in the order
      ``word, pos, lemma, simple_pos``.
    * If the input has no ``<ent`` tags but carries a ``ner`` column,
      ``<ent type="...">``/``</ent>`` tags are generated from that column.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the normalized VRT is written to.

    Raises:
        Exception: If the attribute order of the input can not be detected
            ('Can not handle format').
    """

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This function tries to find out which order we have by looking at
        # the number of attributes and the position of the simple_pos
        # attribute. It returns None if no token line allows a decision.
        SIMPLE_POS_LABELS = [
            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
            'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'
        ]
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in SIMPLE_POS_LABELS:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        # True as soon as one line opens an <ent ...> tag.
        return any(line.startswith('<ent') for line in vrt_lines)

    def pos_attrs_to_string_1(pos_attrs):
        # Input order: word, lemma, simple_pos, pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Input order: word, pos, lemma, simple_pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    current_app.logger.info(f'Converting {input_file}...')

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)

    # Fail with the intended error before the log call below tries to join
    # a None value (which used to surface as an unrelated TypeError).
    if pos_attr_order is None:
        raise Exception('Can not handle format')

    current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_2
    elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']:
        pos_attrs_to_string_function = pos_attrs_to_string_2
    else:
        raise Exception('Can not handle format')

    # <ent> tags can only be generated when the input does not already have
    # them AND a ner column exists. Without the second condition a four
    # column file without <ent> tags failed with an IndexError.
    generate_ent_tags = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    multi_line_tag_definition = False
    # Collect the output pieces and join them once at the end instead of
    # growing a string inside the loop.
    output_parts = []

    for line in input_vrt_lines:
        if line.strip() == '':
            continue

        if line.startswith('<'):
            # Every tag line ends a generated entity.
            if generate_ent_tags and current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            if not line.rstrip().endswith('>'):
                multi_line_tag_definition = True
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('</text>'):
                output_parts.append('</text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            elif line.startswith('</s>'):
                output_parts.append('</s>\n')
            elif line.startswith('<ent'):
                output_parts.append(line)
            elif line.startswith('</ent>'):
                output_parts.append(line)
            continue

        # NOTE(review): only the line that closes a multi line tag definition
        # is skipped here; continuation lines in between that do not end
        # with '>' are still treated as token lines. Behaviour kept as is.
        if multi_line_tag_definition and line.rstrip().endswith('>'):
            multi_line_tag_definition = False
            continue

        pos_attrs = line.rstrip('\n').split('\t')

        if generate_ent_tags:
            ner_label = pos_attrs[4]
            if ner_label.lower() in ['null', 'none']:
                # Token is outside of any entity.
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            elif current_ent is None:
                output_parts.append(f'<ent type="{ner_label}">\n')
                current_ent = ner_label
            elif current_ent != ner_label:
                # Entity type changed without a gap: close and reopen.
                output_parts.append('</ent>\n')
                output_parts.append(f'<ent type="{ner_label}">\n')
                current_ent = ner_label

        output_parts.append(pos_attrs_to_string_function(pos_attrs))

    # Close an entity that is still open when the input ends on a token line.
    if current_ent is not None:
        output_parts.append('</ent>\n')

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))