# NOTE(review): `current_app` was imported by the original file. Its call
# sites (presumably logging) fell inside a region of this file that was lost
# to markup stripping and have not been re-invented here; the import is kept
# so that nothing else relying on it breaks.
from flask import current_app


# Labels used to recognise which column holds the `simple_pos` attribute.
_SIMPLE_POS_LABELS = frozenset([
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
])

# Column order written to the normalized file.
# NOTE(review): inferred from the format history below (the newest layout),
# with the `ner` column turned into <ent> regions - confirm against the
# consumers of the normalized file.
_TARGET_POS_ATTR_ORDER = ('word', 'pos', 'lemma', 'simple_pos')

# Structural tags whose attributes are dropped on output.
# NOTE(review): the tag literals of the original were stripped from the file
# (only `''` / `'\n'` remained); these prefixes and replacements are a
# reconstruction and must be checked against the upstream version.
_STRUCTURAL_TAG_REWRITES = (
    ('<text', '<text>\n'),
    ('</text>', '</text>\n'),
    ('<s', '<s>\n'),
    ('</s>', '</s>\n'),
)


def _detect_pos_attribute_order(vrt_lines):
    """Return the positional attribute order used in *vrt_lines*, or None.

    The following orders are possible:
      since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
      since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
      since 27.01.2022: 'word,pos,lemma,simple_pos'

    The order is guessed from the number of tab-separated columns and from
    the column in which a simple_pos label is found. The first token line
    that allows a decision wins.
    """
    for line in vrt_lines:
        if line.startswith('<'):
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if len(pos_attrs) == 4:
            if pos_attrs[3] in _SIMPLE_POS_LABELS:
                return ['word', 'pos', 'lemma', 'simple_pos']
        elif len(pos_attrs) == 5:
            if pos_attrs[2] in _SIMPLE_POS_LABELS:
                return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
            if pos_attrs[3] in _SIMPLE_POS_LABELS:
                return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
    return None


def _has_ent_as_s_attr(vrt_lines):
    """Return True if entities are already encoded as <ent> tag lines.

    NOTE(review): the body of this check was lost; testing for lines that
    start with '<ent' is inferred from the function name and from the
    '<ent ...>' regions the main loop generates otherwise.
    """
    return any(line.startswith('<ent') for line in vrt_lines)


def _normalize_vrt_lines(vrt_lines):
    """Convert the lines of a VRT file into the normalized layout.

    Returns a list of output lines. Raises ValueError when the positional
    attribute order cannot be detected.
    """
    pos_attr_order = _detect_pos_attribute_order(vrt_lines)
    if pos_attr_order is None:
        # NOTE(review): the original's handling of this case was lost.
        raise ValueError(
            'Can not detect the positional attribute order of the VRT data'
        )
    has_ent_as_s_attr = _has_ent_as_s_attr(vrt_lines)

    # Source column index for every target column, computed once.
    column_indexes = [
        pos_attr_order.index(name) for name in _TARGET_POS_ATTR_ORDER
    ]

    def pos_attrs_to_string_function(pos_attrs):
        return '\t'.join(pos_attrs[i] for i in column_indexes) + '\n'

    output = []
    current_ent = None
    multi_line_tag_definition = False
    for line in vrt_lines:
        if multi_line_tag_definition:
            # Skip EVERY continuation line of a tag that spans several
            # lines. Previously only the closing line was skipped, so the
            # lines in between were misread as token rows.
            if line.rstrip().endswith('>'):
                multi_line_tag_definition = False
            continue
        if not line.strip():
            # A blank line carries no token and no tag.
            continue
        if line.startswith('<'):
            if not has_ent_as_s_attr and current_ent is not None:
                # Close a generated entity region before any structural
                # tag so it cannot leak across sentence/text boundaries.
                output.append('</ent>\n')
                current_ent = None
            if not line.rstrip().endswith('>'):
                multi_line_tag_definition = True
            if line.startswith(('<ent', '</ent>')):
                # Existing entity regions are passed through unchanged.
                output.append(line)
            else:
                for prefix, replacement in _STRUCTURAL_TAG_REWRITES:
                    if line.startswith(prefix):
                        output.append(replacement)
                        break
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if not has_ent_as_s_attr and len(pos_attrs) > 4:
            # The fifth column is the ner label; turn runs of equal labels
            # into <ent> regions.
            ner = pos_attrs[4]
            if ner.lower() in ('null', 'none'):
                if current_ent:
                    output.append('</ent>\n')
                    current_ent = None
            elif current_ent is None:
                # NOTE(review): the opening tag literal was stripped from
                # the original; the attribute name `type` is inferred.
                output.append(f'<ent type="{ner}">\n')
                current_ent = ner
            elif current_ent != ner:
                output.append('</ent>\n')
                output.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output.append(pos_attrs_to_string_function(pos_attrs))
    if current_ent is not None:
        # The input ended while an entity region was still open.
        output.append('</ent>\n')
    return output


def normalize_vrt_file(input_file, output_file):
    """Read the VRT file *input_file* and write a normalized copy.

    The positional attributes of every token line are reordered to
    'word, pos, lemma, simple_pos'. If the input carries a `ner` column
    instead of <ent> tag lines, that column is converted into <ent> regions.

    Args:
        input_file: path of the VRT file to read.
        output_file: path the normalized VRT is written to.

    Raises:
        ValueError: if the positional attribute order cannot be detected.
    """
    with open(input_file) as f:
        vrt_lines = f.readlines()
    normalized_lines = _normalize_vrt_lines(vrt_lines)
    with open(output_file, 'w') as f:
        f.writelines(normalized_lines)