from pathlib import Path

from flask import current_app


def normalize_vrt_file(input_file: Path, output_file: Path):
    """Rewrite a VRT file into the normalized four-column layout.

    The whole input is read into memory, its positional-attribute (column)
    order and its entity encoding are detected, and a rewritten copy is
    stored at ``output_file``:

    * blank lines are dropped;
    * lines starting with ``<`` are treated as structural tags and are either
      reduced to a bare tag, copied through, or dropped (see the NOTE in the
      loop);
    * token lines are split on tabs and re-emitted through the formatter that
      matches the detected column order;
    * if the input does not already carry entities as tags and a token line
      has more than four columns, runs of tokens sharing the same fifth
      (``ner``) value are wrapped in entity tags. The values ``null`` and
      ``none`` (any case) mean "no entity".

    Progress is logged through ``current_app.logger``.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the result is written to (opened with mode ``'w'``).

    Raises:
        Exception: If the column order is not one of the three supported
            layouts, or could not be detected at all.
    """
    current_app.logger.info(f'Converting {input_file}...')

    with input_file.open() as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = _check_pos_attribute_order(input_vrt_lines)
    has_ent_as_s_attr = _check_has_ent_as_s_attr(input_vrt_lines)

    # The detector may come back without a result (None). Normalize that to
    # an empty list so the log line below cannot fail with a TypeError and
    # the case ends up in the regular "Can not handle format" error.
    detected_order = list(pos_attr_order or [])

    current_app.logger.info(
        f'Detected pos_attr_order: [{",".join(detected_order)}]'
    )
    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if detected_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = _pos_attrs_to_string_1
    elif detected_order in (
        ['word', 'pos', 'lemma', 'simple_pos', 'ner'],
        ['word', 'pos', 'lemma', 'simple_pos'],
    ):
        pos_attrs_to_string_function = _pos_attrs_to_string_2
    else:
        raise Exception('Can not handle format')

    # NOTE(review): every literal containing angle-bracket markup in the loop
    # below ('</ent>', '<ent type="...">' and the '<text' / '<s' / '<ent'
    # prefixes) was missing from the copy under review. The code there read
    # `line.startswith('')` (always true, which made the later branches
    # unreachable) and `f'\n'` (an f-string with nothing to interpolate).
    # The values used here are a reconstruction; confirm the tag names and
    # the `type` attribute against version control before merging.
    current_ent = None
    multi_line_tag_definition = False
    # Collect output pieces in a list; repeated `str +=` is quadratic on
    # large corpora.
    output_parts = []
    emit = output_parts.append

    for line in input_vrt_lines:
        if line.strip() == '':
            continue

        if line.startswith('<'):
            # A structural tag ends any entity that was opened from the
            # token column.
            if not has_ent_as_s_attr and current_ent is not None:
                emit('</ent>\n')
                current_ent = None
            if not line.rstrip().endswith('>'):
                # The tag continues on a following line.
                multi_line_tag_definition = True
            if line.startswith('<text'):
                emit('<text>\n')
            elif line.startswith('</text>'):
                emit('</text>\n')
            elif line.startswith('<s'):
                emit('<s>\n')
            elif line.startswith('</s>'):
                emit('</s>\n')
            elif line.startswith('<ent'):
                emit(line)
            elif line.startswith('</ent>'):
                emit(line)
            # Any other tag line is dropped.
            continue

        if multi_line_tag_definition and line.rstrip().endswith('>'):
            # Last line of a tag that was spread over several lines.
            multi_line_tag_definition = False
            continue
        # NOTE(review): a continuation line of a multi-line tag that does not
        # itself end with '>' is not skipped here and is handled as a token
        # line below -- confirm tags never span more than two lines.

        pos_attrs = line.rstrip('\n').split('\t')

        if not has_ent_as_s_attr and len(pos_attrs) > 4:
            ner_value = pos_attrs[4]
            if ner_value.lower() in ['null', 'none']:
                if current_ent:
                    emit('</ent>\n')
                    current_ent = None
            elif current_ent is None:
                emit(f'<ent type="{ner_value}">\n')
                current_ent = ner_value
            elif current_ent != ner_value:
                # The entity type changed without a gap: close, then reopen.
                emit('</ent>\n')
                emit(f'<ent type="{ner_value}">\n')
                current_ent = ner_value

        emit(pos_attrs_to_string_function(pos_attrs))

    with output_file.open(mode='w') as f:
        f.writelines(output_parts)


_check_pos_attribute_order(vrt_lines: list[str]) -> list[str]: # The following orders are possible: # since 26.02.2019: 'word,lemma,simple_pos,pos,ner' # since 26.03.2021: 'word,pos,lemma,simple_pos,ner' # since 27.01.2022: 'word,pos,lemma,simple_pos' # This Function tries to find out which order we have by looking at the # number of attributes and the position of the simple_pos attribute SIMPLE_POS_LABELS = [ 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X' ] for line in vrt_lines: if line.startswith('<'): continue pos_attrs = line.rstrip('\n').split('\t') num_pos_attrs = len(pos_attrs) if num_pos_attrs == 4: if pos_attrs[3] in SIMPLE_POS_LABELS: return ['word', 'pos', 'lemma', 'simple_pos'] continue elif num_pos_attrs == 5: if pos_attrs[2] in SIMPLE_POS_LABELS: return ['word', 'lemma', 'simple_pos', 'pos', 'ner'] elif pos_attrs[3] in SIMPLE_POS_LABELS: return ['word', 'pos', 'lemma', 'simple_pos', 'ner'] continue # TODO: raise exception "can't determine attribute order" def _check_has_ent_as_s_attr(vrt_lines: list[str]) -> bool: for line in vrt_lines: if line.startswith(' str: return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n' def _pos_attrs_to_string_2(pos_attrs: list[str]) -> str: return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'