nopaque/app/converters/vrt.py

from flask import current_app
from pathlib import Path


def normalize_vrt_file(input_file: Path, output_file: Path):
    """Convert a VRT file to the normalized layout and write the result.

    The positional attribute order and the entity encoding are detected from
    the input lines. Every token line is written with four tab-separated
    columns in the order word, pos, lemma, simple_pos. Lines starting with
    ``<text`` or ``<s`` are reduced to bare ``<text>`` / ``<s>`` tags,
    ``<ent``/``</ent>`` lines are copied verbatim and every other tag line is
    dropped. When the input contains no ``<ent`` lines but token lines carry
    a fifth column, runs of tokens sharing the same value (other than
    ``null``/``none``, case-insensitive) are wrapped in
    ``<ent type="...">`` ... ``</ent>``.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the normalized VRT content is written to.

    Raises:
        Exception: If the attribute order could not be determined or is not
            one of the supported layouts.
    """
    current_app.logger.info(f'Converting {input_file}...')

    with input_file.open() as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = _check_pos_attribute_order(input_vrt_lines)
    has_ent_as_s_attr = _check_has_ent_as_s_attr(input_vrt_lines)

    if pos_attr_order is None:
        # No token line revealed the column order. Fail with the same error
        # as for an unsupported order instead of a TypeError from the
        # ",".join(...) in the log call below.
        raise Exception('Can not handle format')

    current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = _pos_attrs_to_string_1
    elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']:
        pos_attrs_to_string_function = _pos_attrs_to_string_2
    elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']:
        pos_attrs_to_string_function = _pos_attrs_to_string_2
    else:
        raise Exception('Can not handle format')

    current_ent = None
    multi_line_tag_definition = False
    # Collect output pieces and join once at the end instead of growing a
    # string with += for every input line.
    output_parts: list[str] = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # Any tag line ends an entity that was opened from the ner column.
            if not has_ent_as_s_attr and current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            if not line.rstrip().endswith('>'):
                # The tag definition continues on the following line(s).
                multi_line_tag_definition = True
            # Prefix checks: attributes on <text ...> and <s ...> are dropped.
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('</text>'):
                output_parts.append('</text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            elif line.startswith('</s>'):
                output_parts.append('</s>\n')
            elif line.startswith(('<ent', '</ent>')):
                output_parts.append(line)
            continue
        if multi_line_tag_definition and line.rstrip().endswith('>'):
            # Last line of a multi-line tag definition.
            multi_line_tag_definition = False
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if multi_line_tag_definition and len(pos_attrs) < 4:
            # Middle line of a multi-line tag definition. It has too few
            # columns to be a token line and would otherwise raise an
            # IndexError in the formatting function below.
            continue
        if not has_ent_as_s_attr and len(pos_attrs) > 4:
            ner = pos_attrs[4]
            if ner.lower() in ['null', 'none']:
                if current_ent:
                    output_parts.append('</ent>\n')
                    current_ent = None
            elif current_ent is None:
                output_parts.append(f'<ent type="{ner}">\n')
                current_ent = ner
            elif current_ent != ner:
                # The entity type changed without a gap: close and reopen.
                output_parts.append('</ent>\n')
                output_parts.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_parts.append(pos_attrs_to_string_function(pos_attrs))

    with output_file.open(mode='w') as f:
        f.write(''.join(output_parts))


def _check_pos_attribute_order(vrt_lines: list[str]) -> list[str]:
    # The following orders are possible:
    # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
    # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
    # since 27.01.2022: 'word,pos,lemma,simple_pos'
    # This Function tries to find out which order we have by looking at the
    # number of attributes and the position of the simple_pos attribute
    SIMPLE_POS_LABELS = [
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
        'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'
    ]
    for line in vrt_lines:
        if line.startswith('<'):
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        num_pos_attrs = len(pos_attrs)
        if num_pos_attrs == 4:
            if pos_attrs[3] in SIMPLE_POS_LABELS:
                return ['word', 'pos', 'lemma', 'simple_pos']
            continue
        elif num_pos_attrs == 5:
            if pos_attrs[2] in SIMPLE_POS_LABELS:
                return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
            elif pos_attrs[3] in SIMPLE_POS_LABELS:
                return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
            continue
    # TODO: raise exception "can't determine attribute order"


def _check_has_ent_as_s_attr(vrt_lines: list[str]) -> bool:
    for line in vrt_lines:
        if line.startswith('<ent'):
            return True
    return False


def _pos_attrs_to_string_1(pos_attrs: list[str]) -> str:
    return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'


def _pos_attrs_to_string_2(pos_attrs: list[str]) -> str:
    return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'
normalize vrt on build 2022-04-12 14:11:40 +00:00			`from flask import current_app`
Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00			`from pathlib import Path`
normalize vrt on build 2022-04-12 14:11:40 +00:00

Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00			`def normalize_vrt_file(input_file: Path, output_file: Path):`
normalize vrt on build 2022-04-12 14:11:40 +00:00			`current_app.logger.info(f'Converting {input_file}...')`

Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00			`with input_file.open() as f:`
normalize vrt on build 2022-04-12 14:11:40 +00:00			`input_vrt_lines = f.readlines()`

Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00			`pos_attr_order = _check_pos_attribute_order(input_vrt_lines)`
			`has_ent_as_s_attr = _check_has_ent_as_s_attr(input_vrt_lines)`
normalize vrt on build 2022-04-12 14:11:40 +00:00
			`current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')`
			`current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')`

			`if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:`
Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00			`pos_attrs_to_string_function = _pos_attrs_to_string_1`
normalize vrt on build 2022-04-12 14:11:40 +00:00			`elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']:`
Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00			`pos_attrs_to_string_function = _pos_attrs_to_string_2`
normalize vrt on build 2022-04-12 14:11:40 +00:00			`elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']:`
Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00			`pos_attrs_to_string_function = _pos_attrs_to_string_2`
normalize vrt on build 2022-04-12 14:11:40 +00:00			`else:`
			`raise Exception('Can not handle format')`

			`current_ent = None`
			`multi_line_tag_definition = False`
			`output_vrt = ''`
			`for line in input_vrt_lines:`
			`if line.strip() == '':`
			`continue`
			`if line.startswith('<'):`
			`if not has_ent_as_s_attr:`
			`if current_ent is not None:`
			`output_vrt += '</ent>\n'`
			`current_ent = None`
			`if not line.rstrip().endswith('>'):`
			`multi_line_tag_definition = True`
			`if line.startswith('<text'):`
			`output_vrt += '<text>\n'`
			`if line.startswith('</text>'):`
			`output_vrt += '</text>\n'`
			`elif line.startswith('<s'):`
			`output_vrt += '<s>\n'`
			`elif line.startswith('</s>'):`
			`output_vrt += '</s>\n'`
			`elif line.startswith('<ent'):`
			`output_vrt += line`
			`elif line.startswith('</ent>'):`
			`output_vrt += line`
			`continue`
			`if multi_line_tag_definition and line.rstrip().endswith('>'):`
			`multi_line_tag_definition = False`
			`continue`
			`pos_attrs = line.rstrip('\n').split('\t')`
Fix corpus building process for vrt files without entities 2022-11-25 09:46:46 +00:00			`if not has_ent_as_s_attr and len(pos_attrs) > 4:`
normalize vrt on build 2022-04-12 14:11:40 +00:00			`if pos_attrs[4].lower() in ['null', 'none']:`
			`if current_ent:`
			`output_vrt += '</ent>\n'`
			`current_ent = None`
			`else:`
			`if current_ent is None:`
			`output_vrt += f'<ent type="{pos_attrs[4]}">\n'`
			`current_ent = pos_attrs[4]`
			`elif current_ent != pos_attrs[4]:`
			`output_vrt += '</ent>\n'`
			`current_ent = None`
			`output_vrt += f'<ent type="{pos_attrs[4]}">\n'`
			`current_ent = pos_attrs[4]`
			`output_vrt += pos_attrs_to_string_function(pos_attrs)`

Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00			`with output_file.open(mode='w') as f:`
normalize vrt on build 2022-04-12 14:11:40 +00:00			`f.write(output_vrt)`
Code enhancements in vrt file normalizer module 2024-11-07 09:40:25 +00:00

			`def _check_pos_attribute_order(vrt_lines: list[str]) -> list[str]:`
			`# The following orders are possible:`
			`# since 26.02.2019: 'word,lemma,simple_pos,pos,ner'`
			`# since 26.03.2021: 'word,pos,lemma,simple_pos,ner'`
			`# since 27.01.2022: 'word,pos,lemma,simple_pos'`
			`# This Function tries to find out which order we have by looking at the`
			`# number of attributes and the position of the simple_pos attribute`
			`SIMPLE_POS_LABELS = [`
			`'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',`
			`'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'`
			`]`
			`for line in vrt_lines:`
			`if line.startswith('<'):`
			`continue`
			`pos_attrs = line.rstrip('\n').split('\t')`
			`num_pos_attrs = len(pos_attrs)`
			`if num_pos_attrs == 4:`
			`if pos_attrs[3] in SIMPLE_POS_LABELS:`
			`return ['word', 'pos', 'lemma', 'simple_pos']`
			`continue`
			`elif num_pos_attrs == 5:`
			`if pos_attrs[2] in SIMPLE_POS_LABELS:`
			`return ['word', 'lemma', 'simple_pos', 'pos', 'ner']`
			`elif pos_attrs[3] in SIMPLE_POS_LABELS:`
			`return ['word', 'pos', 'lemma', 'simple_pos', 'ner']`
			`continue`
			`# TODO: raise exception "can't determine attribute order"`


			`def _check_has_ent_as_s_attr(vrt_lines: list[str]) -> bool:`
			`for line in vrt_lines:`
			`if line.startswith('<ent'):`
			`return True`
			`return False`


			`def _pos_attrs_to_string_1(pos_attrs: list[str]) -> str:`
			`return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'`


			`def _pos_attrs_to_string_2(pos_attrs: list[str]) -> str:`
			`return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'`