from flask import current_app


def normalize_vrt_file(input_file, output_file):
    """Convert a VRT file to the ``word, pos, lemma, simple_pos`` column layout.

    The input is read completely, the order of its positional attributes is
    detected, and a normalized copy is written to ``output_file``:

    * token lines are rewritten as ``word<TAB>pos<TAB>lemma<TAB>simple_pos``,
    * ``<text ...>`` and ``<s ...>`` tags are reduced to bare ``<text>`` and
      ``<s>`` (their closing tags are kept),
    * ``<ent ...>`` / ``</ent>`` lines are copied unchanged,
    * all other tags and blank lines are dropped,
    * if the input has a ``ner`` column but no ``<ent`` tags, ``<ent type="...">``
      / ``</ent>`` tags are generated from runs of equal ``ner`` values.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the normalized VRT is written to.

    Raises:
        Exception: If the positional attribute order cannot be detected
            ('Can not handle format').
        IndexError: If a token line has fewer columns than the detected
            format requires.
    """
    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This function tries to find out which order we have by looking at
        # the number of attributes and the position of the simple_pos
        # attribute. Returns None if no token line allows a decision.
        SIMPLE_POS_LABELS = {
            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
            'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
            'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
            'VERB', 'X',
        }
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in SIMPLE_POS_LABELS:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        # True if the input already carries entities as <ent ...> tags.
        return any(line.startswith('<ent') for line in vrt_lines)

    def pos_attrs_to_string_1(pos_attrs):
        # Input order: word, lemma, simple_pos, pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Input order: word, pos, lemma, simple_pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    current_app.logger.info(f'Converting {input_file}...')

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)

    # Fail with the intended error *before* logging: joining None for the log
    # message would otherwise raise an unrelated TypeError.
    if pos_attr_order is None:
        raise Exception('Can not handle format')

    current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_2
    elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']:
        pos_attrs_to_string_function = pos_attrs_to_string_2
    else:
        raise Exception('Can not handle format')

    # <ent> tags can only be synthesized when the input has a ner column and
    # does not already provide them. A 4-column file without any <ent> tag
    # simply contains no entity information.
    derive_ents_from_ner = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    multi_line_tag_definition = False
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # Any tag ends a generated entity span.
            if derive_ents_from_ner and current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            # A tag that is not closed on its own line continues on the
            # following lines. A new tag line always resets this state.
            multi_line_tag_definition = not line.rstrip().endswith('>')
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('</text>'):
                output_parts.append('</text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            elif line.startswith('</s>'):
                output_parts.append('</s>\n')
            elif line.startswith('<ent'):
                output_parts.append(line)
            elif line.startswith('</ent>'):
                output_parts.append(line)
            continue
        if multi_line_tag_definition:
            # Continuation of a multi-line tag: skip every line of it, not
            # only the closing one, so attribute lines are never parsed as
            # tokens.
            if line.rstrip().endswith('>'):
                multi_line_tag_definition = False
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ents_from_ner:
            ner = pos_attrs[4]
            if ner.lower() in ('null', 'none'):
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            elif current_ent != ner:
                # Entity type changed (or a new entity starts).
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                output_parts.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_parts.append(pos_attrs_to_string_function(pos_attrs))

    # Close an entity that is still open when the input ends on a token line.
    if current_ent is not None:
        output_parts.append('</ent>\n')

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))