nopaque/app/converters/vrt.py

118 lines
4.6 KiB
Python
Raw Permalink Normal View History

2022-04-12 14:11:40 +00:00
from flask import current_app
from pathlib import Path
2022-04-12 14:11:40 +00:00
def normalize_vrt_file(input_file: Path, output_file: Path):
2022-04-12 14:11:40 +00:00
current_app.logger.info(f'Converting {input_file}...')
with input_file.open() as f:
2022-04-12 14:11:40 +00:00
input_vrt_lines = f.readlines()
pos_attr_order = _check_pos_attribute_order(input_vrt_lines)
has_ent_as_s_attr = _check_has_ent_as_s_attr(input_vrt_lines)
2022-04-12 14:11:40 +00:00
current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')
if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
pos_attrs_to_string_function = _pos_attrs_to_string_1
2022-04-12 14:11:40 +00:00
elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']:
pos_attrs_to_string_function = _pos_attrs_to_string_2
2022-04-12 14:11:40 +00:00
elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']:
pos_attrs_to_string_function = _pos_attrs_to_string_2
2022-04-12 14:11:40 +00:00
else:
raise Exception('Can not handle format')
current_ent = None
multi_line_tag_definition = False
output_vrt = ''
for line in input_vrt_lines:
if line.strip() == '':
continue
if line.startswith('<'):
if not has_ent_as_s_attr:
if current_ent is not None:
output_vrt += '</ent>\n'
current_ent = None
if not line.rstrip().endswith('>'):
multi_line_tag_definition = True
if line.startswith('<text'):
output_vrt += '<text>\n'
if line.startswith('</text>'):
output_vrt += '</text>\n'
elif line.startswith('<s'):
output_vrt += '<s>\n'
elif line.startswith('</s>'):
output_vrt += '</s>\n'
elif line.startswith('<ent'):
output_vrt += line
elif line.startswith('</ent>'):
output_vrt += line
continue
if multi_line_tag_definition and line.rstrip().endswith('>'):
multi_line_tag_definition = False
continue
pos_attrs = line.rstrip('\n').split('\t')
if not has_ent_as_s_attr and len(pos_attrs) > 4:
2022-04-12 14:11:40 +00:00
if pos_attrs[4].lower() in ['null', 'none']:
if current_ent:
output_vrt += '</ent>\n'
current_ent = None
else:
if current_ent is None:
output_vrt += f'<ent type="{pos_attrs[4]}">\n'
current_ent = pos_attrs[4]
elif current_ent != pos_attrs[4]:
output_vrt += '</ent>\n'
current_ent = None
output_vrt += f'<ent type="{pos_attrs[4]}">\n'
current_ent = pos_attrs[4]
output_vrt += pos_attrs_to_string_function(pos_attrs)
with output_file.open(mode='w') as f:
2022-04-12 14:11:40 +00:00
f.write(output_vrt)
def _check_pos_attribute_order(vrt_lines: list[str]) -> list[str]:
# The following orders are possible:
# since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
# since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
# since 27.01.2022: 'word,pos,lemma,simple_pos'
# This Function tries to find out which order we have by looking at the
# number of attributes and the position of the simple_pos attribute
SIMPLE_POS_LABELS = [
'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'
]
for line in vrt_lines:
if line.startswith('<'):
continue
pos_attrs = line.rstrip('\n').split('\t')
num_pos_attrs = len(pos_attrs)
if num_pos_attrs == 4:
if pos_attrs[3] in SIMPLE_POS_LABELS:
return ['word', 'pos', 'lemma', 'simple_pos']
continue
elif num_pos_attrs == 5:
if pos_attrs[2] in SIMPLE_POS_LABELS:
return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
elif pos_attrs[3] in SIMPLE_POS_LABELS:
return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
continue
# TODO: raise exception "can't determine attribute order"
def _check_has_ent_as_s_attr(vrt_lines: list[str]) -> bool:
for line in vrt_lines:
if line.startswith('<ent'):
return True
return False
def _pos_attrs_to_string_1(pos_attrs: list[str]) -> str:
return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'
def _pos_attrs_to_string_2(pos_attrs: list[str]) -> str:
return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'