mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-12-25 19:04:18 +00:00
Code enhancements in vrt file normalizer module
This commit is contained in:
parent
909b130285
commit
b4faa1c695
@ -1,69 +1,25 @@
|
|||||||
from flask import current_app
|
from flask import current_app
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def normalize_vrt_file(input_file, output_file):
|
def normalize_vrt_file(input_file: Path, output_file: Path):
|
||||||
def check_pos_attribute_order(vrt_lines):
|
|
||||||
# The following orders are possible:
|
|
||||||
# since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
|
|
||||||
# since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
|
|
||||||
# since 27.01.2022: 'word,pos,lemma,simple_pos'
|
|
||||||
# This Function tries to find out which order we have by looking at the
|
|
||||||
# number of attributes and the position of the simple_pos attribute
|
|
||||||
SIMPLE_POS_LABELS = [
|
|
||||||
'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
|
|
||||||
'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
|
|
||||||
'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
|
|
||||||
'VERB', 'X'
|
|
||||||
]
|
|
||||||
for line in vrt_lines:
|
|
||||||
if line.startswith('<'):
|
|
||||||
continue
|
|
||||||
pos_attrs = line.rstrip('\n').split('\t')
|
|
||||||
num_pos_attrs = len(pos_attrs)
|
|
||||||
if num_pos_attrs == 4:
|
|
||||||
if pos_attrs[3] in SIMPLE_POS_LABELS:
|
|
||||||
return ['word', 'pos', 'lemma', 'simple_pos']
|
|
||||||
continue
|
|
||||||
elif num_pos_attrs == 5:
|
|
||||||
if pos_attrs[2] in SIMPLE_POS_LABELS:
|
|
||||||
return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
|
|
||||||
elif pos_attrs[3] in SIMPLE_POS_LABELS:
|
|
||||||
return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
|
|
||||||
continue
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def check_has_ent_as_s_attr(vrt_lines):
|
|
||||||
for line in vrt_lines:
|
|
||||||
if line.startswith('<ent'):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def pos_attrs_to_string_1(pos_attrs):
|
|
||||||
return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'
|
|
||||||
|
|
||||||
|
|
||||||
def pos_attrs_to_string_2(pos_attrs):
|
|
||||||
return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'
|
|
||||||
|
|
||||||
current_app.logger.info(f'Converting {input_file}...')
|
current_app.logger.info(f'Converting {input_file}...')
|
||||||
|
|
||||||
with open(input_file) as f:
|
with input_file.open() as f:
|
||||||
input_vrt_lines = f.readlines()
|
input_vrt_lines = f.readlines()
|
||||||
|
|
||||||
pos_attr_order = check_pos_attribute_order(input_vrt_lines)
|
pos_attr_order = _check_pos_attribute_order(input_vrt_lines)
|
||||||
has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)
|
has_ent_as_s_attr = _check_has_ent_as_s_attr(input_vrt_lines)
|
||||||
|
|
||||||
current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
|
current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
|
||||||
current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')
|
current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')
|
||||||
|
|
||||||
if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
|
if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
|
||||||
pos_attrs_to_string_function = pos_attrs_to_string_1
|
pos_attrs_to_string_function = _pos_attrs_to_string_1
|
||||||
elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']:
|
elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']:
|
||||||
pos_attrs_to_string_function = pos_attrs_to_string_2
|
pos_attrs_to_string_function = _pos_attrs_to_string_2
|
||||||
elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']:
|
elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']:
|
||||||
pos_attrs_to_string_function = pos_attrs_to_string_2
|
pos_attrs_to_string_function = _pos_attrs_to_string_2
|
||||||
else:
|
else:
|
||||||
raise Exception('Can not handle format')
|
raise Exception('Can not handle format')
|
||||||
|
|
||||||
@ -113,5 +69,49 @@ def normalize_vrt_file(input_file, output_file):
|
|||||||
current_ent = pos_attrs[4]
|
current_ent = pos_attrs[4]
|
||||||
output_vrt += pos_attrs_to_string_function(pos_attrs)
|
output_vrt += pos_attrs_to_string_function(pos_attrs)
|
||||||
|
|
||||||
with open(output_file, 'w') as f:
|
with output_file.open(mode='w') as f:
|
||||||
f.write(output_vrt)
|
f.write(output_vrt)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_pos_attribute_order(vrt_lines: list[str]) -> list[str]:
|
||||||
|
# The following orders are possible:
|
||||||
|
# since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
|
||||||
|
# since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
|
||||||
|
# since 27.01.2022: 'word,pos,lemma,simple_pos'
|
||||||
|
# This Function tries to find out which order we have by looking at the
|
||||||
|
# number of attributes and the position of the simple_pos attribute
|
||||||
|
SIMPLE_POS_LABELS = [
|
||||||
|
'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
|
||||||
|
'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'
|
||||||
|
]
|
||||||
|
for line in vrt_lines:
|
||||||
|
if line.startswith('<'):
|
||||||
|
continue
|
||||||
|
pos_attrs = line.rstrip('\n').split('\t')
|
||||||
|
num_pos_attrs = len(pos_attrs)
|
||||||
|
if num_pos_attrs == 4:
|
||||||
|
if pos_attrs[3] in SIMPLE_POS_LABELS:
|
||||||
|
return ['word', 'pos', 'lemma', 'simple_pos']
|
||||||
|
continue
|
||||||
|
elif num_pos_attrs == 5:
|
||||||
|
if pos_attrs[2] in SIMPLE_POS_LABELS:
|
||||||
|
return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
|
||||||
|
elif pos_attrs[3] in SIMPLE_POS_LABELS:
|
||||||
|
return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
|
||||||
|
continue
|
||||||
|
# TODO: raise exception "can't determine attribute order"
|
||||||
|
|
||||||
|
|
||||||
|
def _check_has_ent_as_s_attr(vrt_lines: list[str]) -> bool:
|
||||||
|
for line in vrt_lines:
|
||||||
|
if line.startswith('<ent'):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _pos_attrs_to_string_1(pos_attrs: list[str]) -> str:
|
||||||
|
return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'
|
||||||
|
|
||||||
|
|
||||||
|
def _pos_attrs_to_string_2(pos_attrs: list[str]) -> str:
|
||||||
|
return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'
|
||||||
|
Loading…
Reference in New Issue
Block a user