normalize vrt on build

This commit is contained in:
Patrick Jentsch 2022-04-12 16:11:40 +02:00
parent 99ddd2e3dd
commit 4146e3789b
3 changed files with 140 additions and 123 deletions

View File

@ -4,6 +4,7 @@ from app.models import User, Corpus, CorpusFile
from datetime import datetime from datetime import datetime
import json import json
import os import os
import shutil
def convert(json_db_file, data_dir): def convert(json_db_file, data_dir):
@ -14,7 +15,7 @@ def convert(json_db_file, data_dir):
if not json_user['confirmed']: if not json_user['confirmed']:
current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}') current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
continue continue
user_dir = os.path.join(data_dir, json_user['id']) user_dir = os.path.join(data_dir, str(json_user['id']))
convert_user(json_user, user_dir) convert_user(json_user, user_dir)
db.session.commit() db.session.commit()
@ -42,7 +43,7 @@ def convert_user(json_user, user_dir):
if not json_corpus['files'].values(): if not json_corpus['files'].values():
current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}') current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
continue continue
corpus_dir = os.path.join(user_dir, 'corpora', json_corpus['id']) corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
convert_corpus(json_corpus, user, corpus_dir) convert_corpus(json_corpus, user, corpus_dir)
current_app.logger.info('Done') current_app.logger.info('Done')
@ -66,12 +67,11 @@ def convert_corpus(json_corpus, user, corpus_dir):
db.session.rollback() db.session.rollback()
raise Exception('Internal Server Error') raise Exception('Internal Server Error')
for json_corpus_file in json_corpus['files'].values(): for json_corpus_file in json_corpus['files'].values():
corpus_file_dir = os.path.join(corpus_dir, 'files', json_corpus_file['id']) convert_corpus_file(json_corpus_file, corpus, corpus_dir)
convert_corpus_file(json_corpus_file, corpus, corpus_file_dir)
current_app.logger.info('Done') current_app.logger.info('Done')
def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir): def convert_corpus_file(json_corpus_file, corpus, corpus_dir):
current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...') current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
corpus_file = CorpusFile( corpus_file = CorpusFile(
corpus=corpus, corpus=corpus,
@ -94,122 +94,15 @@ def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
db.session.flush(objects=[corpus_file]) db.session.flush(objects=[corpus_file])
db.session.refresh(corpus_file) db.session.refresh(corpus_file)
try: try:
convert_vrt( shutil.copy2(
os.path.join(corpus_file_dir, json_corpus_file['filename']), os.path.join(corpus_dir, json_corpus_file['filename']),
corpus_file.path corpus_file.path
) )
except OSError as e: except:
current_app.logger.error(e) current_app.logger.warning(
db.session.rollback() 'Can not convert corpus file: '
raise Exception('Internal Server Error') f'{os.path.join(corpus_dir, json_corpus_file["filename"])}'
' -> '
f'{corpus_file.path}'
)
current_app.logger.info('Done') current_app.logger.info('Done')
def convert_vrt(input_file, output_file):
    """Rewrite a VRT file into the 'word,pos,lemma,simple_pos' column layout.

    The input is read completely, its positional attribute order is detected
    from the first recognisable token line, and the result is written to
    ``output_file``. ``<corpus``, ``</corpus`` and ``<nlp`` lines are dropped,
    ``<text``/``<s`` opening tags lose their attributes, every other tag line
    is copied unchanged. If the file has no ``<ent`` tags but carries a
    fifth ``ner`` column, ``<ent type="...">``/``</ent>`` tags are generated
    from that column.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the converted VRT is written to.

    Raises:
        Exception: 'Can not handle format' if no known attribute order could
            be detected.
        OSError: If reading or writing one of the files fails.
    """
    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This Function tries to find out which order we have by looking at the
        # number of attributes and the position of the simple_pos attribute
        SIMPLE_POS_LABELS = [
            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
            'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
            'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
            'VERB', 'X'
        ]
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in SIMPLE_POS_LABELS:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        # True as soon as one line opens an <ent ...> tag.
        return any(line.startswith('<ent') for line in vrt_lines)

    def pos_attrs_to_string_1(pos_attrs):
        # 'word,lemma,simple_pos,pos[,ner]' -> 'word,pos,lemma,simple_pos'
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Already in target order; only a trailing ner column is cut off.
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    # Fail with the intended message *before* the order is joined for output;
    # joining None would raise an unrelated TypeError instead.
    if pos_attr_order is None:
        raise Exception('Can not handle format')
    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)

    print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    else:
        pos_attrs_to_string_function = pos_attrs_to_string_2

    # Entities are only derived from the token columns when the file has no
    # <ent> tags AND actually carries a ner column. Four-column files have no
    # index 4, so they must never reach the ner lookup.
    derive_ent_from_ner = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    # Collect the pieces and join once; repeated '+=' on a str is quadratic.
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # A generated entity never spans a structural tag.
            if current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            if line.startswith(('<corpus', '</corpus', '<nlp')):
                continue
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            else:
                output_parts.append(line)
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ent_from_ner:
            ner = pos_attrs[4]
            if ner.lower() in ['null', 'none']:
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            elif current_ent != ner:
                # Entity type changed (or a new entity starts).
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                output_parts.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_parts.append(pos_attrs_to_string_function(pos_attrs))

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))

117
app/converters/vrt.py Normal file
View File

@ -0,0 +1,117 @@
from flask import current_app
def normalize_vrt_file(input_file, output_file):
    """Normalize a VRT file to the 'word,pos,lemma,simple_pos' column layout.

    The input is read completely, its positional attribute order is detected
    from the first recognisable token line, and the result is written to
    ``output_file``. Of all tag lines only ``<text``, ``</text>``, ``<s``,
    ``</s>``, ``<ent`` and ``</ent>`` are kept (``<text``/``<s`` without their
    attributes); every other tag line is dropped. If the file has no ``<ent``
    tags but carries a fifth ``ner`` column, ``<ent type="...">``/``</ent>``
    tags are generated from that column.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the normalized VRT is written to.

    Raises:
        Exception: 'Can not handle format' if no known attribute order could
            be detected.
        OSError: If reading or writing one of the files fails.
    """
    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This Function tries to find out which order we have by looking at the
        # number of attributes and the position of the simple_pos attribute
        SIMPLE_POS_LABELS = [
            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
            'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
            'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
            'VERB', 'X'
        ]
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in SIMPLE_POS_LABELS:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        # True as soon as one line opens an <ent ...> tag.
        return any(line.startswith('<ent') for line in vrt_lines)

    def pos_attrs_to_string_1(pos_attrs):
        # 'word,lemma,simple_pos,pos[,ner]' -> 'word,pos,lemma,simple_pos'
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Already in target order; only a trailing ner column is cut off.
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    current_app.logger.info(f'Converting {input_file}...')

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    # Fail with the intended message *before* the order is joined for the log
    # line; joining None would raise an unrelated TypeError instead.
    if pos_attr_order is None:
        raise Exception('Can not handle format')
    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)

    current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    else:
        pos_attrs_to_string_function = pos_attrs_to_string_2

    # Entities are only derived from the token columns when the file has no
    # <ent> tags AND actually carries a ner column. Four-column files have no
    # index 4, so they must never reach the ner lookup.
    derive_ent_from_ner = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    multi_line_tag_definition = False
    # Collect the pieces and join once; repeated '+=' on a str is quadratic.
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # A generated entity never spans a structural tag.
            if current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            # The tag is not closed on this line, its definition continues.
            if not line.rstrip().endswith('>'):
                multi_line_tag_definition = True
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('</text>'):
                output_parts.append('</text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            elif line.startswith('</s>'):
                output_parts.append('</s>\n')
            elif line.startswith(('<ent', '</ent>')):
                output_parts.append(line)
            continue
        # Closing line of a tag definition that was spread over several lines.
        # NOTE(review): only the line ending in '>' is skipped here;
        # continuation lines in between still fall through to the token
        # handling below - confirm whether tags spanning more than two lines
        # can occur in the input.
        if multi_line_tag_definition and line.rstrip().endswith('>'):
            multi_line_tag_definition = False
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ent_from_ner:
            ner = pos_attrs[4]
            if ner.lower() in ['null', 'none']:
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            elif current_ent != ner:
                # Entity type changed (or a new entity starts).
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                output_parts.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_parts.append(pos_attrs_to_string_function(pos_attrs))

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))

View File

@ -1,3 +1,4 @@
from app.converters.vrt import normalize_vrt_file
from datetime import datetime, timedelta from datetime import datetime, timedelta
from enum import IntEnum from enum import IntEnum
from flask import current_app, url_for from flask import current_app, url_for
@ -854,7 +855,13 @@ class Corpus(HashidMixin, db.Model):
def build(self): def build(self):
corpus_element = ET.fromstring('<corpus>\n</corpus>') corpus_element = ET.fromstring('<corpus>\n</corpus>')
for corpus_file in self.files: for corpus_file in self.files:
element_tree = ET.parse(corpus_file.path) normalized_vrt_path = os.path.join(self.path, 'cwb', f'{corpus_file.id}.norm.vrt')
try:
normalize_vrt_file(corpus_file.path, normalized_vrt_path)
except:
self.status = CorpusStatus.FAILED
return
element_tree = ET.parse(normalized_vrt_path)
text_element = element_tree.getroot() text_element = element_tree.getroot()
text_element.set('address', corpus_file.address or 'NULL') text_element.set('address', corpus_file.address or 'NULL')
text_element.set('author', corpus_file.author) text_element.set('author', corpus_file.author)