mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-06-11 08:30:41 +00:00
Rename all services, use scss, cleanup, add sandpaper conversion script
This commit is contained in:
0
app/converters/__init__.py
Normal file
0
app/converters/__init__.py
Normal file
215
app/converters/sandpaper.py
Normal file
215
app/converters/sandpaper.py
Normal file
@ -0,0 +1,215 @@
|
||||
from flask import current_app
|
||||
from app import db
|
||||
from app.models import User, Corpus, CorpusFile
|
||||
from datetime import datetime
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
def convert(json_db_file, data_dir):
    """Import all confirmed users of a sandpaper JSON dump into the database.

    Args:
        json_db_file: Path to the JSON dump. Its top-level value is iterated
            as a collection of user records.
        data_dir: Directory containing one sub-directory per user, named
            after the user's ``id``.

    Unconfirmed users are skipped (and logged). The database session is
    committed once, after every user has been handed to ``convert_user``.
    """
    with open(json_db_file, 'r') as f:
        json_db = json.load(f)

    for json_user in json_db:
        if not json_user['confirmed']:
            current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
            continue
        # str() keeps os.path.join working when the dump stores ids as JSON
        # numbers rather than strings.
        user_dir = os.path.join(data_dir, str(json_user['id']))
        convert_user(json_user, user_dir)
    db.session.commit()
|
||||
|
||||
|
||||
def convert_user(json_user, user_dir):
    """Create a ``User`` from a sandpaper user record and convert its corpora.

    Args:
        json_user: User record from the JSON dump. The keys read here are
            ``confirmed``, ``email``, ``last_seen``, ``member_since``,
            ``password_hash``, ``username`` and ``corpora``.
        user_dir: Directory of this user in the old data layout; corpora are
            looked up below ``<user_dir>/corpora/<corpus id>``.

    Raises:
        Exception: ``'Internal Server Error'`` if ``user.makedirs()`` fails
            with an ``OSError``. The session is rolled back first and the
            original error is attached as the cause.
    """
    current_app.logger.info(f'Create User {json_user["username"]}...')
    user = User(
        confirmed=json_user['confirmed'],
        email=json_user['email'],
        last_seen=datetime.fromtimestamp(json_user['last_seen']),
        member_since=datetime.fromtimestamp(json_user['member_since']),
        password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
        username=json_user['username']
    )
    db.session.add(user)
    # Flush and refresh so database-generated values are loaded onto the
    # object before makedirs() is called.
    db.session.flush(objects=[user])
    db.session.refresh(user)
    try:
        user.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e

    for json_corpus in json_user['corpora'].values():
        if not json_corpus['files']:
            current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
            continue
        # str() keeps os.path.join working when ids are JSON numbers.
        corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
        convert_corpus(json_corpus, user, corpus_dir)
    current_app.logger.info('Done')
|
||||
|
||||
|
||||
def convert_corpus(json_corpus, user, corpus_dir):
    """Create a ``Corpus`` for ``user`` and convert all of its files.

    Args:
        json_corpus: Corpus record from the JSON dump. The keys read here
            are ``creation_date``, ``description``, ``last_edited_date``,
            ``title`` and ``files``.
        user: The already persisted ``User`` that owns the corpus.
        corpus_dir: Directory of this corpus in the old data layout; files
            are looked up below ``<corpus_dir>/files/<file id>``.

    Raises:
        Exception: ``'Internal Server Error'`` if ``corpus.makedirs()``
            fails with an ``OSError``. The session is rolled back first and
            the original error is attached as the cause.
    """
    current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
    corpus = Corpus(
        user=user,
        creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
        description=json_corpus['description'],
        last_edited_date=datetime.fromtimestamp(json_corpus['last_edited_date']),
        title=json_corpus['title']
    )
    db.session.add(corpus)
    # Flush and refresh so database-generated values are loaded onto the
    # object before makedirs() is called.
    db.session.flush(objects=[corpus])
    db.session.refresh(corpus)
    try:
        corpus.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e

    for json_corpus_file in json_corpus['files'].values():
        # str() keeps os.path.join working when ids are JSON numbers.
        corpus_file_dir = os.path.join(
            corpus_dir, 'files', str(json_corpus_file['id'])
        )
        convert_corpus_file(json_corpus_file, corpus, corpus_file_dir)
    current_app.logger.info('Done')
|
||||
|
||||
|
||||
def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
    """Create a ``CorpusFile`` for ``corpus`` and convert its VRT file.

    Args:
        json_corpus_file: Corpus file record from the JSON dump. Its
            bibliographic fields and ``filename`` are copied onto the new
            ``CorpusFile``.
        corpus: The already persisted ``Corpus`` the file belongs to.
        corpus_file_dir: Directory in the old data layout that contains the
            file named ``json_corpus_file['filename']``.

    The old file is run through ``convert_vrt`` and the result is written
    to ``corpus_file.path``.

    Raises:
        Exception: ``'Internal Server Error'`` if the conversion fails with
            an ``OSError``. The session is rolled back first and the
            original error is attached as the cause. Other exceptions from
            ``convert_vrt`` propagate unchanged.
    """
    current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
    corpus_file = CorpusFile(
        corpus=corpus,
        address=json_corpus_file['address'],
        author=json_corpus_file['author'],
        booktitle=json_corpus_file['booktitle'],
        chapter=json_corpus_file['chapter'],
        editor=json_corpus_file['editor'],
        filename=json_corpus_file['filename'],
        institution=json_corpus_file['institution'],
        journal=json_corpus_file['journal'],
        # The mimetype is set unconditionally; it is not read from the dump.
        mimetype='application/vrt+xml',
        pages=json_corpus_file['pages'],
        publisher=json_corpus_file['publisher'],
        publishing_year=json_corpus_file['publishing_year'],
        school=json_corpus_file['school'],
        title=json_corpus_file['title']
    )
    db.session.add(corpus_file)
    # Flush and refresh so database-generated values are loaded onto the
    # object before corpus_file.path is read.
    db.session.flush(objects=[corpus_file])
    db.session.refresh(corpus_file)
    try:
        convert_vrt(
            os.path.join(corpus_file_dir, json_corpus_file['filename']),
            corpus_file.path
        )
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e
    current_app.logger.info('Done')
|
||||
|
||||
|
||||
def convert_vrt(input_file, output_file):
    """Rewrite a legacy VRT file in the 'word, pos, lemma, simple_pos' layout.

    The positional attribute order of the input is detected from its token
    lines. Tokens are re-emitted with four tab-separated columns. Named
    entities are emitted as ``<ent type="...">`` regions: either passed
    through when the input already contains ``<ent`` tags, or derived from
    the fifth (``ner``) column of older files.

    Structural lines are handled as follows: ``<corpus``, ``</corpus`` and
    ``<nlp`` lines are dropped, ``<text ...>`` and ``<s ...>`` start tags
    are emitted without attributes, every other tag line is copied as is.
    Blank lines are dropped.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the converted VRT is written to.

    Raises:
        ValueError: ``'Can not handle format'`` if the attribute order
            cannot be detected from any token line.
        OSError: If reading or writing a file fails.
    """
    simple_pos_labels = frozenset((
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
        'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
        'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
        'VERB', 'X',
    ))

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This function tries to find out which order we have by looking at
        # the number of attributes and the position of the simple_pos
        # attribute. Lines that do not settle the question are skipped.
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            if len(pos_attrs) == 4:
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif len(pos_attrs) == 5:
                if pos_attrs[2] in simple_pos_labels:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        return any(line.startswith('<ent') for line in vrt_lines)

    def pos_attrs_to_string_1(pos_attrs):
        # Input order: word, lemma, simple_pos, pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Input order: word, pos, lemma, simple_pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    if pos_attr_order is None:
        # Fail with the intended error before the order is formatted below.
        raise ValueError('Can not handle format')
    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)

    print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    else:
        # Both remaining orders start with word, pos, lemma, simple_pos.
        pos_attrs_to_string_function = pos_attrs_to_string_2

    # Entities are only derived from the token columns when the input has a
    # ner column and does not already carry <ent> tags. A four column file
    # without any entity has nothing to derive.
    ents_from_ner_column = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # Any structural line ends a derived entity region.
            if current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            if line.startswith(('<corpus', '</corpus', '<nlp')):
                continue
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            else:
                output_parts.append(line)
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if ents_from_ner_column:
            ner = pos_attrs[4]
            if ner.lower() in ('null', 'none'):
                ner = None
            if ner != current_ent:
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                if ner is not None:
                    output_parts.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_parts.append(pos_attrs_to_string_function(pos_attrs))
    # Close an entity that is still open when the input ends on a token line.
    if current_ent is not None:
        output_parts.append('</ent>\n')

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))
|
Reference in New Issue
Block a user