"""Import users, corpora and corpus files from a JSON database dump."""

from datetime import datetime
import json
import os

from flask import current_app

from app import db
from app.models import User, Corpus, CorpusFile


def convert(json_db_file, data_dir):
    """Create database records for every confirmed user in a JSON dump.

    Args:
        json_db_file: Path of a JSON file holding a list of user objects.
        data_dir: Directory containing one sub-directory per user, named
            after the user's ``id`` value in the dump.

    The session is committed once, after all users have been converted.
    """
    # JSON interchange files are UTF-8 (RFC 8259), so do not depend on the
    # locale's default encoding here.
    with open(json_db_file, 'r', encoding='utf-8') as f:
        json_db = json.load(f)

    for json_user in json_db:
        if not json_user['confirmed']:
            current_app.logger.info(
                f'Skip unconfirmed user {json_user["username"]}'
            )
            continue
        user_dir = os.path.join(data_dir, json_user['id'])
        convert_user(json_user, user_dir)
    db.session.commit()


def convert_user(json_user, user_dir):
    """Create a ``User`` from ``json_user`` and convert its corpora.

    Args:
        json_user: User object from the JSON dump.
        user_dir: Directory holding this user's data in the dump.

    Raises:
        Exception: ``'Internal Server Error'`` if ``user.makedirs()`` fails
            with an ``OSError``; the session is rolled back first.
    """
    current_app.logger.info(f'Create User {json_user["username"]}...')
    user = User(
        confirmed=json_user['confirmed'],
        email=json_user['email'],
        last_seen=datetime.fromtimestamp(json_user['last_seen']),
        member_since=datetime.fromtimestamp(json_user['member_since']),
        password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
        username=json_user['username']
    )
    db.session.add(user)
    # Flush and refresh before makedirs() so database-generated values are
    # loaded on the object.
    db.session.flush(objects=[user])
    db.session.refresh(user)
    try:
        user.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e

    for json_corpus in json_user['corpora'].values():
        if not json_corpus['files']:
            current_app.logger.info(
                f'Skip empty corpus {json_corpus["title"]}'
            )
            continue
        corpus_dir = os.path.join(user_dir, 'corpora', json_corpus['id'])
        convert_corpus(json_corpus, user, corpus_dir)
    current_app.logger.info('Done')


def convert_corpus(json_corpus, user, corpus_dir):
    """Create a ``Corpus`` owned by ``user`` and convert its files.

    Args:
        json_corpus: Corpus object from the JSON dump.
        user: The ``User`` the new corpus is attached to.
        corpus_dir: Directory holding this corpus' data in the dump.

    Raises:
        Exception: ``'Internal Server Error'`` if ``corpus.makedirs()`` fails
            with an ``OSError``; the session is rolled back first.
    """
    current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
    corpus = Corpus(
        user=user,
        creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
        description=json_corpus['description'],
        last_edited_date=datetime.fromtimestamp(
            json_corpus['last_edited_date']
        ),
        title=json_corpus['title']
    )
    db.session.add(corpus)
    db.session.flush(objects=[corpus])
    db.session.refresh(corpus)
    try:
        corpus.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e

    for json_corpus_file in json_corpus['files'].values():
        corpus_file_dir = os.path.join(
            corpus_dir, 'files', json_corpus_file['id']
        )
        convert_corpus_file(json_corpus_file, corpus, corpus_file_dir)
    current_app.logger.info('Done')


def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
    """Create a ``CorpusFile`` in ``corpus`` and convert its VRT file.

    Args:
        json_corpus_file: Corpus file object from the JSON dump.
        corpus: The ``Corpus`` the new file is attached to.
        corpus_file_dir: Directory containing the source file, whose name is
            ``json_corpus_file['filename']``.

    Raises:
        Exception: ``'Internal Server Error'`` if ``convert_vrt`` fails with
            an ``OSError``; the session is rolled back first.
    """
    current_app.logger.info(
        f'Create CorpusFile {json_corpus_file["title"]}...'
    )
    corpus_file = CorpusFile(
        corpus=corpus,
        address=json_corpus_file['address'],
        author=json_corpus_file['author'],
        booktitle=json_corpus_file['booktitle'],
        chapter=json_corpus_file['chapter'],
        editor=json_corpus_file['editor'],
        filename=json_corpus_file['filename'],
        institution=json_corpus_file['institution'],
        journal=json_corpus_file['journal'],
        mimetype='application/vrt+xml',
        pages=json_corpus_file['pages'],
        publisher=json_corpus_file['publisher'],
        publishing_year=json_corpus_file['publishing_year'],
        school=json_corpus_file['school'],
        title=json_corpus_file['title']
    )
    db.session.add(corpus_file)
    db.session.flush(objects=[corpus_file])
    db.session.refresh(corpus_file)
    try:
        convert_vrt(
            os.path.join(corpus_file_dir, json_corpus_file['filename']),
            corpus_file.path
        )
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e
    current_app.logger.info('Done')


def convert_vrt(input_file, output_file):
    """Read the VRT file ``input_file`` and write a converted copy.

    The positional attribute order of the input is detected from its token
    lines; tokens are then written to ``output_file`` one per line.

    NOTE(review): in the reviewed copy of this file every string literal
    that contained an XML-like tag had been stripped, which removed the
    middle of this function (from the body of ``check_has_ent_as_s_attr``
    up to the first entity start tag being written). Everything marked
    RECONSTRUCTED below was rebuilt from the surviving head and tail and
    MUST be compared with the version-controlled original before use.

    Raises:
        ValueError: If the positional attribute order cannot be detected
            (RECONSTRUCTED behaviour).
        OSError: If reading ``input_file`` or writing ``output_file`` fails.
    """
    # RECONSTRUCTED: presumably tokens are written in the newest attribute
    # order listed in check_pos_attribute_order() - TODO confirm.
    output_order = ('word', 'pos', 'lemma', 'simple_pos')

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This function tries to find out which order we have by looking at
        # the number of attributes and the position of the simple_pos
        # attribute. Returns None when no token line settles the question.
        simple_pos_labels = frozenset({
            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN',
            'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',
            'X'
        })
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in simple_pos_labels:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        # RECONSTRUCTED: only "for line in vrt_lines: if line.startswith('"
        # survived. Presumably reports whether entities are already encoded
        # as structural <ent ...> tags - TODO confirm the tag name.
        return any(line.startswith('<ent') for line in vrt_lines)

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    # --- RECONSTRUCTED section starts here --------------------------------
    order = check_pos_attribute_order(input_vrt_lines)
    if order is None:
        raise ValueError(
            f'Can not detect positional attribute order of {input_file}'
        )
    column_indices = [order.index(name) for name in output_order]

    def pos_attrs_to_string_function(pos_attrs):
        # Serialise one token with its columns in output_order.
        return '\t'.join(pos_attrs[i] for i in column_indices) + '\n'

    # Entity tags are derived from the ner column only when the input has
    # such a column and does not already carry <ent> tags.
    derive_ents_from_ner = (
        'ner' in order and not check_has_ent_as_s_attr(input_vrt_lines)
    )

    current_ent = None
    # Collect chunks and join once; repeated "+=" on a str is quadratic.
    output_parts = []
    for line in input_vrt_lines:
        if not line.strip():
            continue
        if line.startswith('<'):
            # An entity derived from the ner column must not span a
            # structural tag, so close it first.
            if derive_ents_from_ner and current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            # NOTE(review): which structural tags were kept, normalised or
            # dropped is not recoverable from the reviewed copy; this
            # mapping is an assumption - verify against the original.
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('</text'):
                output_parts.append('</text>\n')
            elif line.startswith(('<ent', '</ent')):
                output_parts.append(line)
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            elif line.startswith('</s'):
                output_parts.append('</s>\n')
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ents_from_ner:
            # NOTE(review): the marker(s) used for "no entity" in the ner
            # column are assumed - confirm.
            if pos_attrs[4].lower() in ('null', 'none'):
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            elif current_ent is None:
                output_parts.append(f'<ent type="{pos_attrs[4]}">\n')
                current_ent = pos_attrs[4]
            # --- RECONSTRUCTED section ends here; the control flow below
            # survived in the reviewed copy, but its tag literals were
            # stripped and have been refilled.
            elif current_ent != pos_attrs[4]:
                output_parts.append('</ent>\n')
                output_parts.append(f'<ent type="{pos_attrs[4]}">\n')
                current_ent = pos_attrs[4]
        output_parts.append(pos_attrs_to_string_function(pos_attrs))

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))