from datetime import datetime
from pathlib import Path
from typing import Dict, List
import json
import shutil

from flask import current_app

from app import db
from app.models import User, Corpus, CorpusFile


class SandpaperConverter:
    """Import users, corpora and corpus files from a JSON dump.

    The converter reads a list of user records from ``json_db_file`` and
    creates the matching ``User``, ``Corpus`` and ``CorpusFile`` objects.
    The corpus files themselves are copied out of ``data_dir``, which is
    expected to be laid out as::

        <data_dir>/<user id>/corpora/<corpus id>/<filename>
    """

    def __init__(self, json_db_file: Path, data_dir: Path):
        """Remember where the JSON dump and the file tree are located.

        Args:
            json_db_file: Path of the JSON file holding the user records.
            data_dir: Root directory containing one sub-directory per user id.
        """
        self.json_db_file = json_db_file
        self.data_dir = data_dir

    def run(self) -> None:
        """Convert every confirmed user of the dump and commit the session.

        Unconfirmed users are skipped (and logged). The database session is
        committed once, after all users have been processed.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with self.json_db_file.open('r', encoding='utf-8') as f:
            json_db: List[Dict] = json.load(f)

        for json_user in json_db:
            if not json_user['confirmed']:
                current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
                continue
            user_dir = self.data_dir / f'{json_user["id"]}'
            self.convert_user(json_user, user_dir)
        db.session.commit()

    def convert_user(self, json_user: Dict, user_dir: Path) -> None:
        """Create a ``User`` from ``json_user`` and convert its corpora.

        Corpora without any files are skipped (and logged).

        Args:
            json_user: One user record of the JSON dump.
            user_dir: Directory holding this user's files.

        Raises:
            Exception: If ``User.create`` fails with an ``OSError``; the
                original error is attached as ``__cause__``.
        """
        current_app.logger.info(f'Create User {json_user["username"]}...')
        try:
            user = User.create(
                confirmed=json_user['confirmed'],
                email=json_user['email'],
                last_seen=datetime.fromtimestamp(json_user['last_seen']),
                member_since=datetime.fromtimestamp(json_user['member_since']),
                password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
                username=json_user['username']
            )
        except OSError as e:
            # Keep the generic error callers may rely on, but chain the cause
            # so the underlying failure is visible in the traceback.
            raise Exception('Internal Server Error') from e

        for json_corpus in json_user['corpora'].values():
            if not json_corpus['files']:
                current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
                continue
            corpus_dir = user_dir / 'corpora' / f'{json_corpus["id"]}'
            self.convert_corpus(json_corpus, user, corpus_dir)
        current_app.logger.info('Done')

    def convert_corpus(self, json_corpus: Dict, user: User, corpus_dir: Path) -> None:
        """Create a ``Corpus`` owned by ``user`` and convert all of its files.

        Args:
            json_corpus: One corpus record taken from a user's ``corpora``.
            user: The already created owner of the corpus.
            corpus_dir: Directory holding the corpus' source files.

        Raises:
            Exception: If ``Corpus.create`` fails with an ``OSError``; the
                original error is attached as ``__cause__``.
        """
        current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
        try:
            corpus = Corpus.create(
                user=user,
                creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
                description=json_corpus['description'],
                title=json_corpus['title']
            )
        except OSError as e:
            raise Exception('Internal Server Error') from e

        for json_corpus_file in json_corpus['files'].values():
            self.convert_corpus_file(json_corpus_file, corpus, corpus_dir)
        current_app.logger.info('Done')

    def convert_corpus_file(self, json_corpus_file: Dict, corpus: Corpus, corpus_dir: Path) -> None:
        """Create a ``CorpusFile`` and copy its source file into place.

        The copy is best effort: if it fails, a warning (including the
        traceback) is logged and the conversion continues.

        Args:
            json_corpus_file: One file record taken from a corpus' ``files``.
            corpus: The already created corpus the file belongs to.
            corpus_dir: Directory in which the source file is looked up by
                its ``filename``.
        """
        current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
        corpus_file = CorpusFile(
            corpus=corpus,
            address=json_corpus_file['address'],
            author=json_corpus_file['author'],
            booktitle=json_corpus_file['booktitle'],
            chapter=json_corpus_file['chapter'],
            editor=json_corpus_file['editor'],
            filename=json_corpus_file['filename'],
            institution=json_corpus_file['institution'],
            journal=json_corpus_file['journal'],
            mimetype='application/vrt+xml',
            pages=json_corpus_file['pages'],
            publisher=json_corpus_file['publisher'],
            publishing_year=json_corpus_file['publishing_year'],
            school=json_corpus_file['school'],
            title=json_corpus_file['title']
        )
        db.session.add(corpus_file)
        # Flush and refresh before reading ``corpus_file.path`` below.
        # NOTE(review): presumably the path depends on database-generated
        # values (e.g. the primary key) - confirm against the model.
        db.session.flush(objects=[corpus_file])
        db.session.refresh(corpus_file)
        try:
            shutil.copy2(
                corpus_dir / json_corpus_file['filename'],
                corpus_file.path
            )
        except Exception:
            # Deliberately best effort, but do not swallow KeyboardInterrupt
            # or SystemExit, and keep the reason for the failure in the log.
            current_app.logger.warning(
                'Can not convert corpus file: '
                f'{corpus_dir / json_corpus_file["filename"]}'
                ' -> '
                f'{corpus_file.path}',
                exc_info=True
            )
        current_app.logger.info('Done')