nopaque/app/converters/sandpaper.py

109 lines
3.8 KiB
Python
Raw Normal View History

from flask import current_app
from app import db
from app.models import User, Corpus, CorpusFile
from datetime import datetime
import json
import os
2022-04-12 16:11:40 +02:00
import shutil
def convert(json_db_file, data_dir):
with open(json_db_file, 'r') as f:
json_db = json.loads(f.read())
2022-04-12 16:11:40 +02:00
for json_user in json_db:
if not json_user['confirmed']:
current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
continue
2022-04-12 16:11:40 +02:00
user_dir = os.path.join(data_dir, str(json_user['id']))
convert_user(json_user, user_dir)
db.session.commit()
def convert_user(json_user, user_dir):
current_app.logger.info(f'Create User {json_user["username"]}...')
user = User(
confirmed=json_user['confirmed'],
email=json_user['email'],
last_seen=datetime.fromtimestamp(json_user['last_seen']),
member_since=datetime.fromtimestamp(json_user['member_since']),
password_hash=json_user['password_hash'], # TODO: Needs to be added manually
username=json_user['username']
)
db.session.add(user)
db.session.flush(objects=[user])
db.session.refresh(user)
try:
user.makedirs()
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
raise Exception('Internal Server Error')
for json_corpus in json_user['corpora'].values():
if not json_corpus['files'].values():
current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
continue
2022-04-12 16:11:40 +02:00
corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
convert_corpus(json_corpus, user, corpus_dir)
current_app.logger.info('Done')
def convert_corpus(json_corpus, user, corpus_dir):
current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
corpus = Corpus(
user=user,
creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
description=json_corpus['description'],
last_edited_date=datetime.fromtimestamp(json_corpus['last_edited_date']),
title=json_corpus['title']
)
db.session.add(corpus)
db.session.flush(objects=[corpus])
db.session.refresh(corpus)
try:
corpus.makedirs()
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
raise Exception('Internal Server Error')
for json_corpus_file in json_corpus['files'].values():
2022-04-12 16:11:40 +02:00
convert_corpus_file(json_corpus_file, corpus, corpus_dir)
current_app.logger.info('Done')
2022-04-12 16:11:40 +02:00
def convert_corpus_file(json_corpus_file, corpus, corpus_dir):
current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
corpus_file = CorpusFile(
corpus=corpus,
address=json_corpus_file['address'],
author=json_corpus_file['author'],
booktitle=json_corpus_file['booktitle'],
chapter=json_corpus_file['chapter'],
editor=json_corpus_file['editor'],
filename=json_corpus_file['filename'],
institution=json_corpus_file['institution'],
journal=json_corpus_file['journal'],
mimetype='application/vrt+xml',
pages=json_corpus_file['pages'],
publisher=json_corpus_file['publisher'],
publishing_year=json_corpus_file['publishing_year'],
school=json_corpus_file['school'],
title=json_corpus_file['title']
)
db.session.add(corpus_file)
db.session.flush(objects=[corpus_file])
db.session.refresh(corpus_file)
try:
2022-04-12 16:11:40 +02:00
shutil.copy2(
os.path.join(corpus_dir, json_corpus_file['filename']),
corpus_file.path
)
2022-04-12 16:11:40 +02:00
except:
current_app.logger.warning(
'Can not convert corpus file: '
f'{os.path.join(corpus_dir, json_corpus_file["filename"])}'
' -> '
f'{corpus_file.path}'
)
current_app.logger.info('Done')