2022-04-04 13:31:09 +02:00
|
|
|
from datetime import datetime
|
2024-04-10 13:34:48 +02:00
|
|
|
from flask import current_app
|
2024-03-07 15:49:04 +01:00
|
|
|
from pathlib import Path
|
|
|
|
from typing import Dict, List
|
2022-04-04 13:31:09 +02:00
|
|
|
import json
|
2022-04-12 16:11:40 +02:00
|
|
|
import shutil
|
2024-04-10 13:34:48 +02:00
|
|
|
from app import db
|
|
|
|
from app.models import User, Corpus, CorpusFile
|
2022-04-04 13:31:09 +02:00
|
|
|
|
|
|
|
|
2023-05-15 12:00:13 +02:00
|
|
|
class SandpaperConverter:
    """Import users, corpora and corpus files from a JSON database dump.

    The converter reads a JSON file holding a list of user records and,
    for every confirmed user, creates the matching ``User``, ``Corpus``
    and ``CorpusFile`` database objects.  Corpus files are copied from
    ``data_dir/<user id>/corpora/<corpus id>/<filename>`` to the path
    reported by the newly created ``CorpusFile``.
    """

    def __init__(self, json_db_file: Path, data_dir: Path):
        """Store the locations of the JSON dump and the legacy data tree.

        Args:
            json_db_file: Path of the JSON file containing the user list.
            data_dir: Directory containing one sub-directory per user id.
        """
        self.json_db_file = json_db_file
        self.data_dir = data_dir

    def run(self):
        """Convert every confirmed user of the dump and commit the session.

        Unconfirmed users are skipped (and logged).  The database session
        is committed once, after all users have been processed.
        """
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with self.json_db_file.open('r', encoding='utf-8') as f:
            json_db: List[Dict] = json.load(f)

        for json_user in json_db:
            if not json_user['confirmed']:
                current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
                continue
            user_dir = self.data_dir / f'{json_user["id"]}'
            self.convert_user(json_user, user_dir)
        db.session.commit()

    def convert_user(self, json_user: Dict, user_dir: Path):
        """Create a ``User`` from *json_user* and convert its corpora.

        Args:
            json_user: One user record of the JSON dump.  The keys read
                here are ``confirmed``, ``email``, ``last_seen``,
                ``member_since``, ``password_hash``, ``username`` and
                ``corpora`` (a mapping whose values are corpus records).
            user_dir: Legacy data directory of this user.

        Raises:
            Exception: If ``User.create`` fails with an ``OSError``; the
                original error is attached as ``__cause__``.
        """
        current_app.logger.info(f'Create User {json_user["username"]}...')
        try:
            user = User.create(
                confirmed=json_user['confirmed'],
                email=json_user['email'],
                last_seen=datetime.fromtimestamp(json_user['last_seen']),
                member_since=datetime.fromtimestamp(json_user['member_since']),
                password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
                username=json_user['username']
            )
        except OSError as e:
            # Keep the exception type and message, but preserve the cause.
            raise Exception('Internal Server Error') from e
        for json_corpus in json_user['corpora'].values():
            if not json_corpus['files']:
                current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
                continue
            corpus_dir = user_dir / 'corpora' / f'{json_corpus["id"]}'
            self.convert_corpus(json_corpus, user, corpus_dir)
        current_app.logger.info('Done')

    def convert_corpus(self, json_corpus: Dict, user: User, corpus_dir: Path):
        """Create a ``Corpus`` owned by *user* and convert its files.

        Args:
            json_corpus: One corpus record.  The keys read here are
                ``creation_date``, ``description``, ``title`` and
                ``files`` (a mapping whose values are file records).
            user: Owner of the new corpus.
            corpus_dir: Legacy directory holding the corpus' files.

        Raises:
            Exception: If ``Corpus.create`` fails with an ``OSError``; the
                original error is attached as ``__cause__``.
        """
        current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
        try:
            corpus = Corpus.create(
                user=user,
                creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
                description=json_corpus['description'],
                title=json_corpus['title']
            )
        except OSError as e:
            # Keep the exception type and message, but preserve the cause.
            raise Exception('Internal Server Error') from e
        for json_corpus_file in json_corpus['files'].values():
            self.convert_corpus_file(json_corpus_file, corpus, corpus_dir)
        current_app.logger.info('Done')

    def convert_corpus_file(self, json_corpus_file: Dict, corpus: Corpus, corpus_dir: Path):
        """Create a ``CorpusFile`` for *corpus* and copy its file into place.

        The new object is added to the session, flushed and refreshed
        before ``corpus_file.path`` is used as the copy destination.
        Copying is best effort: a failure is logged as a warning (with
        traceback) and does not abort the conversion.

        Args:
            json_corpus_file: One file record holding the bibliographic
                fields and the ``filename`` of the legacy file.
            corpus: Corpus the file belongs to.
            corpus_dir: Legacy directory containing ``filename``.
        """
        current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
        corpus_file = CorpusFile(
            corpus=corpus,
            address=json_corpus_file['address'],
            author=json_corpus_file['author'],
            booktitle=json_corpus_file['booktitle'],
            chapter=json_corpus_file['chapter'],
            editor=json_corpus_file['editor'],
            filename=json_corpus_file['filename'],
            institution=json_corpus_file['institution'],
            journal=json_corpus_file['journal'],
            mimetype='application/vrt+xml',
            pages=json_corpus_file['pages'],
            publisher=json_corpus_file['publisher'],
            publishing_year=json_corpus_file['publishing_year'],
            school=json_corpus_file['school'],
            title=json_corpus_file['title']
        )
        db.session.add(corpus_file)
        db.session.flush(objects=[corpus_file])
        db.session.refresh(corpus_file)
        source_path = corpus_dir / json_corpus_file['filename']
        try:
            shutil.copy2(source_path, corpus_file.path)
        except Exception:
            # Deliberately best effort, but never swallow KeyboardInterrupt
            # or SystemExit, and keep the traceback for diagnosis.
            current_app.logger.warning(
                'Can not convert corpus file: '
                f'{source_path}'
                ' -> '
                f'{corpus_file.path}',
                exc_info=True
            )
        current_app.logger.info('Done')
|