mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-01-26 01:20:34 +00:00
Merge branch 'public-corpus' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into public-corpus
This commit is contained in:
commit
49ff1aa284
@ -13,6 +13,7 @@ from flask_paranoid import Paranoid
|
||||
from flask_socketio import SocketIO
|
||||
from flask_sqlalchemy import SQLAlchemy
|
||||
from flask_hashids import Hashids
|
||||
from werkzeug.exceptions import HTTPException
|
||||
|
||||
|
||||
apifairy = APIFairy()
|
||||
@ -35,7 +36,7 @@ socketio = SocketIO()
|
||||
|
||||
def create_app(config: Config = Config) -> Flask:
|
||||
''' Creates an initialized Flask (WSGI Application) object. '''
|
||||
app: Flask = Flask(__name__)
|
||||
app = Flask(__name__)
|
||||
app.config.from_object(config)
|
||||
config.init_app(app)
|
||||
docker_client.login(
|
||||
@ -57,12 +58,6 @@ def create_app(config: Config = Config) -> Flask:
|
||||
scheduler.init_app(app)
|
||||
socketio.init_app(app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI']) # noqa
|
||||
|
||||
from .errors import init_app as init_error_handlers
|
||||
init_error_handlers(app)
|
||||
|
||||
from .cli import init_app as init_cli
|
||||
init_cli(app)
|
||||
|
||||
from .admin import bp as admin_blueprint
|
||||
default_breadcrumb_root(admin_blueprint, '.admin')
|
||||
app.register_blueprint(admin_blueprint, url_prefix='/admin')
|
||||
@ -80,7 +75,10 @@ def create_app(config: Config = Config) -> Flask:
|
||||
|
||||
from .corpora import bp as corpora_blueprint
|
||||
default_breadcrumb_root(corpora_blueprint, '.corpora')
|
||||
app.register_blueprint(corpora_blueprint, url_prefix='/corpora')
|
||||
app.register_blueprint(corpora_blueprint, cli_group='corpus', url_prefix='/corpora')
|
||||
|
||||
from .errors import bp as errors_bp
|
||||
app.register_blueprint(errors_bp)
|
||||
|
||||
from .jobs import bp as jobs_blueprint
|
||||
default_breadcrumb_root(jobs_blueprint, '.jobs')
|
||||
@ -88,7 +86,7 @@ def create_app(config: Config = Config) -> Flask:
|
||||
|
||||
from .main import bp as main_blueprint
|
||||
default_breadcrumb_root(main_blueprint, '.')
|
||||
app.register_blueprint(main_blueprint)
|
||||
app.register_blueprint(main_blueprint, cli_group=None)
|
||||
|
||||
from .services import bp as services_blueprint
|
||||
default_breadcrumb_root(services_blueprint, '.services')
|
||||
|
@ -1,10 +0,0 @@
|
||||
from .converter import init_app as converter_init_app
|
||||
from .corpus import init_app as corpus_init_app
|
||||
from .main import init_app as main_init_app
|
||||
|
||||
|
||||
|
||||
def init_app(app):
    ''' Attach every CLI command set of this package to the given app. '''
    # Each imported sub-module exposes its own ``init_app``; they are run in
    # the same order as before: converter, corpus, main.
    for register in (converter_init_app, corpus_init_app, main_init_app):
        register(app)
|
@ -1,21 +0,0 @@
|
||||
import click
|
||||
|
||||
|
||||
def init_app(app):
    ''' Register the ``converter sandpaper run`` command on ``app.cli``. '''
    # NOTE: the docstrings of the nested functions are kept verbatim because
    # they double as the help text shown by the command line interface.

    @app.cli.group('converter')
    def converter():
        ''' Converter commands. '''
        # Pure container group, nothing to execute here.

    @converter.group('sandpaper')
    def sandpaper_converter():
        ''' Sandpaper converter commands. '''
        # Pure container group, nothing to execute here.

    @sandpaper_converter.command('run')
    @click.argument('json_db')
    @click.argument('data_dir')
    def run_sandpaper_converter(json_db, data_dir):
        ''' Run the sandpaper converter. '''
        # The converter is imported only when the command is invoked.
        from app.converters.sandpaper import convert
        convert(json_db, data_dir)
|
@ -1,23 +0,0 @@
|
||||
from app.models import Corpus, CorpusStatus
|
||||
|
||||
|
||||
def init_app(app):
    ''' Register the ``corpus`` command group on ``app.cli``. '''
    # NOTE: the docstrings of the nested functions are kept verbatim because
    # they double as the help text shown by the command line interface.

    @app.cli.group('corpus')
    def corpus():
        ''' Corpus commands. '''
        pass

    @corpus.command('dismantle')
    def dismantle():
        ''' Dismantle built corpora. '''
        # Function-scope import: this module's top-level imports only bring
        # in the models, and the session is needed to persist the changes.
        from app import db

        # Every status listed here is put back to SUBMITTED.
        status = [
            CorpusStatus.QUEUED,
            CorpusStatus.BUILDING,
            CorpusStatus.BUILT,
            CorpusStatus.STARTING_ANALYSIS_SESSION,
            CorpusStatus.RUNNING_ANALYSIS_SESSION,
            CorpusStatus.CANCELING_ANALYSIS_SESSION
        ]
        for affected_corpus in [x for x in Corpus.query.all() if x.status in status]:
            affected_corpus.status = CorpusStatus.SUBMITTED
            affected_corpus.num_analysis_sessions = 0
        # Without a commit the modifications above are discarded as soon as
        # the command exits.
        db.session.commit()
|
@ -1,45 +0,0 @@
|
||||
from flask import current_app
|
||||
from flask_migrate import upgrade
|
||||
import os
|
||||
from app.models import (
|
||||
CorpusFollowerRole,
|
||||
Role,
|
||||
SpaCyNLPPipelineModel,
|
||||
TesseractOCRPipelineModel,
|
||||
User
|
||||
)
|
||||
|
||||
|
||||
def init_app(app):
    ''' Register the ``deploy`` command on ``app.cli``. '''

    @app.cli.command('deploy')
    def deploy():
        ''' Run deployment tasks. '''
        # Make default directories
        print('Make default directories')
        data_root = current_app.config['NOPAQUE_DATA_DIR']
        default_dirs = [
            os.path.join(data_root, subdir) for subdir in ('tmp', 'users')
        ]
        for default_dir in default_dirs:
            if not os.path.exists(default_dir):
                os.mkdir(default_dir)
            elif not os.path.isdir(default_dir):
                # The path is taken by something that is not a directory.
                raise NotADirectoryError(f'{default_dir} is not a directory')

        # migrate database to latest revision
        print('Migrate database to latest revision')
        upgrade()

        # Insert/Update default database values, one model after the other
        default_inserts = (
            ('Insert/Update default Roles', Role),
            ('Insert/Update default Users', User),
            ('Insert/Update default CorpusFollowerRoles', CorpusFollowerRole),
            ('Insert/Update default SpaCyNLPPipelineModels', SpaCyNLPPipelineModel),
            ('Insert/Update default TesseractOCRPipelineModels', TesseractOCRPipelineModel)
        )
        for message, model in default_inserts:
            print(message)
            model.insert_defaults()
|
@ -15,7 +15,9 @@ def before_request():
|
||||
pass
|
||||
|
||||
|
||||
from . import routes
|
||||
from . import spacy_nlp_pipeline_models
|
||||
from . import tesseract_ocr_pipeline_models
|
||||
from . import transkribus_htr_pipeline_models
|
||||
from . import (
|
||||
routes,
|
||||
spacy_nlp_pipeline_models,
|
||||
tesseract_ocr_pipeline_models,
|
||||
transkribus_htr_pipeline_models
|
||||
)
|
||||
|
22
app/converters/cli.py
Normal file
22
app/converters/cli.py
Normal file
@ -0,0 +1,22 @@
|
||||
import click
|
||||
from . import bp
|
||||
from .sandpaper import SandpaperConverter
|
||||
|
||||
|
||||
@bp.cli.group('converter')
def converter():
    ''' Converter commands. '''
    # Container group only: it has no behavior of its own, sub-groups and
    # commands are attached to it further down. The docstring is the help
    # text of the group and is therefore left untouched.
|
||||
|
||||
@converter.group('sandpaper')
def sandpaper_converter():
    ''' Sandpaper converter commands. '''
    # Container group only: the actual work is done by the commands that
    # are registered on it. The docstring is the help text of the group.
|
||||
|
||||
@sandpaper_converter.command('run')
@click.argument('json_db_file')
@click.argument('data_dir')
def run_sandpaper_converter(json_db_file, data_dir):
    ''' Run the sandpaper converter. '''
    # Build the converter from the two command line arguments and start it
    # right away; no local name is needed for the instance.
    SandpaperConverter(json_db_file, data_dir).run()
|
@ -7,101 +7,106 @@ import os
|
||||
import shutil
|
||||
|
||||
|
||||
def convert(json_db_file, data_dir):
|
||||
with open(json_db_file, 'r') as f:
|
||||
json_db = json.loads(f.read())
|
||||
class SandpaperConverter:
|
||||
def __init__(self, json_db_file, data_dir):
|
||||
self.json_db_file = json_db_file
|
||||
self.data_dir = data_dir
|
||||
|
||||
for json_user in json_db:
|
||||
if not json_user['confirmed']:
|
||||
current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
|
||||
continue
|
||||
user_dir = os.path.join(data_dir, str(json_user['id']))
|
||||
convert_user(json_user, user_dir)
|
||||
db.session.commit()
|
||||
def run(self):
    '''
    Convert every confirmed user found in the JSON database file.

    The file at ``self.json_db_file`` is parsed as JSON and iterated. Users
    whose ``confirmed`` entry is falsy are logged and skipped; every other
    user is handed to ``convert_user`` together with its directory below
    ``self.data_dir``. The database session is committed once at the end.
    '''
    # Decode explicitly as UTF-8 instead of relying on the locale dependent
    # default encoding of open().
    with open(self.json_db_file, 'r', encoding='utf-8') as f:
        json_db = json.load(f)

    for json_user in json_db:
        if not json_user['confirmed']:
            current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
            continue
        # The user's files live in a directory named after the user id.
        user_dir = os.path.join(self.data_dir, str(json_user['id']))
        self.convert_user(json_user, user_dir)
    db.session.commit()
|
||||
|
||||
|
||||
def convert_user(json_user, user_dir):
    '''
    Create a User from its JSON record and convert its non-empty corpora.

    The new user is added to the session, flushed and refreshed before its
    directories are created. If creating the directories fails with an
    OSError, the error is logged, the session is rolled back and a generic
    Exception is raised. Corpora without files are logged and skipped.
    '''
    logger = current_app.logger
    logger.info(f'Create User {json_user["username"]}...')
    user_fields = dict(
        confirmed=json_user['confirmed'],
        email=json_user['email'],
        last_seen=datetime.fromtimestamp(json_user['last_seen']),
        member_since=datetime.fromtimestamp(json_user['member_since']),
        password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
        username=json_user['username']
    )
    user = User(**user_fields)
    db.session.add(user)
    db.session.flush(objects=[user])
    db.session.refresh(user)
    try:
        user.makedirs()
    except OSError as e:
        logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error')
    for json_corpus in json_user['corpora'].values():
        if json_corpus['files'].values():
            # Source files of a corpus are located below the user directory.
            corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
            convert_corpus(json_corpus, user, corpus_dir)
        else:
            logger.info(f'Skip empty corpus {json_corpus["title"]}')
    logger.info('Done')
|
||||
|
||||
|
||||
def convert_corpus(json_corpus, user, corpus_dir):
    '''
    Create a Corpus for the given user from its JSON record.

    The new corpus is added to the session, flushed and refreshed before
    its directories are created. If creating the directories fails with an
    OSError, the error is logged, the session is rolled back and a generic
    Exception is raised. Afterwards every file entry of the JSON record is
    passed on to ``convert_corpus_file``.
    '''
    logger = current_app.logger
    logger.info(f'Create Corpus {json_corpus["title"]}...')
    corpus_fields = dict(
        user=user,
        creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
        description=json_corpus['description'],
        title=json_corpus['title']
    )
    corpus = Corpus(**corpus_fields)
    db.session.add(corpus)
    db.session.flush(objects=[corpus])
    db.session.refresh(corpus)
    try:
        corpus.makedirs()
    except OSError as e:
        logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error')
    json_corpus_files = json_corpus['files'].values()
    for json_corpus_file in json_corpus_files:
        convert_corpus_file(json_corpus_file, corpus, corpus_dir)
    logger.info('Done')
|
||||
|
||||
|
||||
def convert_corpus_file(json_corpus_file, corpus, corpus_dir):
|
||||
current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
|
||||
corpus_file = CorpusFile(
|
||||
corpus=corpus,
|
||||
address=json_corpus_file['address'],
|
||||
author=json_corpus_file['author'],
|
||||
booktitle=json_corpus_file['booktitle'],
|
||||
chapter=json_corpus_file['chapter'],
|
||||
editor=json_corpus_file['editor'],
|
||||
filename=json_corpus_file['filename'],
|
||||
institution=json_corpus_file['institution'],
|
||||
journal=json_corpus_file['journal'],
|
||||
mimetype='application/vrt+xml',
|
||||
pages=json_corpus_file['pages'],
|
||||
publisher=json_corpus_file['publisher'],
|
||||
publishing_year=json_corpus_file['publishing_year'],
|
||||
school=json_corpus_file['school'],
|
||||
title=json_corpus_file['title']
|
||||
)
|
||||
db.session.add(corpus_file)
|
||||
db.session.flush(objects=[corpus_file])
|
||||
db.session.refresh(corpus_file)
|
||||
try:
|
||||
shutil.copy2(
|
||||
os.path.join(corpus_dir, json_corpus_file['filename']),
|
||||
corpus_file.path
|
||||
def convert_user(self, json_user, user_dir):
|
||||
current_app.logger.info(f'Create User {json_user["username"]}...')
|
||||
user = User(
|
||||
confirmed=json_user['confirmed'],
|
||||
email=json_user['email'],
|
||||
last_seen=datetime.fromtimestamp(json_user['last_seen']),
|
||||
member_since=datetime.fromtimestamp(json_user['member_since']),
|
||||
password_hash=json_user['password_hash'], # TODO: Needs to be added manually
|
||||
username=json_user['username']
|
||||
)
|
||||
except:
|
||||
current_app.logger.warning(
|
||||
'Can not convert corpus file: '
|
||||
f'{os.path.join(corpus_dir, json_corpus_file["filename"])}'
|
||||
' -> '
|
||||
f'{corpus_file.path}'
|
||||
db.session.add(user)
|
||||
db.session.flush(objects=[user])
|
||||
db.session.refresh(user)
|
||||
try:
|
||||
user.makedirs()
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
raise Exception('Internal Server Error')
|
||||
for json_corpus in json_user['corpora'].values():
|
||||
if not json_corpus['files'].values():
|
||||
current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
|
||||
continue
|
||||
corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
|
||||
self.convert_corpus(json_corpus, user, corpus_dir)
|
||||
current_app.logger.info('Done')
|
||||
|
||||
|
||||
def convert_corpus(self, json_corpus, user, corpus_dir):
|
||||
current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
|
||||
corpus = Corpus(
|
||||
user=user,
|
||||
creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
|
||||
description=json_corpus['description'],
|
||||
title=json_corpus['title']
|
||||
)
|
||||
current_app.logger.info('Done')
|
||||
db.session.add(corpus)
|
||||
db.session.flush(objects=[corpus])
|
||||
db.session.refresh(corpus)
|
||||
try:
|
||||
corpus.makedirs()
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
raise Exception('Internal Server Error')
|
||||
for json_corpus_file in json_corpus['files'].values():
|
||||
self.convert_corpus_file(json_corpus_file, corpus, corpus_dir)
|
||||
current_app.logger.info('Done')
|
||||
|
||||
|
||||
def convert_corpus_file(self, json_corpus_file, corpus, corpus_dir):
    '''
    Create a CorpusFile for the given corpus and copy its source file.

    The metadata is taken from ``json_corpus_file``; the mimetype is always
    set to ``application/vrt+xml``. The new record is added to the session,
    flushed and refreshed, then the file named by the record's ``filename``
    entry is copied from ``corpus_dir`` to ``corpus_file.path``.

    Copying is best effort: an OSError is logged as a warning and does not
    abort the conversion. The record stays in the session in that case.
    '''
    current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
    corpus_file = CorpusFile(
        corpus=corpus,
        address=json_corpus_file['address'],
        author=json_corpus_file['author'],
        booktitle=json_corpus_file['booktitle'],
        chapter=json_corpus_file['chapter'],
        editor=json_corpus_file['editor'],
        filename=json_corpus_file['filename'],
        institution=json_corpus_file['institution'],
        journal=json_corpus_file['journal'],
        mimetype='application/vrt+xml',
        pages=json_corpus_file['pages'],
        publisher=json_corpus_file['publisher'],
        publishing_year=json_corpus_file['publishing_year'],
        school=json_corpus_file['school'],
        title=json_corpus_file['title']
    )
    db.session.add(corpus_file)
    db.session.flush(objects=[corpus_file])
    db.session.refresh(corpus_file)
    source_path = os.path.join(corpus_dir, json_corpus_file['filename'])
    try:
        shutil.copy2(source_path, corpus_file.path)
    except OSError as e:
        # Only file system failures are tolerated here, matching the
        # ``except OSError`` handlers of the other convert methods. A bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        current_app.logger.warning(
            'Can not convert corpus file: '
            f'{source_path}'
            ' -> '
            f'{corpus_file.path}'
            f' ({e})'
        )
    current_app.logger.info('Done')
|
||||
|
@ -3,6 +3,7 @@ from flask_login import login_required
|
||||
|
||||
|
||||
bp = Blueprint('corpora', __name__)
|
||||
bp.cli.short_help = 'Corpus commands.'
|
||||
|
||||
|
||||
@bp.before_request
|
||||
@ -15,6 +16,4 @@ def before_request():
|
||||
pass
|
||||
|
||||
|
||||
from . import cqi_over_socketio, routes, json_routes
|
||||
from . import files
|
||||
from . import followers
|
||||
from . import cli, cqi_over_socketio, files, followers, routes, json_routes
|
||||
|
24
app/corpora/cli.py
Normal file
24
app/corpora/cli.py
Normal file
@ -0,0 +1,24 @@
|
||||
from app.models import Corpus, CorpusStatus
|
||||
import os
|
||||
import shutil
|
||||
from app import db
|
||||
from . import bp
|
||||
|
||||
|
||||
@bp.cli.command('reset')
def reset():
    ''' Reset built corpora. '''
    # Corpora in one of these states are put back to UNPREPARED. The
    # docstring above is the help text of the command and is kept verbatim.
    resettable_status = (
        CorpusStatus.QUEUED,
        CorpusStatus.BUILDING,
        CorpusStatus.BUILT,
        CorpusStatus.STARTING_ANALYSIS_SESSION,
        CorpusStatus.RUNNING_ANALYSIS_SESSION,
        CorpusStatus.CANCELING_ANALYSIS_SESSION
    )
    for corpus in Corpus.query.all():
        if corpus.status not in resettable_status:
            continue
        print(f'Resetting corpus {corpus}')
        # Remove the 'cwb' directory of the corpus; a missing directory or
        # any other removal error is ignored.
        cwb_dir = os.path.join(corpus.path, 'cwb')
        shutil.rmtree(cwb_dir, ignore_errors=True)
        corpus.status = CorpusStatus.UNPREPARED
        corpus.num_analysis_sessions = 0
    db.session.commit()
|
@ -1,6 +1,5 @@
|
||||
from werkzeug.exceptions import HTTPException
|
||||
from .handlers import generic
|
||||
from flask import Blueprint
|
||||
|
||||
|
||||
def init_app(app):
|
||||
app.register_error_handler(HTTPException, generic)
|
||||
bp = Blueprint('errors', __name__)
|
||||
from . import handlers
|
||||
|
@ -1,13 +1,14 @@
|
||||
from flask import jsonify, render_template, request, Response
|
||||
from flask import jsonify, render_template, request
|
||||
from werkzeug.exceptions import HTTPException
|
||||
from typing import Tuple, Union
|
||||
from . import bp
|
||||
|
||||
|
||||
def generic(error: HTTPException) -> Tuple[Union[str, Response], int]:
|
||||
''' Generic error handler '''
|
||||
accent_json: bool = request.accept_mimetypes.accept_json
|
||||
accept_html: bool = request.accept_mimetypes.accept_html
|
||||
if accent_json and not accept_html:
|
||||
response: Response = jsonify(str(error))
|
||||
@bp.app_errorhandler(HTTPException)
def handle_http_exception(error):
    '''
    Generic HTTP exception handler

    Answers with a JSON body when the client accepts JSON but not HTML and
    with the rendered error template otherwise. The status code is taken
    from the exception in both cases.
    '''
    accepted = request.accept_mimetypes
    json_only = accepted.accept_json and not accepted.accept_html
    if not json_only:
        return render_template('errors/error.html.j2', error=error), error.code
    return jsonify(str(error)), error.code
|
||||
|
@ -2,4 +2,4 @@ from flask import Blueprint
|
||||
|
||||
|
||||
bp = Blueprint('main', __name__, cli_group=None)
|
||||
from . import routes
|
||||
from . import cli, routes
|
||||
|
45
app/main/cli.py
Normal file
45
app/main/cli.py
Normal file
@ -0,0 +1,45 @@
|
||||
from flask import current_app
|
||||
from flask_migrate import upgrade
|
||||
import os
|
||||
from app.models import (
|
||||
CorpusFollowerRole,
|
||||
Role,
|
||||
SpaCyNLPPipelineModel,
|
||||
TesseractOCRPipelineModel,
|
||||
User
|
||||
)
|
||||
from . import bp
|
||||
|
||||
|
||||
@bp.cli.command('deploy')
def deploy():
    ''' Run deployment tasks. '''
    # Make default directories
    print('Make default directories')
    data_root = current_app.config['NOPAQUE_DATA_DIR']
    default_dirs = [
        os.path.join(data_root, subdir) for subdir in ('tmp', 'users')
    ]
    for default_dir in default_dirs:
        if not os.path.exists(default_dir):
            os.mkdir(default_dir)
        elif not os.path.isdir(default_dir):
            # The path is taken by something that is not a directory.
            raise NotADirectoryError(f'{default_dir} is not a directory')

    # migrate database to latest revision
    print('Migrate database to latest revision')
    upgrade()

    # Insert/Update default database values, one model after the other
    default_inserts = (
        ('Insert/Update default Roles', Role),
        ('Insert/Update default Users', User),
        ('Insert/Update default CorpusFollowerRoles', CorpusFollowerRole),
        ('Insert/Update default SpaCyNLPPipelineModels', SpaCyNLPPipelineModel),
        ('Insert/Update default TesseractOCRPipelineModels', TesseractOCRPipelineModel)
    )
    for message, model in default_inserts:
        print(message)
        model.insert_defaults()
|
@ -1,5 +0,0 @@
|
||||
from flask import Blueprint
|
||||
|
||||
|
||||
bp = Blueprint('tests', __name__)
|
||||
from . import cli
|
@ -15,5 +15,4 @@ def before_request():
|
||||
pass
|
||||
|
||||
|
||||
from . import events, json_routes, routes
|
||||
from . import settings
|
||||
from . import events, json_routes, routes, settings
|
||||
|
Loading…
x
Reference in New Issue
Block a user