nopaque/app/corpora/routes.py

353 lines
13 KiB
Python
Raw Normal View History

from app import db
from app.models import Corpus, CorpusFile, CorpusStatus
from flask import (
abort,
current_app,
flash,
make_response,
redirect,
render_template,
url_for,
send_from_directory
)
2020-04-06 14:12:22 +02:00
from flask_login import current_user, login_required
from werkzeug.utils import secure_filename
from zipfile import ZipFile
from . import bp
from . import tasks
from .forms import (
AddCorpusFileForm,
AddCorpusForm,
EditCorpusFileForm,
ImportCorpusForm
)
2020-04-06 14:12:22 +02:00
import os
import shutil
import tempfile
import glob
import xml.etree.ElementTree as ET
2020-04-06 14:12:22 +02:00
@bp.route('/add', methods=['GET', 'POST'])
@login_required
def add_corpus():
    """Render the "add corpus" form and create a corpus when it validates.

    On a valid submission a ``Corpus`` owned by the current user is added
    to the database session and its directories are created through
    ``Corpus.makedirs``. If that raises ``OSError``, the error is logged,
    the session is rolled back and the request is aborted with 500.

    Returns:
        A redirect to the new corpus page after a successful submission,
        otherwise the rendered form page.
    """
    form = AddCorpusForm(prefix='add-corpus-form')
    if not form.validate_on_submit():
        return render_template(
            'corpora/add_corpus.html.j2',
            form=form,
            title='Add corpus'
        )
    new_corpus = Corpus(
        user=current_user,
        description=form.description.data,
        title=form.title.data
    )
    db.session.add(new_corpus)
    # Flush and refresh before touching the file system, so the object
    # carries its database-generated state
    db.session.flush()
    db.session.refresh(new_corpus)
    try:
        new_corpus.makedirs()
    except OSError as error:
        current_app.logger.error(error)
        db.session.rollback()
        flash('Internal Server Error', category='error')
        abort(500)
    db.session.commit()
    flash(f'Corpus "{new_corpus.title}" added', category='corpus')
    return redirect(url_for('.corpus', corpus_id=new_corpus.id))
2020-04-06 14:12:22 +02:00
@bp.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
    """Import a corpus from an uploaded archive of VRT files.

    When no form was submitted, the import page is rendered. Otherwise the
    form is validated, a new ``Corpus`` owned by the current user is
    created and the uploaded archive is unpacked into a temporary
    directory. Every ``*.vrt`` file at the top level of the unpacked
    archive becomes a ``CorpusFile`` whose metadata is read from the
    attributes of the ``text`` element returned by ``ElementTree.find``.

    Returns:
        The rendered import page, or a response with status

        * 400 and the form errors if validation failed,
        * 500 and a ``redirect_url`` if creating directories, saving the
          archive or copying a VRT file failed,
        * 201 and a ``redirect_url`` to the new corpus on success.
    """
    form = ImportCorpusForm(prefix='import-corpus-form')
    if not form.is_submitted():
        return render_template(
            'corpora/import_corpus.html.j2',
            form=form,
            title='Import Corpus'
        )
    if not form.validate():
        return make_response(form.errors, 400)
    corpus = Corpus(
        user=current_user,
        description=form.description.data,
        title=form.title.data
    )
    db.session.add(corpus)
    db.session.flush(objects=[corpus])
    db.session.refresh(corpus)
    try:
        corpus.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        flash('Internal Server Error', category='error')
        return make_response({'redirect_url': url_for('.import_corpus')}, 500)  # noqa
    # Metadata attributes a <text> element may carry; they are copied to
    # the corpus file only when they are actually present.
    optional_attributes = (
        'address',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'school'
    )
    # Save the uploaded zip file in a temporary directory
    tmp_dir_base = os.path.join(current_app.config['NOPAQUE_DATA_DIR'], 'tmp')  # noqa
    with tempfile.TemporaryDirectory(dir=tmp_dir_base) as tmp_dir:
        archive_file = os.path.join(tmp_dir, 'corpus.zip')
        try:
            form.archive.data.save(archive_file)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error1', category='error')
            return make_response({'redirect_url': url_for('.import_corpus')}, 500)  # noqa
        # NOTE(review): the archive and the XML in it are user-supplied.
        # An unreadable archive, a VRT file without a <text> element or
        # without a numeric publishing_year raises below and is not
        # handled by this view. xml.etree.ElementTree is also documented
        # as not hardened against maliciously constructed XML.
        shutil.unpack_archive(archive_file, extract_dir=tmp_dir)
        for vrt_filename in os.listdir(tmp_dir):
            if not vrt_filename.endswith('.vrt'):
                continue
            vrt_file = os.path.join(tmp_dir, vrt_filename)
            element_tree = ET.parse(vrt_file)
            text_node = element_tree.find('text')
            corpus_file = CorpusFile(
                author=text_node.get('author'),
                corpus=corpus,
                filename=vrt_filename,
                mimetype='application/vrt+xml',
                publishing_year=int(text_node.get('publishing_year')),
                title=text_node.get('title')
            )
            for attribute in optional_attributes:
                # Fix: the check used to be inverted ("not in"), so
                # metadata present in the file was never imported.
                if attribute in text_node.attrib:
                    setattr(corpus_file, attribute, text_node.get(attribute))
            db.session.add(corpus_file)
            db.session.flush(objects=[corpus_file])
            # Refresh the object whose path is used next, as
            # add_corpus_file does
            db.session.refresh(corpus_file)
            try:
                shutil.copy2(vrt_file, corpus_file.path)
            except Exception as e:
                # Log the cause like the other handlers of this view do
                current_app.logger.error(e)
                db.session.rollback()
                flash('Internal Server Error2', category='error')
                return make_response({'redirect_url': url_for('.import_corpus')}, 500)  # noqa
    db.session.commit()
    flash(f'Corpus "{corpus.title}" imported', 'corpus')
    return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)  # noqa
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>')
@login_required
def corpus(corpus_id):
    """Render the page of a single corpus.

    Responds with 404 if no corpus has the given id and with 403 if the
    current user is neither the corpus owner nor an administrator.
    """
    requested_corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = requested_corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    return render_template(
        'corpora/corpus.html.j2',
        corpus=requested_corpus,
        title='Corpus'
    )
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/analyse')
@login_required
def analyse_corpus(corpus_id):
    """Render the analysis page of a corpus.

    Responds with 404 if no corpus has the given id and with 403 if the
    current user is neither the corpus owner nor an administrator.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    # Same access rule as every other corpus route in this module; it was
    # missing here, which let any logged-in user open any corpus.
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return render_template(
        'corpora/analyse_corpus.html.j2',
        corpus=corpus,
        title=f'Analyse Corpus {corpus.title}'
    )
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Hand a corpus to ``tasks.build_corpus`` if it has any files.

    Responds with 404 if no corpus has the given id and with 403 if the
    current user is neither the corpus owner nor an administrator. A
    corpus without files is not handed over; an error message is flashed
    instead. In both cases the user is redirected to the corpus page.
    """
    target_corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = target_corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    if not target_corpus.files.all():
        flash(
            f'Can\'t build corpus "{target_corpus.title}": No corpus file(s)',
            category='error'
        )
    else:
        tasks.build_corpus(corpus_id)
        flash(
            f'Corpus "{target_corpus.title}" marked for building',
            category='corpus'
        )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/delete')
@login_required
def delete_corpus(corpus_id):
    """Hand a corpus to ``tasks.delete_corpus`` and go to the dashboard.

    Responds with 404 if no corpus has the given id and with 403 if the
    current user is neither the corpus owner nor an administrator.
    """
    doomed_corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = doomed_corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    # Build the message before the deletion task is invoked, as the title
    # is read from the corpus object
    message = f'Corpus "{doomed_corpus.title}" marked for deletion'
    flash(message, category='corpus')
    tasks.delete_corpus(corpus_id)
    return redirect(url_for('main.dashboard'))
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send the archive file of a corpus as a download.

    The view aborts with 503 before doing anything else, so the code
    after the ``abort`` call is currently not reached.
    """
    # Unconditional abort: everything below is unreachable
    abort(503)
    exported_corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = exported_corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    corpora_dir = os.path.join(exported_corpus.user.path, 'corpora')
    return send_from_directory(
        directory=corpora_dir,
        filename=exported_corpus.archive_file,
        mimetype='zip',
        as_attachment=True
    )
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the edit form for the metadata of a corpus file.

    Responds with 404 if the corpus has no file with the given id and
    with 403 if the current user is neither the corpus owner nor an
    administrator. A valid submission copies the form values to the
    corpus file, sets the corpus status to ``CorpusStatus.UNPREPARED``,
    commits and redirects to the corpus page. Otherwise the form is
    filled with the stored values and rendered.
    """
    # Metadata fields that exist under the same name on the form and on
    # the CorpusFile model
    field_names = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title'
    )
    edited_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    is_owner = edited_file.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in field_names:
            setattr(edited_file, field_name, getattr(form, field_name).data)
        edited_file.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{edited_file.filename}" edited', category='corpus')  # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in field_names:
        getattr(form, field_name).data = getattr(edited_file, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=edited_file.corpus,
        corpus_file=edited_file,
        form=form,
        title='Edit corpus file'
    )
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST'])
@login_required
def add_corpus_file(corpus_id):
    """Show and process the form that adds a VRT file to a corpus.

    Responds with 404 if no corpus has the given id and with 403 if the
    current user is neither the corpus owner nor an administrator.

    Returns:
        The rendered form page when nothing was submitted, or a response
        with status

        * 400 and the form errors if validation failed,
        * 500 and a ``redirect_url`` if saving the upload raised
          ``OSError``,
        * 201 and a ``redirect_url`` to the corpus page on success.
    """
    parent_corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = parent_corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    form = AddCorpusFileForm(prefix='add-corpus-file-form')
    if not form.is_submitted():
        return render_template(
            'corpora/add_corpus_file.html.j2',
            corpus=parent_corpus,
            form=form,
            title='Add corpus file'
        )
    if not form.validate():
        return make_response(form.errors, 400)
    # Save the file
    upload = form.vrt.data
    new_file = CorpusFile(
        address=form.address.data,
        author=form.author.data,
        booktitle=form.booktitle.data,
        chapter=form.chapter.data,
        corpus=parent_corpus,
        editor=form.editor.data,
        filename=upload.filename,
        institution=form.institution.data,
        journal=form.journal.data,
        mimetype='application/vrt+xml',
        pages=form.pages.data,
        publisher=form.publisher.data,
        publishing_year=form.publishing_year.data,
        school=form.school.data,
        title=form.title.data
    )
    db.session.add(new_file)
    db.session.flush(objects=[new_file])
    db.session.refresh(new_file)
    try:
        upload.save(new_file.path)
    except OSError as error:
        current_app.logger.error(error)
        db.session.rollback()
        flash('Internal Server Error', category='error')
        failure_url = url_for('.add_corpus_file', corpus_id=parent_corpus.id)
        return make_response({'redirect_url': failure_url}, 500)
    parent_corpus.status = CorpusStatus.UNPREPARED
    db.session.commit()
    flash(f'Corpus file "{new_file.filename}" added', category='corpus')
    success_url = url_for('.corpus', corpus_id=parent_corpus.id)
    return make_response({'redirect_url': success_url}, 201)
2020-04-06 14:12:22 +02:00
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')
@login_required
def delete_corpus_file(corpus_id, corpus_file_id):
    """Hand a corpus file to ``tasks.delete_corpus_file``.

    Responds with 404 if the corpus has no file with the given id and
    with 403 if the current user is neither the corpus owner nor an
    administrator. Afterwards the user is redirected to the corpus page.
    """
    doomed_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    is_owner = doomed_file.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    # Build the message before the deletion task is invoked, as the
    # filename is read from the corpus file object
    message = f'Corpus file "{doomed_file.filename}" marked for deletion'
    flash(message, category='corpus')
    tasks.delete_corpus_file(corpus_file_id)
    return redirect(url_for('.corpus', corpus_id=corpus_id))
2020-04-06 14:12:22 +02:00
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/download')
@login_required
def download_corpus_file(corpus_id, corpus_file_id):
    """Send a corpus file as an attachment named after its filename.

    Responds with 404 if the corpus has no file with the given id and
    with 403 if the current user is neither the corpus owner nor an
    administrator.
    """
    requested_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    is_owner = requested_file.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    # The file is read from its storage path, but offered to the client
    # under the filename recorded in the database
    storage_dir = os.path.dirname(requested_file.path)
    storage_name = os.path.basename(requested_file.path)
    return send_from_directory(
        directory=storage_dir,
        filename=storage_name,
        as_attachment=True,
        attachment_filename=requested_file.filename
    )