nopaque/app/corpora/routes.py

347 lines
12 KiB
Python
Raw Normal View History

from app import db
from app.models import Corpus, CorpusFile, CorpusStatus
from flask import (
abort,
current_app,
flash,
make_response,
redirect,
render_template,
url_for,
send_from_directory
)
2020-04-06 14:12:22 +02:00
from flask_login import current_user, login_required
from werkzeug.utils import secure_filename
from zipfile import ZipFile
from . import bp
from . import tasks
from .forms import (
AddCorpusFileForm,
AddCorpusForm,
EditCorpusFileForm,
ImportCorpusForm
)
from .import_corpus import check_zip_contents
2020-04-06 14:12:22 +02:00
import os
import shutil
import glob
import xml.etree.ElementTree as ET
2020-04-06 14:12:22 +02:00
@bp.route('/add', methods=['GET', 'POST'])
@login_required
def add_corpus():
    """Show the "Add corpus" form and create a corpus on a valid submit.

    On GET, or when the submitted form does not validate, the form page is
    rendered. On a valid submit a ``Corpus`` owned by the current user is
    created, its directories are made, and the client is redirected to the
    new corpus page. If creating the directories raises ``OSError`` the
    database session is rolled back and the request is aborted with 500.
    """
    form = AddCorpusForm(prefix='add-corpus-form')
    if not form.validate_on_submit():
        return render_template(
            'corpora/add_corpus.html.j2',
            form=form,
            title='Add corpus'
        )
    new_corpus = Corpus(
        user=current_user,
        description=form.description.data,
        title=form.title.data
    )
    db.session.add(new_corpus)
    # Flush + refresh before touching the file system; presumably this
    # populates database-generated values that makedirs() relies on.
    db.session.flush()
    db.session.refresh(new_corpus)
    try:
        new_corpus.makedirs()
    except OSError as error:
        current_app.logger.error(error)
        db.session.rollback()
        flash('Internal Server Error', category='error')
        abort(500)
    db.session.commit()
    flash(f'Corpus "{new_corpus.title}" added', category='corpus')
    return redirect(url_for('.corpus', corpus_id=new_corpus.id))
2020-04-06 14:12:22 +02:00
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send the archive file of a corpus as a download.

    The route currently always answers 503; the code after the first
    ``abort`` is unreachable until that line is removed.

    :param corpus_id: id of the corpus to export (decoded by the ``hashid``
        URL converter).
    :returns: the archive file as an attachment.
    """
    # NOTE(review): feature looks deliberately switched off - confirm before
    # removing this line.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    # Directory and file name are passed positionally: the keyword for the
    # second argument was renamed from ``filename`` to ``path`` in Flask 2.0
    # and the old name was removed in 2.2, while the position is stable.
    return send_from_directory(
        os.path.join(corpus.user.path, 'corpora'),
        corpus.archive_file,
        as_attachment=True,
        mimetype='application/zip'  # 'zip' alone is not a valid MIME type
    )
@bp.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
    """Create a corpus from an uploaded, previously exported zip archive.

    The route currently always answers 503; the code after the first
    ``abort`` is unreachable until that line is removed.

    On a submitted and valid form a ``Corpus`` is created, the uploaded
    archive is stored in the corpus directory and checked against
    ``check_zip_contents``. A valid archive is unpacked, every ``*.vrt`` file
    in the corpus directory is registered as a ``CorpusFile`` and the corpus
    is marked as built. An invalid archive leads to the corpus being handed
    to ``tasks.delete_corpus``. Responses to submissions carry a
    ``redirect_url`` for the client.
    """
    # NOTE(review): feature looks deliberately switched off - confirm before
    # removing this line.
    abort(503)
    form = ImportCorpusForm()
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        corpus = Corpus(
            user=current_user,
            description=form.description.data,
            title=form.title.data
        )
        db.session.add(corpus)
        db.session.flush()
        db.session.refresh(corpus)
        try:
            os.makedirs(corpus.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', category='error')
            return make_response(
                {'redirect_url': url_for('.import_corpus')},
                500
            )
        # Upload zip. The client-supplied name is sanitised so that it cannot
        # point outside of the corpus directory; fall back to a fixed name if
        # nothing usable remains after sanitising.
        archive_name = (
            secure_filename(form.file.data.filename) or 'corpus.zip'
        )
        archive_file = os.path.join(corpus.path, archive_name)
        form.file.data.save(archive_file)
        # Some checks to verify it is a valid exported corpus
        with ZipFile(archive_file, 'r') as archive:
            contents = archive.namelist()
        if set(check_zip_contents).issubset(contents):
            # Unzip. The format is given explicitly because the file was just
            # read as a zip, whatever extension the uploaded name had.
            shutil.unpack_archive(archive_file, corpus.path, 'zip')
            # Register vrt files to corpus
            vrts = glob.glob(corpus.path + '/*.vrt')
            for file in vrts:
                element_tree = ET.parse(file)
                # NOTE(review): assumes every vrt file has a <text> element
                # directly below its root; find() returns None otherwise and
                # the attribute lookups below would fail - confirm.
                text_node = element_tree.find('text')
                corpus_file = CorpusFile(
                    address=text_node.get('address', 'NULL'),
                    author=text_node.get('author', 'NULL'),
                    booktitle=text_node.get('booktitle', 'NULL'),
                    chapter=text_node.get('chapter', 'NULL'),
                    corpus=corpus,
                    editor=text_node.get('editor', 'NULL'),
                    filename=os.path.basename(file),
                    institution=text_node.get('institution', 'NULL'),
                    journal=text_node.get('journal', 'NULL'),
                    pages=text_node.get('pages', 'NULL'),
                    publisher=text_node.get('publisher', 'NULL'),
                    publishing_year=text_node.get('publishing_year', ''),
                    school=text_node.get('school', 'NULL'),
                    title=text_node.get('title', 'NULL')
                )
                db.session.add(corpus_file)
            # finish import and redirect to imported corpus
            corpus.status = CorpusStatus.BUILT
            db.session.commit()
            os.remove(archive_file)
            flash(f'Corpus "{corpus.title}" imported', 'corpus')
            return make_response(
                {'redirect_url': url_for('.corpus', corpus_id=corpus.id)},
                201
            )
        else:
            # If imported zip is not valid delete corpus and give feedback
            flash(
                f'Can\'t import corpus "{corpus.title}": Invalid archive file',
                category='error'
            )
            tasks.delete_corpus(corpus.id)
            return make_response(
                {'redirect_url': url_for('.import_corpus')},
                201
            )
    return render_template(
        'corpora/import_corpus.html.j2',
        form=form,
        title='Import Corpus'
    )
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>')
@login_required
def corpus(corpus_id):
    """Render the overview page of a single corpus.

    Answers 404 if no corpus with this id exists and 403 if the current user
    is neither the owner of the corpus nor an administrator.
    """
    requested_corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = requested_corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    return render_template(
        'corpora/corpus.html.j2',
        corpus=requested_corpus,
        title='Corpus'
    )
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/analyse')
@login_required
def analyse_corpus(corpus_id):
    """Render the analysis page of a corpus.

    Answers 404 if no corpus with this id exists and 403 if the current user
    is neither the owner of the corpus nor an administrator.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    # Same access rule as every other corpus route in this module; without
    # it any logged-in user could open the analysis page of any corpus.
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return render_template(
        'corpora/analyse_corpus.html.j2',
        corpus=corpus,
        title=f'Analyse Corpus {corpus.title}'
    )
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/delete')
@login_required
def delete_corpus(corpus_id):
    """Hand a corpus over to ``tasks.delete_corpus`` and go to the dashboard.

    Answers 404 if no corpus with this id exists and 403 if the current user
    is neither the owner of the corpus nor an administrator.
    """
    doomed = Corpus.query.get_or_404(corpus_id)
    may_delete = (
        doomed.user == current_user
        or current_user.is_administrator()
    )
    if not may_delete:
        abort(403)
    # The message is flashed before the deletion task is triggered.
    flash(f'Corpus "{doomed.title}" marked for deletion', 'corpus')
    tasks.delete_corpus(corpus_id)
    dashboard_url = url_for('main.dashboard')
    return redirect(dashboard_url)
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST'])
@login_required
def add_corpus_file(corpus_id):
    """Show the "Add corpus file" form and store an uploaded corpus file.

    Answers 404 if no corpus with this id exists and 403 if the current user
    is neither the owner of the corpus nor an administrator. Without a
    submission the form page is rendered. A submission that fails validation
    is answered with the form errors and status 400. Otherwise a
    ``CorpusFile`` is created, the upload is saved to its path, the corpus is
    set to ``UNPREPARED`` and a ``redirect_url`` is returned with status 201.
    If saving raises ``OSError`` the session is rolled back and a
    ``redirect_url`` is returned with status 500.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    form = AddCorpusFileForm(corpus, prefix='add-corpus-file-form')
    if not form.is_submitted():
        return render_template(
            'corpora/add_corpus_file.html.j2',
            corpus=corpus,
            form=form,
            title='Add corpus file'
        )
    if not form.validate():
        return make_response(form.errors, 400)
    # Save the file
    safe_name = secure_filename(form.file.data.filename)
    new_file = CorpusFile(
        address=form.address.data,
        author=form.author.data,
        booktitle=form.booktitle.data,
        chapter=form.chapter.data,
        corpus=corpus,
        editor=form.editor.data,
        filename=safe_name,
        institution=form.institution.data,
        journal=form.journal.data,
        mimetype='application/vrt+xml',
        pages=form.pages.data,
        publisher=form.publisher.data,
        publishing_year=form.publishing_year.data,
        school=form.school.data,
        title=form.title.data
    )
    db.session.add(new_file)
    # Flush + refresh only the new row before its path is used; presumably
    # the path depends on database-generated values.
    db.session.flush(objects=[new_file])
    db.session.refresh(new_file)
    try:
        form.file.data.save(new_file.path)
    except OSError as error:
        current_app.logger.error(error)
        db.session.rollback()
        flash('Internal Server Error', category='error')
        retry_url = url_for('.add_corpus_file', corpus_id=corpus.id)
        return make_response({'redirect_url': retry_url}, 500)
    corpus.status = CorpusStatus.UNPREPARED
    db.session.commit()
    flash(f'Corpus file "{new_file.filename}" added', category='corpus')
    corpus_url = url_for('.corpus', corpus_id=corpus.id)
    return make_response({'redirect_url': corpus_url}, 201)
2020-04-06 14:12:22 +02:00
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')
@login_required
def delete_corpus_file(corpus_id, corpus_file_id):
    """Hand a corpus file over to ``tasks.delete_corpus_file``.

    Answers 404 if the file does not exist within the given corpus and 403
    if the current user is neither the owner of that corpus nor an
    administrator. Redirects to the corpus page afterwards.
    """
    query = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    )
    target = query.first_or_404()
    is_owner = target.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    # The message is flashed before the deletion task is triggered.
    message = f'Corpus file "{target.filename}" marked for deletion'
    flash(message, category='corpus')
    tasks.delete_corpus_file(corpus_file_id)
    return redirect(url_for('.corpus', corpus_id=corpus_id))
2020-04-06 14:12:22 +02:00
2021-11-30 16:22:16 +01:00
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/download')
@login_required
def download_corpus_file(corpus_id, corpus_file_id):
    """Send a corpus file to the client as a download.

    Answers 404 if the file does not exist within the given corpus and 403
    if the current user is neither the owner of that corpus nor an
    administrator.

    :returns: the stored file as an attachment.
    """
    corpus_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    if not (
        corpus_file.corpus.user == current_user
        or current_user.is_administrator()
    ):
        abort(403)
    # Directory and file name are passed positionally: the keyword for the
    # second argument was renamed from ``filename`` to ``path`` in Flask 2.0
    # and the old name was removed in 2.2, while the position is stable.
    return send_from_directory(
        os.path.dirname(corpus_file.path),
        corpus_file.filename,
        as_attachment=True
    )
2020-04-06 14:12:22 +02:00
@bp.route(
    '/<hashid:corpus_id>/files/<hashid:corpus_file_id>',
    methods=['GET', 'POST']
)
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the form for editing a corpus file's metadata.

    Answers 404 if the file does not exist within the given corpus and 403
    if the current user is neither the owner of that corpus nor an
    administrator. On a valid submit the metadata is copied from the form to
    the corpus file, the corpus is set to ``UNPREPARED`` and the client is
    redirected to the corpus page. Otherwise the form is filled with the
    stored values and rendered.
    """
    corpus_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    if not (
        corpus_file.corpus.user == current_user
        or current_user.is_administrator()
    ):
        abort(403)
    # Metadata attributes that exist under the same name on both the form
    # and the corpus file.
    metadata_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title',
    )
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for name in metadata_fields:
            setattr(corpus_file, name, getattr(form, name).data)
        corpus_file.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(
            f'Corpus file "{corpus_file.filename}" edited',
            category='corpus'
        )
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for name in metadata_fields:
        getattr(form, name).data = getattr(corpus_file, name)
    return render_template(
        'corpora/corpus_file.html.j2',
        # Pass the Corpus object; the bare name ``corpus`` would resolve to
        # the module-level view function here.
        corpus=corpus_file.corpus,
        corpus_file=corpus_file,
        form=form,
        title='Edit corpus file'
    )
2020-04-06 14:12:22 +02:00
2021-12-08 14:45:05 +01:00
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Trigger ``tasks.build_corpus`` for a corpus that has files.

    Answers 404 if no corpus with this id exists and 403 if the current user
    is neither the owner of the corpus nor an administrator. A corpus
    without files is not built; an error message is flashed instead. In both
    cases the client is redirected to the corpus page.
    """
    target = Corpus.query.get_or_404(corpus_id)
    is_owner = target.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    corpus_page = url_for('.corpus', corpus_id=corpus_id)
    if not target.files.all():
        flash(
            f'Can\'t build corpus "{target.title}": No corpus file(s)',
            category='error'
        )
        return redirect(corpus_page)
    tasks.build_corpus(corpus_id)
    flash(
        f'Corpus "{target.title}" marked for building',
        category='corpus'
    )
    return redirect(corpus_page)