nopaque/app/corpora/routes.py

289 lines
11 KiB
Python
Raw Normal View History

2021-11-16 14:23:57 +00:00
from flask import (abort, current_app, flash, make_response, redirect,
2020-04-06 12:12:22 +00:00
render_template, url_for, send_from_directory)
from flask_login import current_user, login_required
from . import bp
from . import tasks
2021-11-16 14:23:57 +00:00
from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm,
ImportCorpusForm)
from .. import db
2021-11-16 14:23:57 +00:00
from ..models import Corpus, CorpusFile
2020-04-06 12:12:22 +00:00
import os
import shutil
import glob
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from .import_corpus import check_zip_contents
2020-04-06 12:12:22 +00:00
@bp.route('/add', methods=['GET', 'POST'])
@login_required
def add_corpus():
    """Show the "add corpus" form and create a corpus on valid submission.

    On a valid submission a ``Corpus`` owned by the current user is added to
    the database session and its directory is created on disk. If the
    directory cannot be created the session is rolled back and the request
    is aborted with HTTP 500. Otherwise the session is committed and the
    client is redirected to the page of the new corpus.
    """
    form = AddCorpusForm(prefix='add-corpus-form')
    if not form.validate_on_submit():
        # Nothing submitted (or validation failed): just show the form.
        return render_template('corpora/add_corpus.html.j2', form=form,
                               title='Add corpus')
    new_corpus = Corpus(
        user=current_user,
        description=form.description.data,
        title=form.title.data
    )
    db.session.add(new_corpus)
    # Flush and refresh before touching ``path``.
    # NOTE(review): presumably ``path`` depends on database-generated
    # values such as the id - confirm against the model.
    db.session.flush()
    db.session.refresh(new_corpus)
    try:
        os.makedirs(new_corpus.path)
    except OSError as e:
        current_app.logger.error(f'Could not add corpus: {e}')
        db.session.rollback()
        flash('Internal Server Error', 'error')
        abort(500)
    # abort() raises, so this point is only reached if makedirs succeeded.
    db.session.commit()
    flash(f'Corpus "{new_corpus.title}" added!', 'corpus')
    return redirect(url_for('.corpus', corpus_id=new_corpus.id))
@bp.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
    """Import a previously exported corpus from an uploaded zip archive.

    The view is currently disabled: the first statement aborts every request
    with HTTP 503, so the code below it is not executed. It is kept so the
    feature can be re-enabled by removing that call.

    When enabled, a submitted and valid form creates a ``Corpus`` for the
    current user, stores the uploaded archive in the corpus directory and
    checks that the archive contains every entry of ``check_zip_contents``.
    A valid archive is unpacked, one ``CorpusFile`` is registered per
    ``*.vrt`` file (metadata taken from the attributes of its ``text``
    element) and a JSON body with a ``redirect_url`` to the corpus page is
    returned. An invalid archive leads to deletion of the corpus and a
    ``redirect_url`` back to the import form.
    """
    # Feature switched off; see docstring.
    abort(503)
    form = ImportCorpusForm()
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        corpus = Corpus(
            user=current_user,
            description=form.description.data,
            title=form.title.data
        )
        db.session.add(corpus)
        db.session.flush()
        db.session.refresh(corpus)
        try:
            os.makedirs(corpus.path)
        except OSError as e:
            current_app.logger.error(f'Could not import corpus: {e}')
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response(
                {'redirect_url': url_for('.import_corpus')}, 500)
        # Upload zip
        # NOTE(review): the client-supplied filename is used as is here;
        # confirm that ImportCorpusForm sanitizes it before re-enabling.
        archive_file = os.path.join(corpus.path, form.file.data.filename)
        form.file.data.save(archive_file)
        # Some checks to verify it is a valid exported corpus
        # (the handle is not called ``zip`` to avoid shadowing the builtin)
        with ZipFile(archive_file, 'r') as archive:
            contents = archive.namelist()
        if set(check_zip_contents).issubset(contents):
            # Unzip
            # NOTE(review): unpack_archive on an uploaded archive should be
            # checked for path traversal before this view is re-enabled.
            shutil.unpack_archive(archive_file, corpus.path)
            # Register vrt files to corpus
            vrts = glob.glob(os.path.join(corpus.path, '*.vrt'))
            for file in vrts:
                element_tree = ET.parse(file)
                text_node = element_tree.find('text')
                corpus_file = CorpusFile(
                    address=text_node.get('address', 'NULL'),
                    author=text_node.get('author', 'NULL'),
                    booktitle=text_node.get('booktitle', 'NULL'),
                    chapter=text_node.get('chapter', 'NULL'),
                    corpus=corpus,
                    editor=text_node.get('editor', 'NULL'),
                    filename=os.path.basename(file),
                    institution=text_node.get('institution', 'NULL'),
                    journal=text_node.get('journal', 'NULL'),
                    pages=text_node.get('pages', 'NULL'),
                    publisher=text_node.get('publisher', 'NULL'),
                    publishing_year=text_node.get('publishing_year', ''),
                    school=text_node.get('school', 'NULL'),
                    title=text_node.get('title', 'NULL')
                )
                db.session.add(corpus_file)
            # finish import and redirect to imported corpus
            corpus.status = 'prepared'
            db.session.commit()
            os.remove(archive_file)
            flash(f'Corpus "{corpus.title}" imported!', 'corpus')
            return make_response(
                {'redirect_url': url_for('.corpus', corpus_id=corpus.id)},
                201
            )
        else:
            # If imported zip is not valid delete corpus and give feedback.
            # The message previously contained a literal, never formatted
            # "{}" placeholder; it now names the corpus.
            flash(
                f'Corpus "{corpus.title}" not imported: '
                'Invalid archive file!',
                'error'
            )
            tasks.delete_corpus(corpus.id)
            # NOTE(review): 201 is returned although nothing was created;
            # kept because the client side handling is not visible here.
            return make_response(
                {'redirect_url': url_for('.import_corpus')}, 201)
    return render_template('corpora/import_corpus.html.j2', form=form,
                           title='Import Corpus')
2021-11-30 15:22:16 +00:00
@bp.route('/<hashid:corpus_id>')
@login_required
def corpus(corpus_id):
    """Render the overview page of a single corpus.

    Responds with 404 if no corpus with ``corpus_id`` exists and with 403 if
    the current user is neither the owner of the corpus nor an
    administrator. The corpus files are handed to the template as a list of
    dictionaries produced by ``to_dict()``.
    """
    requested = Corpus.query.get_or_404(corpus_id)
    is_owner = requested.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    file_dicts = []
    for entry in requested.files:
        file_dicts.append(entry.to_dict())
    return render_template(
        'corpora/corpus.html.j2',
        corpus=requested,
        corpus_files=file_dicts,
        title='Corpus'
    )
2021-11-30 15:22:16 +00:00
@bp.route('/<hashid:corpus_id>/analyse')
@login_required
def analyse_corpus(corpus_id):
    """Render the analysis page of a corpus.

    Responds with 404 if no corpus with ``corpus_id`` exists and with 403 if
    the current user is neither the owner of the corpus nor an
    administrator.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    # Same access rule as the other per-corpus views in this module. Without
    # it any logged in user could open the analysis page (and see the
    # title) of a corpus that belongs to someone else.
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return render_template(
        'corpora/analyse_corpus.html.j2',
        corpus=corpus,
        title=f'Analyse Corpus {corpus.title}'
    )
2021-11-30 15:22:16 +00:00
@bp.route('/<hashid:corpus_id>/download')
@login_required
def download_corpus(corpus_id):
    """Send the archive file of a corpus as a download.

    The view is currently disabled: the first statement aborts every request
    with HTTP 503, so the code below it is not executed.

    When enabled, it responds with 404 for an unknown ``corpus_id`` and with
    403 if the current user is neither the owner of the corpus nor an
    administrator. Otherwise ``corpus.archive_file`` is served as an
    attachment from the ``corpora`` directory below the owner's path.
    """
    # Feature switched off; see docstring.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    # Directory and file name are passed positionally: the keyword for the
    # file name was renamed from ``filename`` to ``path`` in Flask 2.0, so
    # positional arguments work before and after that change.
    return send_from_directory(
        os.path.join(corpus.user.path, 'corpora'),
        corpus.archive_file,
        as_attachment=True,
        # 'zip' alone is not a valid MIME type.
        mimetype='application/zip'
    )
2021-11-30 15:22:16 +00:00
@bp.route('/<hashid:corpus_id>/delete')
@login_required
def delete_corpus(corpus_id):
    """Hand a corpus over to the deletion task and return to the dashboard.

    Responds with 404 for an unknown ``corpus_id`` and with 403 if the
    current user is neither the owner of the corpus nor an administrator.
    """
    target = Corpus.query.get_or_404(corpus_id)
    may_delete = (
        target.user == current_user or current_user.is_administrator()
    )
    if not may_delete:
        abort(403)
    # The message is flashed before the task is triggered.
    message = f'Corpus "{target.title}" marked for deletion!'
    flash(message, 'corpus')
    tasks.delete_corpus(corpus_id)
    dashboard_url = url_for('main.dashboard')
    return redirect(dashboard_url)
2021-11-30 15:22:16 +00:00
@bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST'])
@login_required
def add_corpus_file(corpus_id):
    """Show the "add corpus file" form and register an uploaded file.

    Responds with 404 for an unknown ``corpus_id`` and with 403 if the
    current user is neither the owner of the corpus nor an administrator.
    A submitted but invalid form is answered with the form errors and status
    400. A valid submission stores the upload in the corpus directory,
    creates the ``CorpusFile`` record, sets the corpus status to
    ``'unprepared'`` and answers with status 201 and a ``redirect_url`` to
    the corpus page.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    form = AddCorpusFileForm(corpus, prefix='add-corpus-file-form')
    if not form.is_submitted():
        return render_template(
            'corpora/add_corpus_file.html.j2',
            corpus=corpus,
            form=form,
            title='Add corpus file'
        )
    if not form.validate():
        return make_response(form.errors, 400)
    upload = form.file.data
    # Save the file
    # NOTE(review): the filename comes from the upload unchanged in this
    # view; confirm that the form validation sanitizes it.
    destination = os.path.join(corpus.path, upload.filename)
    upload.save(destination)
    new_file = CorpusFile(
        address=form.address.data,
        author=form.author.data,
        booktitle=form.booktitle.data,
        chapter=form.chapter.data,
        corpus=corpus,
        editor=form.editor.data,
        filename=upload.filename,
        institution=form.institution.data,
        journal=form.journal.data,
        pages=form.pages.data,
        publisher=form.publisher.data,
        publishing_year=form.publishing_year.data,
        school=form.school.data,
        title=form.title.data
    )
    db.session.add(new_file)
    corpus.status = 'unprepared'
    db.session.commit()
    flash(f'Corpus file "{new_file.filename}" added!', 'corpus')
    payload = {'redirect_url': url_for('.corpus', corpus_id=corpus.id)}
    return make_response(payload, 201)
2020-04-06 12:12:22 +00:00
2021-11-30 15:22:16 +00:00
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')
@login_required
def delete_corpus_file(corpus_id, corpus_file_id):
    """Hand a corpus file over to the deletion task.

    Responds with 404 if the file does not exist or does not belong to the
    corpus named in the URL, and with 403 if the current user is neither the
    owner of that corpus nor an administrator. Afterwards the client is
    redirected to the corpus page.
    """
    doomed = CorpusFile.query.get_or_404(corpus_file_id)
    if doomed.corpus_id != corpus_id:
        abort(404)
    is_owner = doomed.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    message = f'Corpus file "{doomed.filename}" marked for deletion!'
    flash(message, 'corpus')
    tasks.delete_corpus_file(corpus_file_id)
    return redirect(url_for('.corpus', corpus_id=corpus_id))
2020-04-06 12:12:22 +00:00
2021-11-30 15:22:16 +00:00
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/download')
@login_required
def download_corpus_file(corpus_id, corpus_file_id):
    """Send a single corpus file as a download.

    Responds with 404 if the file does not exist or does not belong to the
    corpus named in the URL, and with 403 if the current user is neither the
    owner of that corpus nor an administrator.
    """
    corpus_file = CorpusFile.query.get_or_404(corpus_file_id)
    if not corpus_file.corpus_id == corpus_id:
        abort(404)
    if not (corpus_file.corpus.user == current_user
            or current_user.is_administrator()):
        abort(403)
    # Directory and file name are passed positionally: the keyword for the
    # file name was renamed from ``filename`` to ``path`` in Flask 2.0, so
    # positional arguments work before and after that change.
    return send_from_directory(
        os.path.dirname(corpus_file.path),
        corpus_file.filename,
        as_attachment=True
    )
2021-11-30 15:22:16 +00:00
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the form for editing the metadata of a corpus file.

    Responds with 404 if the corpus or the file does not exist or the file
    does not belong to the corpus, and with 403 if the current user is
    neither the owner of the corpus nor an administrator. A valid submission
    copies the form values onto the file, sets the corpus status to
    ``'unprepared'``, commits and redirects to the corpus page. In every
    other case the form is filled with the stored values and rendered.
    """
    # Metadata fields that exist under the same name on the form and on the
    # corpus file, in the order in which they are copied.
    field_names = (
        'address', 'author', 'booktitle', 'chapter', 'editor',
        'institution', 'journal', 'pages', 'publisher', 'publishing_year',
        'school', 'title'
    )
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    file_record = CorpusFile.query.get_or_404(corpus_file_id)
    if file_record.corpus != corpus:
        abort(404)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for name in field_names:
            setattr(file_record, name, getattr(form, name).data)
        corpus.status = 'unprepared'
        db.session.commit()
        flash(f'Corpus file "{file_record.filename}" edited!', 'corpus')
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for name in field_names:
        getattr(form, name).data = getattr(file_record, name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=corpus,
        corpus_file=file_record,
        form=form,
        title='Edit corpus file'
    )
2020-04-06 12:12:22 +00:00
2021-12-08 13:45:05 +00:00
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Trigger the build task for a corpus that has at least one file.

    Responds with 404 for an unknown ``corpus_id`` and with 403 if the
    current user is neither the owner of the corpus nor an administrator.
    A corpus without files is not built; an error message is flashed
    instead. In both cases the client is redirected to the corpus page.
    """
    target = Corpus.query.get_or_404(corpus_id)
    is_owner = target.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    if not target.files.all():
        flash(f'Can\'t build corpus "{target.title}": No corpus file(s)!', 'error')  # noqa
    else:
        tasks.build_corpus(corpus_id)
        flash(f'Corpus "{target.title}" marked for building!', 'corpus')
    return redirect(url_for('.corpus', corpus_id=corpus_id))