2022-02-08 11:26:20 +00:00
|
|
|
from app import db
|
|
|
|
from app.models import Corpus, CorpusFile, CorpusStatus
|
|
|
|
from flask import (
|
|
|
|
abort,
|
|
|
|
current_app,
|
|
|
|
flash,
|
|
|
|
make_response,
|
|
|
|
redirect,
|
|
|
|
render_template,
|
|
|
|
url_for,
|
|
|
|
send_from_directory
|
|
|
|
)
|
2020-04-06 12:12:22 +00:00
|
|
|
from flask_login import current_user, login_required
|
2022-02-03 11:39:16 +00:00
|
|
|
from werkzeug.utils import secure_filename
|
2022-02-08 11:26:20 +00:00
|
|
|
from zipfile import ZipFile
|
2021-09-13 09:45:43 +00:00
|
|
|
from . import bp
|
2020-04-21 16:34:21 +00:00
|
|
|
from . import tasks
|
2022-02-08 11:26:20 +00:00
|
|
|
from .forms import (
|
|
|
|
AddCorpusFileForm,
|
|
|
|
AddCorpusForm,
|
|
|
|
EditCorpusFileForm,
|
|
|
|
ImportCorpusForm
|
|
|
|
)
|
|
|
|
from .import_corpus import check_zip_contents
|
2020-04-06 12:12:22 +00:00
|
|
|
import os
|
2020-10-29 14:20:30 +00:00
|
|
|
import shutil
|
|
|
|
import glob
|
|
|
|
import xml.etree.ElementTree as ET
|
2020-04-06 12:12:22 +00:00
|
|
|
|
|
|
|
|
2021-09-13 09:45:43 +00:00
|
|
|
@bp.route('/add', methods=['GET', 'POST'])
@login_required
def add_corpus():
    """Show the "add corpus" form and create a corpus from a valid submit.

    Without a valid submission the form page is rendered. With one, a
    ``Corpus`` owned by the current user is added to the session and
    flushed, then ``corpus.makedirs()`` is called. If that raises
    ``OSError`` the error is logged, the session is rolled back and the
    request is aborted with 500. Otherwise the session is committed and
    the client is redirected to the new corpus' page.
    """
    form = AddCorpusForm(prefix='add-corpus-form')
    if not form.validate_on_submit():
        return render_template(
            'corpora/add_corpus.html.j2',
            form=form,
            title='Add corpus'
        )
    new_corpus = Corpus(
        user=current_user,
        description=form.description.data,
        title=form.title.data
    )
    db.session.add(new_corpus)
    # Flush and refresh before makedirs() so the instance carries its
    # database-generated values while the transaction is still open.
    db.session.flush()
    db.session.refresh(new_corpus)
    try:
        new_corpus.makedirs()
    except OSError as error:
        current_app.logger.error(error)
        db.session.rollback()
        flash('Internal Server Error', category='error')
        abort(500)
    db.session.commit()
    flash(f'Corpus "{new_corpus.title}" added', category='corpus')
    corpus_url = url_for('.corpus', corpus_id=new_corpus.id)
    return redirect(corpus_url)
|
2020-04-06 12:12:22 +00:00
|
|
|
|
|
|
|
|
2022-02-08 11:26:20 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send a corpus' archive file as a download (currently disabled).

    The very first statement aborts with 503, so every request ends there
    and the statements after it do not run. They are kept as they describe
    the intended behaviour: look the corpus up (404 if missing), allow only
    its owner or an administrator (403 otherwise) and send
    ``corpus.archive_file`` from the owner's ``corpora`` directory.

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    """
    # Feature switch: everything below is unreachable while this is here.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    owns_corpus = corpus.user == current_user
    if not owns_corpus and not current_user.is_administrator():
        abort(403)
    archive_directory = os.path.join(corpus.user.path, 'corpora')
    return send_from_directory(
        directory=archive_directory,
        filename=corpus.archive_file,
        as_attachment=True,
        mimetype='zip'
    )
|
|
|
|
|
|
|
|
|
2021-09-13 09:45:43 +00:00
|
|
|
@bp.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
    """Create a corpus from an uploaded archive of an exported corpus.

    The endpoint is currently switched off: the first statement aborts
    every request with 503, so the import logic below it does not run.

    When enabled:

    * no submission: the import form page is rendered;
    * submission failing validation: the form errors are returned with
      status 400;
    * otherwise a ``Corpus`` owned by the current user is added and
      flushed, its directory is created and the upload is stored in it.
      If the archive lists every name in ``check_zip_contents`` it is
      unpacked, one ``CorpusFile`` is registered per ``*.vrt`` file found
      in the corpus directory, the corpus status is set to
      ``CorpusStatus.BUILT`` and the session is committed. Any other
      upload (including one that is not a zip file at all) is reported as
      an invalid archive and the corpus is passed to
      ``tasks.delete_corpus``.

    Responses to submissions are JSON objects with a ``redirect_url`` key
    (or the form errors), not HTML redirects.
    """
    # Imported locally so this view does not depend on a change to the
    # module's import block.
    from zipfile import BadZipFile

    # Feature switch: everything below is unreachable while this is here.
    abort(503)
    form = ImportCorpusForm()
    if not form.is_submitted():
        return render_template(
            'corpora/import_corpus.html.j2',
            form=form,
            title='Import Corpus'
        )
    if not form.validate():
        return make_response(form.errors, 400)
    corpus = Corpus(
        user=current_user,
        description=form.description.data,
        title=form.title.data
    )
    db.session.add(corpus)
    db.session.flush()
    db.session.refresh(corpus)
    try:
        os.makedirs(corpus.path)
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        flash('Internal Server Error', category='error')
        return make_response({'redirect_url': url_for('.import_corpus')}, 500)  # noqa
    # Upload zip. The client-supplied filename is untrusted: sanitise it
    # so it cannot point outside the corpus directory. The archive is only
    # a temporary file, so a fixed fallback name is fine when nothing
    # usable is left after sanitising.
    archive_name = secure_filename(form.file.data.filename) or 'corpus.zip'
    archive_path = os.path.join(corpus.path, archive_name)
    form.file.data.save(archive_path)
    # Some checks to verify it is a valid exported corpus. An upload that
    # is not a zip file is treated like an archive with the wrong contents
    # instead of surfacing as an unhandled exception.
    try:
        with ZipFile(archive_path, 'r') as archive:
            archive_members = archive.namelist()
    except BadZipFile:
        archive_members = None
    is_valid_archive = (
        archive_members is not None
        and set(check_zip_contents).issubset(archive_members)
    )
    if not is_valid_archive:
        # If imported zip is not valid delete corpus and give feedback
        flash(
            f'Can\'t import corpus "{corpus.title}": Invalid archive file',
            category='error'
        )
        tasks.delete_corpus(corpus.id)
        # NOTE(review): 201 is what the original returned here as well;
        # kept in case the client relies on it - confirm.
        return make_response({'redirect_url': url_for('.import_corpus')}, 201)  # noqa
    # Unzip
    # NOTE(review): the archive comes from the user; only its member names
    # were checked against check_zip_contents before unpacking.
    shutil.unpack_archive(archive_path, corpus.path)
    # Register vrt files to corpus
    for vrt_path in glob.glob(corpus.path + '/*.vrt'):
        # NOTE(review): find() returns None when there is no matching
        # <text> element, which would make the .get() calls below fail.
        text_node = ET.parse(vrt_path).find('text')
        corpus_file = CorpusFile(
            address=text_node.get('address', 'NULL'),
            author=text_node.get('author', 'NULL'),
            booktitle=text_node.get('booktitle', 'NULL'),
            chapter=text_node.get('chapter', 'NULL'),
            corpus=corpus,
            editor=text_node.get('editor', 'NULL'),
            filename=os.path.basename(vrt_path),
            institution=text_node.get('institution', 'NULL'),
            journal=text_node.get('journal', 'NULL'),
            pages=text_node.get('pages', 'NULL'),
            publisher=text_node.get('publisher', 'NULL'),
            publishing_year=text_node.get('publishing_year', ''),
            school=text_node.get('school', 'NULL'),
            title=text_node.get('title', 'NULL')
        )
        db.session.add(corpus_file)
    # finish import and redirect to imported corpus
    corpus.status = CorpusStatus.BUILT
    db.session.commit()
    os.remove(archive_path)
    flash(f'Corpus "{corpus.title}" imported', 'corpus')
    return make_response(
        {'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
|
2020-10-29 14:20:30 +00:00
|
|
|
|
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>')
@login_required
def corpus(corpus_id):
    """Render the detail page of a single corpus.

    Responds with 404 when no corpus has the given id and with 403 when
    the current user is neither the corpus' owner nor an administrator.

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    """
    requested_corpus = Corpus.query.get_or_404(corpus_id)
    owns_corpus = requested_corpus.user == current_user
    if not owns_corpus and not current_user.is_administrator():
        abort(403)
    return render_template(
        'corpora/corpus.html.j2',
        corpus=requested_corpus,
        title='Corpus'
    )
|
2020-03-28 18:29:19 +00:00
|
|
|
|
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>/analyse')
@login_required
def analyse_corpus(corpus_id):
    """Render the analysis page of a corpus.

    Responds with 404 when no corpus has the given id and with 403 when
    the current user is neither the corpus' owner nor an administrator.

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    # Apply the same access rule as the other corpus views in this module;
    # without it any logged-in user could open another user's corpus.
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return render_template(
        'corpora/analyse_corpus.html.j2',
        corpus=corpus,
        title=f'Analyse Corpus {corpus.title}'
    )
|
|
|
|
|
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>/delete')
@login_required
def delete_corpus(corpus_id):
    """Hand a corpus over for deletion and go back to the dashboard.

    Responds with 404 when no corpus has the given id and with 403 when
    the current user is neither the corpus' owner nor an administrator.
    The actual deletion is delegated to ``tasks.delete_corpus``.

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    """
    doomed_corpus = Corpus.query.get_or_404(corpus_id)
    owns_corpus = doomed_corpus.user == current_user
    if not owns_corpus and not current_user.is_administrator():
        abort(403)
    # The message is queued before the task is started, as the title is
    # read from the still existing corpus.
    flash(
        f'Corpus "{doomed_corpus.title}" marked for deletion',
        category='corpus'
    )
    tasks.delete_corpus(corpus_id)
    dashboard_url = url_for('main.dashboard')
    return redirect(dashboard_url)
|
|
|
|
|
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST'])
@login_required
def add_corpus_file(corpus_id):
    """Show the "add corpus file" form and store a submitted file.

    Responds with 404 when no corpus has the given id and with 403 when
    the current user is neither the corpus' owner nor an administrator.

    Without a submission the form page is rendered. A submission that
    fails validation is answered with the form errors and status 400.
    A valid one creates a ``CorpusFile`` from the form's metadata, saves
    the upload at ``corpus_file.path``, sets the corpus status to
    ``CorpusStatus.UNPREPARED`` and commits. Submissions are answered with
    a JSON object holding a ``redirect_url`` (status 201 on success, 500
    if saving the upload raised ``OSError``).

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    owns_corpus = corpus.user == current_user
    if not owns_corpus and not current_user.is_administrator():
        abort(403)
    form = AddCorpusFileForm(prefix='add-corpus-file-form')
    if not form.is_submitted():
        return render_template(
            'corpora/add_corpus_file.html.j2',
            corpus=corpus,
            form=form,
            title='Add corpus file'
        )
    if not form.validate():
        return make_response(form.errors, 400)
    # Bibliographic form fields that map one-to-one onto CorpusFile
    # attributes of the same name.
    metadata_fields = (
        'address', 'author', 'booktitle', 'chapter', 'editor', 'institution',
        'journal', 'pages', 'publisher', 'publishing_year', 'school', 'title'
    )
    metadata = {name: getattr(form, name).data for name in metadata_fields}
    # Save the file
    upload = form.file.data
    new_file = CorpusFile(
        corpus=corpus,
        filename=secure_filename(upload.filename),
        mimetype='application/vrt+xml',
        **metadata
    )
    db.session.add(new_file)
    # Flush only the new row and reload it before its path is used.
    db.session.flush(objects=[new_file])
    db.session.refresh(new_file)
    try:
        upload.save(new_file.path)
    except OSError as error:
        current_app.logger.error(error)
        db.session.rollback()
        flash('Internal Server Error', category='error')
        retry_url = url_for('.add_corpus_file', corpus_id=corpus.id)
        return make_response({'redirect_url': retry_url}, 500)
    corpus.status = CorpusStatus.UNPREPARED
    db.session.commit()
    flash(f'Corpus file "{new_file.filename}" added', category='corpus')
    corpus_url = url_for('.corpus', corpus_id=corpus.id)
    return make_response({'redirect_url': corpus_url}, 201)
|
2020-04-06 12:12:22 +00:00
|
|
|
|
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')
@login_required
def delete_corpus_file(corpus_id, corpus_file_id):
    """Hand a corpus file over for deletion and return to its corpus.

    Responds with 404 when the corpus has no file with the given id and
    with 403 when the current user is neither the owner of the file's
    corpus nor an administrator. The actual deletion is delegated to
    ``tasks.delete_corpus_file``.

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    :param corpus_file_id: corpus file identifier matched by the
        ``hashid`` URL converter
    """
    # Both ids must match so a file cannot be addressed through a corpus
    # it does not belong to.
    file_query = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    )
    doomed_file = file_query.first_or_404()
    owns_file = doomed_file.corpus.user == current_user
    if not owns_file and not current_user.is_administrator():
        abort(403)
    message = f'Corpus file "{doomed_file.filename}" marked for deletion'
    flash(message, category='corpus')
    tasks.delete_corpus_file(corpus_file_id)
    corpus_url = url_for('.corpus', corpus_id=corpus_id)
    return redirect(corpus_url)
|
2020-04-06 12:12:22 +00:00
|
|
|
|
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/download')
@login_required
def download_corpus_file(corpus_id, corpus_file_id):
    """Send a corpus file to the client as an attachment.

    Responds with 404 when the corpus has no file with the given id and
    with 403 when the current user is neither the owner of the file's
    corpus nor an administrator. The file is read from
    ``corpus_file.path`` and offered under ``corpus_file.filename``.

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    :param corpus_file_id: corpus file identifier matched by the
        ``hashid`` URL converter
    """
    file_query = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    )
    requested_file = file_query.first_or_404()
    owns_file = requested_file.corpus.user == current_user
    if not owns_file and not current_user.is_administrator():
        abort(403)
    # Split the storage path into the directory to serve from and the name
    # of the file inside it.
    storage_directory, stored_name = os.path.split(requested_file.path)
    # NOTE(review): `attachment_filename` and `filename` are the keyword
    # names used by the original call; newer Flask releases renamed them -
    # confirm against the Flask version this project pins.
    return send_from_directory(
        directory=storage_directory,
        filename=stored_name,
        as_attachment=True,
        attachment_filename=requested_file.filename
    )
|
2020-04-06 12:12:22 +00:00
|
|
|
|
|
|
|
|
2022-02-08 11:26:20 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the edit form for a corpus file's metadata.

    Responds with 404 when the corpus has no file with the given id and
    with 403 when the current user is neither the owner of the file's
    corpus nor an administrator.

    A valid submission copies the form's metadata onto the corpus file,
    sets the corpus status to ``CorpusStatus.UNPREPARED``, commits and
    redirects to the corpus page. In every other case the form fields are
    filled with the file's stored values and the edit page is rendered.

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    :param corpus_file_id: corpus file identifier matched by the
        ``hashid`` URL converter
    """
    # Metadata attributes that exist under the same name on the form and
    # on the CorpusFile model.
    metadata_fields = (
        'address', 'author', 'booktitle', 'chapter', 'editor', 'institution',
        'journal', 'pages', 'publisher', 'publishing_year', 'school', 'title'
    )
    file_query = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    )
    file_record = file_query.first_or_404()
    owns_file = file_record.corpus.user == current_user
    if not owns_file and not current_user.is_administrator():
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(file_record, field_name, getattr(form, field_name).data)
        file_record.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(
            f'Corpus file "{file_record.filename}" edited',
            category='corpus'
        )
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(file_record, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=file_record.corpus,
        corpus_file=file_record,
        form=form,
        title='Edit corpus file'
    )
|
2020-04-06 12:12:22 +00:00
|
|
|
|
|
|
|
|
2021-12-08 13:45:05 +00:00
|
|
|
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Hand a corpus over for building and return to its page.

    Responds with 404 when no corpus has the given id and with 403 when
    the current user is neither the corpus' owner nor an administrator.
    ``tasks.build_corpus`` is only called when the corpus has at least one
    file; otherwise an error message is flashed instead.

    :param corpus_id: corpus identifier matched by the ``hashid`` URL
        converter
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    owns_corpus = corpus.user == current_user
    if not owns_corpus and not current_user.is_administrator():
        abort(403)
    if corpus.files.all():
        tasks.build_corpus(corpus_id)
        message = f'Corpus "{corpus.title}" marked for building'
        category = 'corpus'
    else:
        message = f'Can\'t build corpus "{corpus.title}": No corpus file(s)'
        category = 'error'
    flash(message, category=category)
    corpus_url = url_for('.corpus', corpus_id=corpus_id)
    return redirect(corpus_url)
|