nopaque/web/app/corpora/views.py

473 lines
21 KiB
Python
Raw Normal View History

from flask import (abort, current_app, flash, make_response, redirect, request,
2020-04-06 14:12:22 +02:00
render_template, url_for, send_from_directory)
from flask_login import current_user, login_required
from . import corpora
from . import tasks
from .forms import (AddCorpusFileForm, AddCorpusForm, AddQueryResultForm,
EditCorpusFileForm, QueryDownloadForm, QueryForm,
DisplayOptionsForm, InspectDisplayOptionsForm,
ImportCorpusForm)
from jsonschema import validate
from .. import db
from ..models import Corpus, CorpusFile, QueryResult
import json
2020-04-06 14:12:22 +02:00
import os
import shutil
import glob
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from .import_corpus import check_zip_contents
2020-04-06 14:12:22 +02:00
@corpora.route('/add', methods=['GET', 'POST'])
@login_required
def add_corpus():
add_corpus_form = AddCorpusForm()
if add_corpus_form.validate_on_submit():
corpus = Corpus(creator=current_user,
description=add_corpus_form.description.data,
status='unprepared', title=add_corpus_form.title.data)
db.session.add(corpus)
db.session.commit()
dir = os.path.join(current_app.config['DATA_DIR'],
2020-04-06 14:12:22 +02:00
str(corpus.user_id), 'corpora', str(corpus.id))
try:
os.makedirs(dir)
except OSError:
2020-04-27 15:22:20 +02:00
flash('[ERROR]: Could not add corpus!', 'corpus')
2020-04-06 14:12:22 +02:00
corpus.delete()
else:
2020-04-27 15:22:20 +02:00
url = url_for('corpora.corpus', corpus_id=corpus.id)
flash('[<a href="{}">{}</a>] added'.format(url, corpus.title),
'corpus')
2020-04-06 14:12:22 +02:00
return redirect(url_for('corpora.corpus', corpus_id=corpus.id))
return render_template('corpora/add_corpus.html.j2',
add_corpus_form=add_corpus_form,
title='Add corpus')
@corpora.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
import_corpus_form = ImportCorpusForm()
if import_corpus_form.is_submitted():
if not import_corpus_form.validate():
return make_response(import_corpus_form.errors, 400)
corpus = Corpus(creator=current_user,
description=import_corpus_form.description.data,
status='unprepared',
title=import_corpus_form.title.data)
db.session.add(corpus)
db.session.commit()
dir = os.path.join(current_app.config['DATA_DIR'],
str(corpus.user_id), 'corpora', str(corpus.id))
try:
os.makedirs(dir)
except OSError:
flash('[ERROR]: Could not import corpus!', 'corpus')
corpus.delete()
else:
# Upload zip
archive_file = os.path.join(current_app.config['DATA_DIR'], dir,
import_corpus_form.file.data.filename)
corpus_dir = os.path.dirname(archive_file)
import_corpus_form.file.data.save(archive_file)
# Some checks to verify it is a valid exported corpus
with ZipFile(archive_file, 'r') as zip:
contents = zip.namelist()
if set(check_zip_contents).issubset(contents):
# Unzip
shutil.unpack_archive(archive_file, corpus_dir)
# Register vrt files to corpus
vrts = glob.glob(corpus_dir + '/*.vrt')
for file in vrts:
element_tree = ET.parse(file)
text_node = element_tree.find('text')
corpus_file = CorpusFile(
address=text_node.get('address', 'NULL'),
author=text_node.get('author', 'NULL'),
booktitle=text_node.get('booktitle', 'NULL'),
chapter=text_node.get('chapter', 'NULL'),
corpus=corpus,
dir=dir,
editor=text_node.get('editor', 'NULL'),
filename=os.path.basename(file),
institution=text_node.get('institution', 'NULL'),
journal=text_node.get('journal', 'NULL'),
pages=text_node.get('pages', 'NULL'),
publisher=text_node.get('publisher', 'NULL'),
publishing_year=text_node.get('publishing_year', ''),
school=text_node.get('school', 'NULL'),
title=text_node.get('title', 'NULL'))
db.session.add(corpus_file)
# finish import and got to imported corpus
url = url_for('corpora.corpus', corpus_id=corpus.id)
corpus.status = 'prepared'
db.session.commit()
os.remove(archive_file)
flash('[<a href="{}">{}</a>] imported'.format(url,
corpus.title),
'corpus')
return make_response(
{'redirect_url': url_for('corpora.corpus',
corpus_id=corpus.id)},
201)
else:
# If imported zip is not valid delete corpus and give feedback
corpus.delete()
db.session.commit()
flash('Imported corpus is not valid.', 'error')
return make_response(
{'redirect_url': url_for('corpora.import_corpus')},
201)
return render_template('corpora/import_corpus.html.j2',
import_corpus_form=import_corpus_form,
title='Import Corpus')
2020-04-06 14:12:22 +02:00
@corpora.route('/<int:corpus_id>')
@login_required
def corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
corpus_files = [dict(filename=corpus_file.filename,
author=corpus_file.author,
title=corpus_file.title,
publishing_year=corpus_file.publishing_year,
corpus_id=corpus.id,
2020-10-28 16:13:41 +01:00
id=corpus_file.id)
for corpus_file in corpus.files]
return render_template('corpora/corpus.html.j2',
corpus=corpus,
corpus_files=corpus_files,
2020-04-06 14:12:22 +02:00
title='Corpus')
@corpora.route('/<int:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
dir = os.path.dirname(corpus.archive_file)
filename = os.path.basename(corpus.archive_file)
return send_from_directory(directory=dir,
filename=filename,
mimetype='zip',
as_attachment=True)
2020-04-06 14:27:14 +02:00
@corpora.route('/<int:corpus_id>/analyse')
@login_required
2020-04-06 14:27:14 +02:00
def analyse_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if corpus.status == 'prepared':
corpus.status = 'start analysis'
db.session.commit()
2020-04-06 14:09:41 +02:00
display_options_form = DisplayOptionsForm(
prefix='display-options-form',
result_context=request.args.get('context', 20),
results_per_page=request.args.get('results_per_page', 30))
2020-04-06 14:09:41 +02:00
query_form = QueryForm(prefix='query-form',
2020-04-06 14:12:22 +02:00
query=request.args.get('query'))
2020-04-20 13:48:40 +02:00
query_download_form = QueryDownloadForm(prefix='query-download-form')
inspect_display_options_form = InspectDisplayOptionsForm(
prefix='inspect-display-options-form')
return render_template(
'corpora/analyse_corpus.html.j2',
2020-10-29 14:45:55 +01:00
corpus=corpus,
corpus_id=corpus_id,
display_options_form=display_options_form,
query_form=query_form,
query_download_form=query_download_form,
inspect_display_options_form=inspect_display_options_form,
title='Corpus analysis')
2020-04-06 14:12:22 +02:00
@corpora.route('/<int:corpus_id>/delete')
@login_required
def delete_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
tasks.delete_corpus(corpus_id)
2020-04-27 15:22:20 +02:00
flash('Corpus deleted!', 'corpus')
2020-04-06 14:12:22 +02:00
return redirect(url_for('main.dashboard'))
@corpora.route('/<int:corpus_id>/files/add', methods=['GET', 'POST'])
@login_required
def add_corpus_file(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
add_corpus_file_form = AddCorpusFileForm(corpus,
prefix='add-corpus-file-form')
if add_corpus_file_form.is_submitted():
if not add_corpus_file_form.validate():
return make_response(add_corpus_file_form.errors, 400)
2020-04-06 14:12:22 +02:00
# Save the file
dir = os.path.join(str(corpus.user_id), 'corpora', str(corpus.id))
add_corpus_file_form.file.data.save(
os.path.join(current_app.config['DATA_DIR'], dir,
add_corpus_file_form.file.data.filename))
corpus_file = CorpusFile(
address=add_corpus_file_form.address.data,
author=add_corpus_file_form.author.data,
booktitle=add_corpus_file_form.booktitle.data,
chapter=add_corpus_file_form.chapter.data,
corpus=corpus,
dir=dir,
editor=add_corpus_file_form.editor.data,
filename=add_corpus_file_form.file.data.filename,
institution=add_corpus_file_form.institution.data,
journal=add_corpus_file_form.journal.data,
pages=add_corpus_file_form.pages.data,
publisher=add_corpus_file_form.publisher.data,
publishing_year=add_corpus_file_form.publishing_year.data,
school=add_corpus_file_form.school.data,
title=add_corpus_file_form.title.data)
2020-04-06 14:12:22 +02:00
db.session.add(corpus_file)
corpus.status = 'unprepared'
2020-04-06 14:12:22 +02:00
db.session.commit()
2020-04-27 15:22:20 +02:00
flash('Corpus file added!', 'corpus')
return make_response(
{'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)},
201)
2020-04-06 14:12:22 +02:00
return render_template('corpora/add_corpus_file.html.j2',
2020-04-17 11:04:09 +02:00
corpus=corpus,
2020-04-06 14:12:22 +02:00
add_corpus_file_form=add_corpus_file_form,
2020-04-17 11:04:09 +02:00
title='Add corpus file')
2020-04-06 14:12:22 +02:00
@corpora.route('/<int:corpus_id>/files/<int:corpus_file_id>/delete')
@login_required
def delete_corpus_file(corpus_id, corpus_file_id):
corpus_file = CorpusFile.query.get_or_404(corpus_file_id)
if not corpus_file.corpus_id == corpus_id:
abort(404)
if not (corpus_file.corpus.creator == current_user
or current_user.is_administrator()):
abort(403)
tasks.delete_corpus_file(corpus_file_id)
2020-04-27 15:22:20 +02:00
flash('Corpus file deleted!', 'corpus')
2020-04-06 14:12:22 +02:00
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
@corpora.route('/<int:corpus_id>/files/<int:corpus_file_id>/download')
@login_required
def download_corpus_file(corpus_id, corpus_file_id):
corpus_file = CorpusFile.query.get_or_404(corpus_file_id)
if not corpus_file.corpus_id == corpus_id:
abort(404)
if not (corpus_file.corpus.creator == current_user
or current_user.is_administrator()):
abort(403)
dir = os.path.join(current_app.config['DATA_DIR'],
2020-04-06 14:12:22 +02:00
corpus_file.dir)
return send_from_directory(as_attachment=True, directory=dir,
filename=corpus_file.filename)
2020-07-30 14:17:51 +02:00
@corpora.route('/<int:corpus_id>/files/<int:corpus_file_id>',
2020-04-06 14:12:22 +02:00
methods=['GET', 'POST'])
@login_required
2020-07-30 14:17:51 +02:00
def corpus_file(corpus_id, corpus_file_id):
2020-04-17 11:04:09 +02:00
corpus = Corpus.query.get_or_404(corpus_id)
2020-04-06 14:12:22 +02:00
corpus_file = CorpusFile.query.get_or_404(corpus_file_id)
if not corpus_file.corpus_id == corpus_id:
abort(404)
if not (corpus_file.corpus.creator == current_user
or current_user.is_administrator()):
abort(403)
2020-04-17 11:04:09 +02:00
edit_corpus_file_form = EditCorpusFileForm(prefix='edit-corpus-file-form')
2020-04-06 14:12:22 +02:00
if edit_corpus_file_form.validate_on_submit():
2020-04-17 11:04:09 +02:00
corpus_file.address = edit_corpus_file_form.address.data
corpus_file.author = edit_corpus_file_form.author.data
corpus_file.booktitle = edit_corpus_file_form.booktitle.data
corpus_file.chapter = edit_corpus_file_form.chapter.data
corpus_file.editor = edit_corpus_file_form.editor.data
corpus_file.institution = edit_corpus_file_form.institution.data
corpus_file.journal = edit_corpus_file_form.journal.data
corpus_file.pages = edit_corpus_file_form.pages.data
corpus_file.publisher = edit_corpus_file_form.publisher.data
corpus_file.publishing_year = \
edit_corpus_file_form.publishing_year.data
corpus_file.school = edit_corpus_file_form.school.data
corpus_file.title = edit_corpus_file_form.title.data
corpus.status = 'unprepared'
2020-04-06 14:12:22 +02:00
db.session.commit()
2020-04-27 15:22:20 +02:00
flash('Corpus file edited!', 'corpus')
2020-04-06 14:12:22 +02:00
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
2020-04-17 11:04:09 +02:00
# If no form is submitted or valid, fill out fields with current values
edit_corpus_file_form.address.data = corpus_file.address
edit_corpus_file_form.author.data = corpus_file.author
edit_corpus_file_form.booktitle.data = corpus_file.booktitle
edit_corpus_file_form.chapter.data = corpus_file.chapter
edit_corpus_file_form.editor.data = corpus_file.editor
edit_corpus_file_form.institution.data = corpus_file.institution
edit_corpus_file_form.journal.data = corpus_file.journal
edit_corpus_file_form.pages.data = corpus_file.pages
edit_corpus_file_form.publisher.data = corpus_file.publisher
edit_corpus_file_form.publishing_year.data = corpus_file.publishing_year
edit_corpus_file_form.school.data = corpus_file.school
edit_corpus_file_form.title.data = corpus_file.title
2020-07-30 14:17:51 +02:00
return render_template('corpora/corpus_file.html.j2',
2020-04-17 11:04:09 +02:00
corpus_file=corpus_file, corpus=corpus,
2020-04-06 14:12:22 +02:00
edit_corpus_file_form=edit_corpus_file_form,
2020-04-17 11:04:09 +02:00
title='Edit corpus file')
2020-04-06 14:12:22 +02:00
@corpora.route('/<int:corpus_id>/prepare')
@login_required
def prepare_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
2020-04-17 11:04:09 +02:00
if corpus.files.all():
tasks.build_corpus(corpus_id)
flash('Building Corpus...', 'corpus')
2020-04-06 14:12:22 +02:00
else:
2020-04-27 15:22:20 +02:00
flash('Can not build corpus, please add corpus file(s).', 'corpus')
2020-04-06 14:12:22 +02:00
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
# Following are view functions to add, view etc. exported results.
@corpora.route('/result/add', methods=['GET', 'POST'])
@login_required
def add_query_result():
'''
View to import a result as a json file.
'''
add_query_result_form = AddQueryResultForm(prefix='add-query-result-form')
if add_query_result_form.is_submitted():
if not add_query_result_form.validate():
return make_response(add_query_result_form.errors, 400)
query_result = QueryResult(
creator=current_user,
description=add_query_result_form.description.data,
filename=add_query_result_form.file.data.filename,
title=add_query_result_form.title.data
)
db.session.add(query_result)
db.session.commit()
# create paths to save the uploaded json file
2020-10-08 15:34:28 +02:00
query_result_dir = os.path.join(current_app.config['DATA_DIR'],
str(current_user.id),
'query_results',
str(query_result.id))
try:
os.makedirs(query_result_dir)
except Exception:
db.session.delete(query_result)
db.session.commit()
flash('Internal Server Error', 'error')
redirect_url = url_for('corpora.add_query_result')
return make_response({'redirect_url': redirect_url}, 500)
# save the uploaded file
query_result_file_path = os.path.join(query_result_dir,
query_result.filename)
add_query_result_form.file.data.save(query_result_file_path)
# parse json from file
with open(query_result_file_path, 'r') as file:
query_result_file_content = json.load(file)
# parse json schema
with open('app/static/json_schema/nopaque_cqi_py_results_schema.json', 'r') as file: # noqa
schema = json.load(file)
try:
# validate imported json file
validate(instance=query_result_file_content, schema=schema)
except Exception:
tasks.delete_query_result(query_result.id)
flash('Uploaded file is invalid', 'result')
redirect_url = url_for('corpora.add_query_result')
return make_response({'redirect_url': redirect_url}, 201)
query_result_file_content.pop('matches')
query_result_file_content.pop('cpos_lookup')
query_result.query_metadata = query_result_file_content
db.session.commit()
flash('Query result added!', 'result')
redirect_url = url_for('corpora.query_result',
query_result_id=query_result.id)
return make_response({'redirect_url': redirect_url}, 201)
return render_template('corpora/query_results/add_query_result.html.j2',
add_query_result_form=add_query_result_form,
title='Add query result')
@corpora.route('/result/<int:query_result_id>')
@login_required
def query_result(query_result_id):
query_result = QueryResult.query.get_or_404(query_result_id)
if not (query_result.creator == current_user
or current_user.is_administrator()):
abort(403)
return render_template('corpora/query_results/query_result.html.j2',
query_result=query_result,
title='Query result')
@corpora.route('/result/<int:query_result_id>/inspect')
@login_required
def inspect_query_result(query_result_id):
'''
View to inspect imported result file in a corpus analysis like interface
'''
query_result = QueryResult.query.get_or_404(query_result_id)
query_metadata = query_result.query_metadata
if not (query_result.creator == current_user
or current_user.is_administrator()):
abort(403)
display_options_form = DisplayOptionsForm(
prefix='display-options-form',
results_per_page=request.args.get('results_per_page', 30),
result_context=request.args.get('context', 20)
)
inspect_display_options_form = InspectDisplayOptionsForm(
prefix='inspect-display-options-form'
)
query_result_file_path = os.path.join(
2020-10-08 15:34:28 +02:00
current_app.config['DATA_DIR'],
str(current_user.id),
'query_results',
str(query_result.id),
query_result.filename
)
with open(query_result_file_path, 'r') as query_result_file:
query_result_file_content = json.load(query_result_file)
return render_template('corpora/query_results/inspect.html.j2',
display_options_form=display_options_form,
2020-10-12 11:29:33 +02:00
inspect_display_options_form=inspect_display_options_form, # noqa
query_result_file_content=query_result_file_content,
query_metadata=query_metadata,
title='Inspect query result')
@corpora.route('/result/<int:query_result_id>/delete')
@login_required
def delete_query_result(query_result_id):
query_result = QueryResult.query.get_or_404(query_result_id)
if not (query_result.creator == current_user
or current_user.is_administrator()):
abort(403)
tasks.delete_query_result(query_result_id)
flash('Query result deleted!', 'result')
return redirect(url_for('services.service', service="corpus_analysis"))
@corpora.route('/result/<int:query_result_id>/download')
@login_required
def download_query_result(query_result_id):
query_result = QueryResult.query.get_or_404(query_result_id)
if not (query_result.creator == current_user
or current_user.is_administrator()):
abort(403)
2020-10-08 15:34:28 +02:00
query_result_dir = os.path.join(current_app.config['DATA_DIR'],
str(current_user.id),
'query_results',
str(query_result.id))
return send_from_directory(as_attachment=True,
directory=query_result_dir,
filename=query_result.filename)