nopaque/app/corpora/routes.py

415 lines
18 KiB
Python
Raw Normal View History

from flask import (abort, flash, make_response, redirect, request,
2020-04-06 14:12:22 +02:00
render_template, url_for, send_from_directory)
from flask_login import current_user, login_required
from . import bp
from . import tasks
from .forms import (AddCorpusFileForm, AddCorpusForm, AddQueryResultForm,
EditCorpusFileForm, QueryDownloadForm, QueryForm,
DisplayOptionsForm, InspectDisplayOptionsForm,
ImportCorpusForm)
from jsonschema import validate
from .. import db
from ..models import Corpus, CorpusFile, QueryResult
import json
import logging
2020-04-06 14:12:22 +02:00
import os
import shutil
import glob
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from .import_corpus import check_zip_contents
2020-04-06 14:12:22 +02:00
@bp.route('/add', methods=['GET', 'POST'])
2020-04-06 14:12:22 +02:00
@login_required
def add_corpus():
form = AddCorpusForm()
if form.validate_on_submit():
2020-04-06 14:12:22 +02:00
corpus = Corpus(creator=current_user,
description=form.description.data,
title=form.title.data)
2020-04-06 14:12:22 +02:00
db.session.add(corpus)
db.session.flush()
db.session.refresh(corpus)
2020-04-06 14:12:22 +02:00
try:
os.makedirs(corpus.path)
2020-04-06 14:12:22 +02:00
except OSError:
logging.error('Make dir {} led to an OSError!'.format(corpus.path))
db.session.rollback()
abort(500)
else:
db.session.commit()
flash('Corpus "{}" added!'.format(corpus.title), 'corpus')
return redirect(url_for('.corpus', corpus_id=corpus.id))
return render_template('corpora/add_corpus.html.j2', form=form,
2020-04-06 14:12:22 +02:00
title='Add corpus')
@bp.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
form = ImportCorpusForm()
if form.is_submitted():
if not form.validate():
return make_response(form.errors, 400)
corpus = Corpus(creator=current_user,
description=form.description.data,
title=form.title.data)
db.session.add(corpus)
db.session.flush()
db.session.refresh(corpus)
try:
os.makedirs(corpus.path)
except OSError:
logging.error('Make dir {} led to an OSError!'.format(corpus.path))
db.session.rollback()
flash('Internal Server Error', 'error')
return make_response(
{'redirect_url': url_for('.import_corpus')}, 500)
# Upload zip
archive_file = os.path.join(corpus.path, form.file.data.filename)
form.file.data.save(archive_file)
# Some checks to verify it is a valid exported corpus
with ZipFile(archive_file, 'r') as zip:
contents = zip.namelist()
if set(check_zip_contents).issubset(contents):
# Unzip
shutil.unpack_archive(archive_file, corpus.path)
# Register vrt files to corpus
vrts = glob.glob(corpus.path + '/*.vrt')
for file in vrts:
element_tree = ET.parse(file)
text_node = element_tree.find('text')
corpus_file = CorpusFile(
address=text_node.get('address', 'NULL'),
author=text_node.get('author', 'NULL'),
booktitle=text_node.get('booktitle', 'NULL'),
chapter=text_node.get('chapter', 'NULL'),
corpus=corpus,
editor=text_node.get('editor', 'NULL'),
filename=os.path.basename(file),
institution=text_node.get('institution', 'NULL'),
journal=text_node.get('journal', 'NULL'),
pages=text_node.get('pages', 'NULL'),
publisher=text_node.get('publisher', 'NULL'),
publishing_year=text_node.get('publishing_year', ''),
school=text_node.get('school', 'NULL'),
title=text_node.get('title', 'NULL')
)
db.session.add(corpus_file)
# finish import and redirect to imported corpus
corpus.status = 'prepared'
db.session.commit()
os.remove(archive_file)
flash('Corpus "{}" imported!'.format(corpus.title), 'corpus')
return make_response(
{'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
else:
# If imported zip is not valid delete corpus and give feedback
flash('Can not import corpus "{}" not imported: Invalid archive file!', 'error') # noqa
tasks.delete_corpus(corpus.id)
return make_response(
{'redirect_url': url_for('.import_corpus')}, 201)
return render_template('corpora/import_corpus.html.j2', form=form,
title='Import Corpus')
@bp.route('/<int:corpus_id>')
2020-04-06 14:12:22 +02:00
@login_required
def corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
corpus_files = [corpus_file.to_dict() for corpus_file in corpus.files]
return render_template('corpora/corpus.html.j2', corpus=corpus,
corpus_files=corpus_files, title='Corpus')
@bp.route('/<int:corpus_id>/download')
@login_required
2021-01-13 15:23:04 +01:00
def download_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
return send_from_directory(
as_attachment=True,
directory=os.path.join(corpus.creator.path, 'corpora'),
filename=corpus.archive_file,
mimetype='zip'
)
@bp.route('/<int:corpus_id>/analyse')
@login_required
2020-04-06 14:27:14 +02:00
def analyse_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
2020-04-06 14:09:41 +02:00
display_options_form = DisplayOptionsForm(
prefix='display-options-form',
result_context=request.args.get('context', 20),
results_per_page=request.args.get('results_per_page', 30)
)
2020-04-06 14:09:41 +02:00
query_form = QueryForm(prefix='query-form',
2020-04-06 14:12:22 +02:00
query=request.args.get('query'))
2020-04-20 13:48:40 +02:00
query_download_form = QueryDownloadForm(prefix='query-download-form')
inspect_display_options_form = InspectDisplayOptionsForm(
prefix='inspect-display-options-form')
return render_template(
'corpora/analyse_corpus.html.j2',
2020-10-29 14:45:55 +01:00
corpus=corpus,
display_options_form=display_options_form,
inspect_display_options_form=inspect_display_options_form,
query_form=query_form,
query_download_form=query_download_form,
title='Corpus analysis'
)
2020-04-06 14:12:22 +02:00
@bp.route('/<int:corpus_id>/delete')
2020-04-06 14:12:22 +02:00
@login_required
def delete_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
flash('Corpus "{}" marked for deletion!'.format(corpus.title), 'corpus')
tasks.delete_corpus(corpus_id)
2020-04-06 14:12:22 +02:00
return redirect(url_for('main.dashboard'))
@bp.route('/<int:corpus_id>/files/add', methods=['GET', 'POST'])
2020-04-06 14:12:22 +02:00
@login_required
def add_corpus_file(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
form = AddCorpusFileForm(corpus, prefix='add-corpus-file-form')
if form.is_submitted():
if not form.validate():
return make_response(form.errors, 400)
2020-04-06 14:12:22 +02:00
# Save the file
form.file.data.save(os.path.join(corpus.path, form.file.data.filename))
corpus_file = CorpusFile(address=form.address.data,
author=form.author.data,
booktitle=form.booktitle.data,
chapter=form.chapter.data,
corpus=corpus,
editor=form.editor.data,
filename=form.file.data.filename,
institution=form.institution.data,
journal=form.journal.data,
pages=form.pages.data,
publisher=form.publisher.data,
publishing_year=form.publishing_year.data,
school=form.school.data,
title=form.title.data)
2020-04-06 14:12:22 +02:00
db.session.add(corpus_file)
corpus.status = 'unprepared'
2020-04-06 14:12:22 +02:00
db.session.commit()
flash('Corpus file "{}" added!'.format(corpus_file.filename), 'corpus')
return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) # noqa
return render_template('corpora/add_corpus_file.html.j2', corpus=corpus,
form=form, title='Add corpus file')
2020-04-06 14:12:22 +02:00
@bp.route('/<int:corpus_id>/files/<int:corpus_file_id>/delete')
2020-04-06 14:12:22 +02:00
@login_required
def delete_corpus_file(corpus_id, corpus_file_id):
corpus_file = CorpusFile.query.get_or_404(corpus_file_id)
if not corpus_file.corpus_id == corpus_id:
abort(404)
if not (corpus_file.corpus.creator == current_user
or current_user.is_administrator()):
abort(403)
flash('Corpus file "{}" marked for deletion!'.format(corpus_file.filename), 'corpus') # noqa
tasks.delete_corpus_file(corpus_file_id)
return redirect(url_for('.corpus', corpus_id=corpus_id))
2020-04-06 14:12:22 +02:00
@bp.route('/<int:corpus_id>/files/<int:corpus_file_id>/download')
2020-04-06 14:12:22 +02:00
@login_required
def download_corpus_file(corpus_id, corpus_file_id):
corpus_file = CorpusFile.query.get_or_404(corpus_file_id)
if not corpus_file.corpus_id == corpus_id:
abort(404)
if not (corpus_file.corpus.creator == current_user
or current_user.is_administrator()):
abort(403)
return send_from_directory(as_attachment=True,
directory=os.path.dirname(corpus_file.path),
2020-04-06 14:12:22 +02:00
filename=corpus_file.filename)
@bp.route('/<int:corpus_id>/files/<int:corpus_file_id>', methods=['GET', 'POST'])
2020-04-06 14:12:22 +02:00
@login_required
2020-07-30 14:17:51 +02:00
def corpus_file(corpus_id, corpus_file_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
2020-04-06 14:12:22 +02:00
corpus_file = CorpusFile.query.get_or_404(corpus_file_id)
if corpus_file.corpus != corpus:
2020-04-06 14:12:22 +02:00
abort(404)
form = EditCorpusFileForm(prefix='edit-corpus-file-form')
if form.validate_on_submit():
corpus_file.address = form.address.data
corpus_file.author = form.author.data
corpus_file.booktitle = form.booktitle.data
corpus_file.chapter = form.chapter.data
corpus_file.editor = form.editor.data
corpus_file.institution = form.institution.data
corpus_file.journal = form.journal.data
corpus_file.pages = form.pages.data
corpus_file.publisher = form.publisher.data
corpus_file.publishing_year = form.publishing_year.data
corpus_file.school = form.school.data
corpus_file.title = form.title.data
corpus.status = 'unprepared'
2020-04-06 14:12:22 +02:00
db.session.commit()
flash('Corpus file "{}" edited!'.format(corpus_file.filename), 'corpus') # noqa
return redirect(url_for('.corpus', corpus_id=corpus_id))
2020-04-17 11:04:09 +02:00
# If no form is submitted or valid, fill out fields with current values
form.address.data = corpus_file.address
form.author.data = corpus_file.author
form.booktitle.data = corpus_file.booktitle
form.chapter.data = corpus_file.chapter
form.editor.data = corpus_file.editor
form.institution.data = corpus_file.institution
form.journal.data = corpus_file.journal
form.pages.data = corpus_file.pages
form.publisher.data = corpus_file.publisher
form.publishing_year.data = corpus_file.publishing_year
form.school.data = corpus_file.school
form.title.data = corpus_file.title
return render_template('corpora/corpus_file.html.j2', corpus=corpus,
corpus_file=corpus_file, form=form,
2020-04-17 11:04:09 +02:00
title='Edit corpus file')
2020-04-06 14:12:22 +02:00
@bp.route('/<int:corpus_id>/prepare')
2020-04-06 14:12:22 +02:00
@login_required
def prepare_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
2020-04-17 11:04:09 +02:00
if corpus.files.all():
tasks.build_corpus(corpus_id)
flash('Corpus "{}" has been marked to get build!'.format(corpus.title), 'corpus') # noqa
2020-04-06 14:12:22 +02:00
else:
flash('Can not build corpus "{}": No corpus file(s)!'.format(corpus.title), 'error') # noqa
return redirect(url_for('.corpus', corpus_id=corpus_id))
# Following are view functions to add, view etc. exported results.
@bp.route('/result/add', methods=['GET', 'POST'])
@login_required
def add_query_result():
'''
View to import a result as a json file.
'''
form = AddQueryResultForm(prefix='add-query-result-form')
if form.is_submitted():
if not form.validate():
return make_response(form.errors, 400)
query_result = QueryResult(creator=current_user,
description=form.description.data,
filename=form.file.data.filename,
title=form.title.data)
db.session.add(query_result)
2021-02-01 13:39:38 +01:00
db.session.flush()
db.session.refresh(query_result)
try:
2021-02-01 13:50:07 +01:00
os.makedirs(os.path.dirname(query_result.path))
except OSError:
logging.error('Make dir {} led to an OSError!'.format(query_result.path)) # noqa
2021-02-01 13:39:38 +01:00
db.session.rollback()
flash('Internal Server Error', 'error')
return make_response(
{'redirect_url': url_for('.add_query_result')}, 500)
# save the uploaded file
2021-02-01 13:50:07 +01:00
form.file.data.save(query_result.path)
# parse json from file
2021-02-01 13:50:07 +01:00
with open(query_result.path, 'r') as file:
query_result_file_content = json.load(file)
# parse json schema
# with open('app/static/json_schema/nopaque_cqi_py_results_schema.json', 'r') as file: # noqa
# schema = json.load(file)
# try:
# # validate imported json file
# validate(instance=query_result_file_content, schema=schema)
# except Exception:
# tasks.delete_query_result(query_result.id)
# flash('Uploaded file is invalid', 'result')
# return make_response(
# {'redirect_url': url_for('.add_query_result')}, 201)
query_result_file_content.pop('matches')
query_result_file_content.pop('cpos_lookup')
query_result.query_metadata = query_result_file_content
db.session.commit()
flash('Query result added!', 'result')
return make_response({'redirect_url': url_for('.query_result', query_result_id=query_result.id)}, 201) # noqa
return render_template('corpora/query_results/add_query_result.html.j2',
form=form, title='Add query result')
@bp.route('/result/<int:query_result_id>')
@login_required
def query_result(query_result_id):
query_result = QueryResult.query.get_or_404(query_result_id)
if not (query_result.creator == current_user
or current_user.is_administrator()):
abort(403)
return render_template('corpora/query_results/query_result.html.j2',
query_result=query_result, title='Query result')
@bp.route('/result/<int:query_result_id>/inspect')
@login_required
def inspect_query_result(query_result_id):
'''
View to inspect imported result file in a corpus analysis like interface
'''
query_result = QueryResult.query.get_or_404(query_result_id)
query_metadata = query_result.query_metadata
if not (query_result.creator == current_user
or current_user.is_administrator()):
abort(403)
display_options_form = DisplayOptionsForm(
prefix='display-options-form',
results_per_page=request.args.get('results_per_page', 30),
result_context=request.args.get('context', 20)
)
inspect_display_options_form = InspectDisplayOptionsForm(
prefix='inspect-display-options-form'
)
with open(query_result.path, 'r') as query_result_file:
query_result_file_content = json.load(query_result_file)
return render_template('corpora/query_results/inspect.html.j2',
query_result=query_result,
display_options_form=display_options_form,
2020-10-12 11:29:33 +02:00
inspect_display_options_form=inspect_display_options_form, # noqa
query_result_file_content=query_result_file_content,
query_metadata=query_metadata,
title='Inspect query result')
@bp.route('/result/<int:query_result_id>/delete')
@login_required
def delete_query_result(query_result_id):
query_result = QueryResult.query.get_or_404(query_result_id)
if not (query_result.creator == current_user
or current_user.is_administrator()):
abort(403)
flash('Query result "{}" has been marked for deletion!'.format(query_result), 'result') # noqa
tasks.delete_query_result(query_result_id)
return redirect(url_for('services.service', service="corpus_analysis"))
@bp.route('/result/<int:query_result_id>/download')
@login_required
def download_query_result(query_result_id):
query_result = QueryResult.query.get_or_404(query_result_id)
if not (query_result.creator == current_user
or current_user.is_administrator()):
abort(403)
return send_from_directory(as_attachment=True,
directory=os.path.dirname(query_result.path),
filename=query_result.filename)