Process corpus files in task, not in database model

This commit is contained in:
Patrick Jentsch 2020-04-23 07:56:23 +02:00
parent e882af8888
commit edc0b34032
4 changed files with 63 additions and 49 deletions

View File

@ -1,9 +1,52 @@
from .. import db
from ..decorators import background from ..decorators import background
from ..models import Corpus, CorpusFile from ..models import Corpus, CorpusFile
import xml.etree.ElementTree as ET
import os import os
import shutil import shutil
def _set_text_metadata(text_node, corpus_file):
    """Copy the bibliographic fields of *corpus_file* onto *text_node*.

    Optional fields that are empty are written as the literal string
    "NULL"; author, title and publishing_year are written as they are
    (publishing_year converted with str()).
    """
    text_node.set('address', corpus_file.address or "NULL")
    text_node.set('author', corpus_file.author)
    text_node.set('booktitle', corpus_file.booktitle or "NULL")
    text_node.set('chapter', corpus_file.chapter or "NULL")
    text_node.set('editor', corpus_file.editor or "NULL")
    text_node.set('institution', corpus_file.institution or "NULL")
    text_node.set('journal', corpus_file.journal or "NULL")
    text_node.set('pages', corpus_file.pages or "NULL")
    text_node.set('publisher', corpus_file.publisher or "NULL")
    text_node.set('publishing_year', str(corpus_file.publishing_year))
    text_node.set('school', corpus_file.school or "NULL")
    text_node.set('title', corpus_file.title)


def _merge_corpus_files(app, corpus):
    """Annotate every file of *corpus* and merge them into corpus.vrt.

    Each corpus file is rewritten in place with its metadata attributes,
    and its <text> element is collected under a single <corpus> root
    that is written to '<corpus dir>/merged/corpus.vrt'.

    Raises:
        ValueError: if a corpus file contains no <text> element.
    """
    corpus_dir = os.path.join(app.config['NOPAQUE_STORAGE'],
                              str(corpus.user_id), 'corpora',
                              str(corpus.id))
    output_dir = os.path.join(corpus_dir, 'merged')
    # Start from a clean output directory on every build.
    shutil.rmtree(output_dir, ignore_errors=True)
    os.mkdir(output_dir)
    master_root = ET.fromstring('<corpus>\n</corpus>')
    for corpus_file in corpus.files:
        path = os.path.join(corpus_dir, corpus_file.filename)
        element_tree = ET.parse(path)
        text_node = element_tree.find('text')
        if text_node is None:
            # Fail with a message that names the file instead of an
            # AttributeError on None further down.
            raise ValueError(
                'No <text> element found in corpus file {}'.format(path))
        _set_text_metadata(text_node, corpus_file)
        element_tree.write(path)
        # append() keeps the texts in the order of corpus.files;
        # insert(1, ...) would reverse everything after the first file.
        master_root.append(text_node)
    output_file = os.path.join(output_dir, 'corpus.vrt')
    ET.ElementTree(master_root).write(output_file, xml_declaration=True,
                                      encoding='utf-8')


@background
def build_corpus(app, corpus_id):
    """Merge all files of a corpus into one file and mark it submitted.

    Does nothing if no corpus with *corpus_id* exists. While the files
    are processed the corpus status is 'File processing'; on success it
    becomes 'submitted'. If processing fails, the status is set back to
    'unprepared' so the corpus does not stay in 'File processing', and
    the exception is re-raised.

    Args:
        app: the Flask application whose context and config are used.
        corpus_id: primary key of the corpus to build.
    """
    with app.app_context():
        corpus = Corpus.query.get(corpus_id)
        if corpus is None:
            return
        corpus.status = 'File processing'
        db.session.commit()
        try:
            _merge_corpus_files(app, corpus)
        except Exception:
            # Discard any half-finished session state before recording
            # that the corpus still needs to be built.
            db.session.rollback()
            corpus.status = 'unprepared'
            db.session.commit()
            raise
        corpus.status = 'submitted'
        db.session.commit()
@background @background
def delete_corpus(app, corpus_id): def delete_corpus(app, corpus_id):
with app.app_context(): with app.app_context():
@ -30,12 +73,3 @@ def delete_corpus_file(app, corpus_file_id):
pass pass
else: else:
corpus_file.delete() corpus_file.delete()
@background
def edit_corpus_file(app, corpus_file_id):
    """Call insert_metadata() on the corpus file with the given id.

    Runs inside the application context of *app*.

    Raises:
        Exception: if no corpus file with *corpus_file_id* exists.
    """
    with app.app_context():
        target = CorpusFile.query.get(corpus_file_id)
        if target is not None:
            target.insert_metadata()
        else:
            raise Exception('Corpus file {} not found!'.format(corpus_file_id))

View File

@ -60,14 +60,16 @@ def analyse_corpus(corpus_id):
query_form = QueryForm(prefix='query-form', query_form = QueryForm(prefix='query-form',
query=request.args.get('query')) query=request.args.get('query'))
query_download_form = QueryDownloadForm(prefix='query-download-form') query_download_form = QueryDownloadForm(prefix='query-download-form')
inspect_display_options_form = InspectDisplayOptionsForm(prefix='inspect-display-options-form') inspect_display_options_form = InspectDisplayOptionsForm(
return render_template('corpora/analyse_corpus.html.j2', prefix='inspect-display-options-form')
corpus_id=corpus_id, return render_template(
display_options_form=display_options_form, 'corpora/analyse_corpus.html.j2',
query_form=query_form, corpus_id=corpus_id,
query_download_form=query_download_form, display_options_form=display_options_form,
inspect_display_options_form=inspect_display_options_form, query_form=query_form,
title='Corpus analysis') query_download_form=query_download_form,
inspect_display_options_form=inspect_display_options_form,
title='Corpus analysis')
@corpora.route('/<int:corpus_id>/delete') @corpora.route('/<int:corpus_id>/delete')
@ -114,8 +116,8 @@ def add_corpus_file(corpus_id):
school=add_corpus_file_form.school.data, school=add_corpus_file_form.school.data,
title=add_corpus_file_form.title.data) title=add_corpus_file_form.title.data)
db.session.add(corpus_file) db.session.add(corpus_file)
corpus.status = 'unprepared'
db.session.commit() db.session.commit()
tasks.edit_corpus_file(corpus_file.id)
flash('Corpus file added!') flash('Corpus file added!')
return make_response( return make_response(
{'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)}, {'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)},
@ -181,8 +183,8 @@ def edit_corpus_file(corpus_id, corpus_file_id):
edit_corpus_file_form.publishing_year.data edit_corpus_file_form.publishing_year.data
corpus_file.school = edit_corpus_file_form.school.data corpus_file.school = edit_corpus_file_form.school.data
corpus_file.title = edit_corpus_file_form.title.data corpus_file.title = edit_corpus_file_form.title.data
corpus.status = 'unprepared'
db.session.commit() db.session.commit()
tasks.edit_corpus_file(corpus_file_id)
flash('Corpus file edited!') flash('Corpus file edited!')
return redirect(url_for('corpora.corpus', corpus_id=corpus_id)) return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
# If no form is submitted or valid, fill out fields with current values # If no form is submitted or valid, fill out fields with current values
@ -211,9 +213,8 @@ def prepare_corpus(corpus_id):
if not (corpus.creator == current_user or current_user.is_administrator()): if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403) abort(403)
if corpus.files.all(): if corpus.files.all():
corpus.status = 'submitted' tasks.build_corpus(corpus_id)
db.session.commit() flash('Corpus gets build now.')
flash('Corpus marked for preparation!')
else: else:
flash('Can not prepare corpus, please add corpus file(s).') flash('Can not build corpus, please add corpus file(s).')
return redirect(url_for('corpora.corpus', corpus_id=corpus_id)) return redirect(url_for('corpora.corpus', corpus_id=corpus_id))

View File

@ -7,7 +7,6 @@ from werkzeug.utils import secure_filename
from . import db, logger, login_manager from . import db, logger, login_manager
import os import os
import shutil import shutil
import xml.etree.ElementTree as ET
class Permission: class Permission:
@ -380,28 +379,6 @@ class CorpusFile(db.Model):
db.session.delete(self) db.session.delete(self)
db.session.commit() db.session.commit()
def insert_metadata(self):
    """Write this file's metadata as attributes of its <text> element.

    The file is parsed, the attributes are set on the element returned
    by find('text'), and the file is written back in place. Empty
    optional fields are stored as the literal string "NULL". Afterwards
    the owning corpus is marked 'unprepared' and the session committed.
    """
    path = os.path.join(current_app.config['NOPAQUE_STORAGE'],
                        self.dir, self.filename)
    tree = ET.parse(path)
    text_node = tree.find('text')
    # Same attribute order as before; only the optional fields get the
    # "NULL" fallback.
    attributes = (
        ('address', self.address or "NULL"),
        ('author', self.author),
        ('booktitle', self.booktitle or "NULL"),
        ('chapter', self.chapter or "NULL"),
        ('editor', self.editor or "NULL"),
        ('institution', self.institution or "NULL"),
        ('journal', self.journal or "NULL"),
        ('pages', self.pages or "NULL"),
        ('publisher', self.publisher or "NULL"),
        ('publishing_year', str(self.publishing_year)),
        ('school', self.school or "NULL"),
        ('title', self.title),
    )
    for name, value in attributes:
        text_node.set(name, value)
    tree.write(path)
    self.corpus.status = 'unprepared'
    db.session.commit()
def to_dict(self): def to_dict(self):
return {'id': self.id, return {'id': self.id,
'address': self.address, 'address': self.address,
@ -447,9 +424,6 @@ class Corpus(db.Model):
'title': self.title, 'title': self.title,
'user_id': self.user_id} 'user_id': self.user_id}
def build(self):
    """Placeholder that performs no work and returns None."""
def delete(self): def delete(self):
for corpus_file in self.files: for corpus_file in self.files:
db.session.delete(corpus_file) db.session.delete(corpus_file)

View File

@ -5,6 +5,9 @@ networks:
external: external:
name: reverse-proxy name: reverse-proxy
volumes:
redis-trash1:
services: services:
web: web:
depends_on: depends_on:
@ -52,3 +55,5 @@ services:
- "/srv/nopaque/db:/var/lib/postgresql/data" - "/srv/nopaque/db:/var/lib/postgresql/data"
redis: redis:
image: redis:5 image: redis:5
volumes:
- "redis-trash1:/data"