From edc0b34032fbf9c07fc4ea7f079719c334f06a22 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Thu, 23 Apr 2020 07:56:23 +0200 Subject: [PATCH] Process corpus files in task, not in database model --- app/corpora/tasks.py | 52 ++++++++++++++++++++++++++++++++++++-------- app/corpora/views.py | 29 ++++++++++++------------ app/models.py | 26 ---------------------- docker-compose.yml | 5 +++++ 4 files changed, 63 insertions(+), 49 deletions(-) diff --git a/app/corpora/tasks.py b/app/corpora/tasks.py index 4bd68ebf..480cb7aa 100644 --- a/app/corpora/tasks.py +++ b/app/corpora/tasks.py @@ -1,9 +1,52 @@ +from .. import db from ..decorators import background from ..models import Corpus, CorpusFile +import xml.etree.ElementTree as ET import os import shutil +@background +def build_corpus(app, corpus_id): + with app.app_context(): + corpus = Corpus.query.get(corpus_id) + if corpus is None: + return + corpus.status = 'File processing' + db.session.commit() + corpus_dir = os.path.join(app.config['NOPAQUE_STORAGE'], + str(corpus.user_id), 'corpora', + str(corpus.id)) + output_dir = os.path.join(corpus_dir, 'merged') + shutil.rmtree(output_dir, ignore_errors=True) + os.mkdir(output_dir) + master_element_tree = ET.ElementTree( + ET.fromstring('\n')) + for corpus_file in corpus.files: + file = os.path.join(corpus_dir, corpus_file.filename) + element_tree = ET.parse(file) + text_node = element_tree.find('text') + text_node.set('address', corpus_file.address or "NULL") + text_node.set('author', corpus_file.author) + text_node.set('booktitle', corpus_file.booktitle or "NULL") + text_node.set('chapter', corpus_file.chapter or "NULL") + text_node.set('editor', corpus_file.editor or "NULL") + text_node.set('institution', corpus_file.institution or "NULL") + text_node.set('journal', corpus_file.journal or "NULL") + text_node.set('pages', corpus_file.pages or "NULL") + text_node.set('publisher', corpus_file.publisher or "NULL") + text_node.set('publishing_year', str(corpus_file.publishing_year)) + text_node.set('school', corpus_file.school or "NULL") + text_node.set('title', corpus_file.title) + element_tree.write(file) + master_element_tree.getroot().insert(1, text_node) + output_file = os.path.join(output_dir, 'corpus.vrt') + master_element_tree.write(output_file, xml_declaration=True, + encoding='utf-8') + corpus.status = 'submitted' + db.session.commit() + + @background def delete_corpus(app, corpus_id): with app.app_context(): @@ -30,12 +73,3 @@ def delete_corpus_file(app, corpus_file_id): pass else: corpus_file.delete() - - -@background -def edit_corpus_file(app, corpus_file_id): - with app.app_context(): - corpus_file = CorpusFile.query.get(corpus_file_id) - if corpus_file is None: - raise Exception('Corpus file {} not found!'.format(corpus_file_id)) - corpus_file.insert_metadata() diff --git a/app/corpora/views.py b/app/corpora/views.py index 8f4053ab..1b98a300 100644 --- a/app/corpora/views.py +++ b/app/corpora/views.py @@ -60,14 +60,16 @@ def analyse_corpus(corpus_id): query_form = QueryForm(prefix='query-form', query=request.args.get('query')) query_download_form = QueryDownloadForm(prefix='query-download-form') - inspect_display_options_form = InspectDisplayOptionsForm(prefix='inspect-display-options-form') - return render_template('corpora/analyse_corpus.html.j2', - corpus_id=corpus_id, - display_options_form=display_options_form, - query_form=query_form, - query_download_form=query_download_form, - inspect_display_options_form=inspect_display_options_form, - title='Corpus analysis') + inspect_display_options_form = InspectDisplayOptionsForm( + prefix='inspect-display-options-form') + return render_template( + 'corpora/analyse_corpus.html.j2', + corpus_id=corpus_id, + display_options_form=display_options_form, + query_form=query_form, + query_download_form=query_download_form, + inspect_display_options_form=inspect_display_options_form, + title='Corpus analysis') @corpora.route('//delete') @@ -114,8 +116,8 @@ def add_corpus_file(corpus_id): school=add_corpus_file_form.school.data, title=add_corpus_file_form.title.data) db.session.add(corpus_file) + corpus.status = 'unprepared' db.session.commit() - tasks.edit_corpus_file(corpus_file.id) flash('Corpus file added!') return make_response( {'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)}, @@ -181,8 +183,8 @@ def edit_corpus_file(corpus_id, corpus_file_id): edit_corpus_file_form.publishing_year.data corpus_file.school = edit_corpus_file_form.school.data corpus_file.title = edit_corpus_file_form.title.data + corpus.status = 'unprepared' db.session.commit() - tasks.edit_corpus_file(corpus_file_id) flash('Corpus file edited!') return redirect(url_for('corpora.corpus', corpus_id=corpus_id)) # If no form is submitted or valid, fill out fields with current values @@ -211,9 +213,8 @@ def prepare_corpus(corpus_id): if not (corpus.creator == current_user or current_user.is_administrator()): abort(403) if corpus.files.all(): - corpus.status = 'submitted' - db.session.commit() - flash('Corpus marked for preparation!') + tasks.build_corpus(corpus_id) + flash('Corpus gets build now.') else: - flash('Can not prepare corpus, please add corpus file(s).') + flash('Can not build corpus, please add corpus file(s).') return redirect(url_for('corpora.corpus', corpus_id=corpus_id)) diff --git a/app/models.py b/app/models.py index 773883e9..0df3ded0 100644 --- a/app/models.py +++ b/app/models.py @@ -7,7 +7,6 @@ from werkzeug.utils import secure_filename from . import db, logger, login_manager import os import shutil -import xml.etree.ElementTree as ET class Permission: @@ -380,28 +379,6 @@ class CorpusFile(db.Model): db.session.delete(self) db.session.commit() - def insert_metadata(self): - file = os.path.join(current_app.config['NOPAQUE_STORAGE'], - self.dir, self.filename) - element_tree = ET.parse(file) - text_node = element_tree.find('text') - # TODO: USE OR - text_node.set('address', self.address if self.address else "NULL") - text_node.set('author', self.author) - text_node.set('booktitle', self.booktitle if self.booktitle else "NULL") - text_node.set('chapter', self.chapter if self.chapter else "NULL") - text_node.set('editor', self.editor if self.editor else "NULL") - text_node.set('institution', self.institution if self.institution else "NULL") - text_node.set('journal', self.journal if self.journal else "NULL") - text_node.set('pages', self.pages if self.pages else "NULL") - text_node.set('publisher', self.publisher if self.publisher else "NULL") - text_node.set('publishing_year', str(self.publishing_year)) - text_node.set('school', self.school if self.school else "NULL") - text_node.set('title', self.title) - element_tree.write(file) - self.corpus.status = 'unprepared' - db.session.commit() - def to_dict(self): return {'id': self.id, 'address': self.address, @@ -447,9 +424,6 @@ class Corpus(db.Model): 'title': self.title, 'user_id': self.user_id} - def build(self): - pass - def delete(self): for corpus_file in self.files: db.session.delete(corpus_file) diff --git a/docker-compose.yml b/docker-compose.yml index a4f33c9c..97c9cc53 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,6 +5,9 @@ networks: external: name: reverse-proxy +volumes: + redis-trash1: + services: web: depends_on: @@ -52,3 +55,5 @@ services: - "/srv/nopaque/db:/var/lib/postgresql/data" redis: image: redis:5 + volumes: + - "redis-trash1:/data"