diff --git a/app/corpora/tasks.py b/app/corpora/tasks.py
index 4bd68ebf..480cb7aa 100644
--- a/app/corpora/tasks.py
+++ b/app/corpora/tasks.py
@@ -1,9 +1,52 @@
+from .. import db
from ..decorators import background
from ..models import Corpus, CorpusFile
+import xml.etree.ElementTree as ET
import os
import shutil
+@background
+def build_corpus(app, corpus_id):
+    """Merge all files of a corpus into a single ``corpus.vrt`` file.
+
+    The corpus is looked up by ``corpus_id`` inside an application
+    context; if it does not exist the task returns without doing
+    anything. For every corpus file the ``text`` element is annotated
+    with the metadata stored in the database, the file is written back
+    in place, and the element is added to a merged tree that is written
+    to ``<corpus_dir>/merged/corpus.vrt``.
+
+    The corpus status is 'File processing' while the task runs and
+    'submitted' once the merged file has been written. If any step
+    fails, the status is reset to 'unprepared' (so the corpus does not
+    stay stuck in 'File processing') and the exception is re-raised.
+
+    :param app: Flask application providing the app context and config.
+    :param corpus_id: Primary key of the corpus to build.
+    :raises ValueError: If a corpus file contains no ``text`` element.
+    """
+    with app.app_context():
+        corpus = Corpus.query.get(corpus_id)
+        if corpus is None:
+            return
+        corpus.status = 'File processing'
+        db.session.commit()
+        try:
+            corpus_dir = os.path.join(app.config['NOPAQUE_STORAGE'],
+                                      str(corpus.user_id), 'corpora',
+                                      str(corpus.id))
+            output_dir = os.path.join(corpus_dir, 'merged')
+            # Start from a clean output directory on every build.
+            shutil.rmtree(output_dir, ignore_errors=True)
+            os.mkdir(output_dir)
+            # Build the root element directly: parsing a string that
+            # contains no root element raises ParseError.
+            # NOTE(review): the root tag name 'corpus' is an assumption,
+            # confirm it against the consumer of corpus.vrt.
+            root = ET.Element('corpus')
+            root.text = '\n'
+            master_element_tree = ET.ElementTree(root)
+            for corpus_file in corpus.files:
+                file_path = os.path.join(corpus_dir, corpus_file.filename)
+                element_tree = ET.parse(file_path)
+                text_node = element_tree.find('text')
+                if text_node is None:
+                    raise ValueError(
+                        'Corpus file {} contains no text element!'.format(
+                            corpus_file.filename))
+                # Optional fields fall back to the literal "NULL"; the
+                # order matches the order the attributes were set in
+                # before.
+                metadata = (
+                    ('address', corpus_file.address or "NULL"),
+                    ('author', corpus_file.author),
+                    ('booktitle', corpus_file.booktitle or "NULL"),
+                    ('chapter', corpus_file.chapter or "NULL"),
+                    ('editor', corpus_file.editor or "NULL"),
+                    ('institution', corpus_file.institution or "NULL"),
+                    ('journal', corpus_file.journal or "NULL"),
+                    ('pages', corpus_file.pages or "NULL"),
+                    ('publisher', corpus_file.publisher or "NULL"),
+                    ('publishing_year', str(corpus_file.publishing_year)),
+                    ('school', corpus_file.school or "NULL"),
+                    ('title', corpus_file.title),
+                )
+                for name, value in metadata:
+                    text_node.set(name, value)
+                element_tree.write(file_path)
+                # append() keeps the files in iteration order; inserting
+                # at a fixed index 1 placed every later file directly
+                # after the first one.
+                root.append(text_node)
+            output_file = os.path.join(output_dir, 'corpus.vrt')
+            master_element_tree.write(output_file, xml_declaration=True,
+                                      encoding='utf-8')
+        except Exception:
+            # Top-level boundary of the background task: make the corpus
+            # buildable again, then let the error surface.
+            db.session.rollback()
+            corpus.status = 'unprepared'
+            db.session.commit()
+            raise
+        corpus.status = 'submitted'
+        db.session.commit()
+
+
@background
def delete_corpus(app, corpus_id):
with app.app_context():
@@ -30,12 +73,3 @@ def delete_corpus_file(app, corpus_file_id):
pass
else:
corpus_file.delete()
-
-
-@background
-def edit_corpus_file(app, corpus_file_id):
- with app.app_context():
- corpus_file = CorpusFile.query.get(corpus_file_id)
- if corpus_file is None:
- raise Exception('Corpus file {} not found!'.format(corpus_file_id))
- corpus_file.insert_metadata()
diff --git a/app/corpora/views.py b/app/corpora/views.py
index 8f4053ab..1b98a300 100644
--- a/app/corpora/views.py
+++ b/app/corpora/views.py
@@ -60,14 +60,16 @@ def analyse_corpus(corpus_id):
query_form = QueryForm(prefix='query-form',
query=request.args.get('query'))
query_download_form = QueryDownloadForm(prefix='query-download-form')
- inspect_display_options_form = InspectDisplayOptionsForm(prefix='inspect-display-options-form')
- return render_template('corpora/analyse_corpus.html.j2',
- corpus_id=corpus_id,
- display_options_form=display_options_form,
- query_form=query_form,
- query_download_form=query_download_form,
- inspect_display_options_form=inspect_display_options_form,
- title='Corpus analysis')
+ inspect_display_options_form = InspectDisplayOptionsForm(
+ prefix='inspect-display-options-form')
+ return render_template(
+ 'corpora/analyse_corpus.html.j2',
+ corpus_id=corpus_id,
+ display_options_form=display_options_form,
+ query_form=query_form,
+ query_download_form=query_download_form,
+ inspect_display_options_form=inspect_display_options_form,
+ title='Corpus analysis')
@corpora.route('//delete')
@@ -114,8 +116,8 @@ def add_corpus_file(corpus_id):
school=add_corpus_file_form.school.data,
title=add_corpus_file_form.title.data)
db.session.add(corpus_file)
+ corpus.status = 'unprepared'
db.session.commit()
- tasks.edit_corpus_file(corpus_file.id)
flash('Corpus file added!')
return make_response(
{'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)},
@@ -181,8 +183,8 @@ def edit_corpus_file(corpus_id, corpus_file_id):
edit_corpus_file_form.publishing_year.data
corpus_file.school = edit_corpus_file_form.school.data
corpus_file.title = edit_corpus_file_form.title.data
+ corpus.status = 'unprepared'
db.session.commit()
- tasks.edit_corpus_file(corpus_file_id)
flash('Corpus file edited!')
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
# If no form is submitted or valid, fill out fields with current values
@@ -211,9 +213,8 @@ def prepare_corpus(corpus_id):
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
if corpus.files.all():
- corpus.status = 'submitted'
- db.session.commit()
- flash('Corpus marked for preparation!')
+ tasks.build_corpus(corpus_id)
+ flash('Corpus gets build now.')
else:
- flash('Can not prepare corpus, please add corpus file(s).')
+ flash('Can not build corpus, please add corpus file(s).')
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
diff --git a/app/models.py b/app/models.py
index 773883e9..0df3ded0 100644
--- a/app/models.py
+++ b/app/models.py
@@ -7,7 +7,6 @@ from werkzeug.utils import secure_filename
from . import db, logger, login_manager
import os
import shutil
-import xml.etree.ElementTree as ET
class Permission:
@@ -380,28 +379,6 @@ class CorpusFile(db.Model):
db.session.delete(self)
db.session.commit()
- def insert_metadata(self):
- file = os.path.join(current_app.config['NOPAQUE_STORAGE'],
- self.dir, self.filename)
- element_tree = ET.parse(file)
- text_node = element_tree.find('text')
- # TODO: USE OR
- text_node.set('address', self.address if self.address else "NULL")
- text_node.set('author', self.author)
- text_node.set('booktitle', self.booktitle if self.booktitle else "NULL")
- text_node.set('chapter', self.chapter if self.chapter else "NULL")
- text_node.set('editor', self.editor if self.editor else "NULL")
- text_node.set('institution', self.institution if self.institution else "NULL")
- text_node.set('journal', self.journal if self.journal else "NULL")
- text_node.set('pages', self.pages if self.pages else "NULL")
- text_node.set('publisher', self.publisher if self.publisher else "NULL")
- text_node.set('publishing_year', str(self.publishing_year))
- text_node.set('school', self.school if self.school else "NULL")
- text_node.set('title', self.title)
- element_tree.write(file)
- self.corpus.status = 'unprepared'
- db.session.commit()
-
def to_dict(self):
return {'id': self.id,
'address': self.address,
@@ -447,9 +424,6 @@ class Corpus(db.Model):
'title': self.title,
'user_id': self.user_id}
- def build(self):
- pass
-
def delete(self):
for corpus_file in self.files:
db.session.delete(corpus_file)
diff --git a/docker-compose.yml b/docker-compose.yml
index a4f33c9c..97c9cc53 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -5,6 +5,9 @@ networks:
external:
name: reverse-proxy
+volumes:
+ redis-trash1:
+
services:
web:
depends_on:
@@ -52,3 +55,5 @@ services:
- "/srv/nopaque/db:/var/lib/postgresql/data"
redis:
image: redis:5
+ volumes:
+ - "redis-trash1:/data"