mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-11-14 16:55:42 +00:00
Process corpus files in task, not in database model
This commit is contained in:
parent
e882af8888
commit
edc0b34032
@@ -1,9 +1,52 @@
|
|||||||
|
from .. import db
|
||||||
from ..decorators import background
|
from ..decorators import background
|
||||||
from ..models import Corpus, CorpusFile
|
from ..models import Corpus, CorpusFile
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
@background
def build_corpus(app, corpus_id):
    """Merge every file of a corpus into a single ``merged/corpus.vrt``.

    Runs inside ``app.app_context()``. The corpus status is set to
    'File processing' while the files are handled and to 'submitted' once
    the merged file has been written.

    :param app: application object providing ``app_context()`` and
        ``config['NOPAQUE_STORAGE']``.
    :param corpus_id: primary key of the corpus to build. If no corpus with
        this id exists the function returns without doing anything.
    :raises ValueError: if a corpus file contains no ``<text>`` element.
    :raises Exception: any other error raised while reading or writing the
        files is re-raised after the status has been reset.
    """
    with app.app_context():
        corpus = Corpus.query.get(corpus_id)
        if corpus is None:
            return
        corpus.status = 'File processing'
        db.session.commit()
        try:
            _merge_corpus_files(app, corpus)
        except Exception:
            # Without this the corpus would stay in 'File processing'
            # forever after a failed build. Resetting to 'unprepared' is
            # meant to allow a retry -- NOTE(review): confirm that this is
            # the status the UI expects for a corpus that is not built.
            db.session.rollback()
            corpus.status = 'unprepared'
            db.session.commit()
            raise
        corpus.status = 'submitted'
        db.session.commit()


def _merge_corpus_files(app, corpus):
    """Write metadata into each corpus file and merge them into corpus.vrt."""
    corpus_dir = os.path.join(app.config['NOPAQUE_STORAGE'],
                              str(corpus.user_id), 'corpora',
                              str(corpus.id))
    output_dir = os.path.join(corpus_dir, 'merged')
    # Always start from an empty output directory.
    shutil.rmtree(output_dir, ignore_errors=True)
    os.mkdir(output_dir)
    master_element_tree = ET.ElementTree(
        ET.fromstring('<corpus>\n</corpus>'))
    master_root = master_element_tree.getroot()
    for corpus_file in corpus.files:
        file_path = os.path.join(corpus_dir, corpus_file.filename)
        element_tree = ET.parse(file_path)
        text_node = element_tree.find('text')
        if text_node is None:
            raise ValueError(
                'Corpus file {} contains no <text> element'.format(file_path))
        _set_text_metadata(text_node, corpus_file)
        # Persist the metadata in the individual file as well.
        element_tree.write(file_path)
        # append() keeps the files in iteration order; insert(1, ...) would
        # push every file after the first one in front of its predecessors.
        master_root.append(text_node)
    output_file = os.path.join(output_dir, 'corpus.vrt')
    master_element_tree.write(output_file, xml_declaration=True,
                              encoding='utf-8')


def _set_text_metadata(text_node, corpus_file):
    """Copy the bibliographic fields of corpus_file onto text_node."""
    # Optional fields fall back to the literal string "NULL"; author and
    # title are passed through unchanged. The order matches the order in
    # which the attributes were set before, so the serialized output keeps
    # the same attribute order.
    metadata = (
        ('address', corpus_file.address or "NULL"),
        ('author', corpus_file.author),
        ('booktitle', corpus_file.booktitle or "NULL"),
        ('chapter', corpus_file.chapter or "NULL"),
        ('editor', corpus_file.editor or "NULL"),
        ('institution', corpus_file.institution or "NULL"),
        ('journal', corpus_file.journal or "NULL"),
        ('pages', corpus_file.pages or "NULL"),
        ('publisher', corpus_file.publisher or "NULL"),
        ('publishing_year', str(corpus_file.publishing_year)),
        ('school', corpus_file.school or "NULL"),
        ('title', corpus_file.title),
    )
    for name, value in metadata:
        text_node.set(name, value)
|
||||||
|
|
||||||
|
|
||||||
@background
|
@background
|
||||||
def delete_corpus(app, corpus_id):
|
def delete_corpus(app, corpus_id):
|
||||||
with app.app_context():
|
with app.app_context():
|
||||||
@@ -30,12 +73,3 @@ def delete_corpus_file(app, corpus_file_id):
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
corpus_file.delete()
|
corpus_file.delete()
|
||||||
|
|
||||||
|
|
||||||
@background
|
|
||||||
def edit_corpus_file(app, corpus_file_id):
|
|
||||||
with app.app_context():
|
|
||||||
corpus_file = CorpusFile.query.get(corpus_file_id)
|
|
||||||
if corpus_file is None:
|
|
||||||
raise Exception('Corpus file {} not found!'.format(corpus_file_id))
|
|
||||||
corpus_file.insert_metadata()
|
|
||||||
|
@@ -60,14 +60,16 @@ def analyse_corpus(corpus_id):
|
|||||||
query_form = QueryForm(prefix='query-form',
|
query_form = QueryForm(prefix='query-form',
|
||||||
query=request.args.get('query'))
|
query=request.args.get('query'))
|
||||||
query_download_form = QueryDownloadForm(prefix='query-download-form')
|
query_download_form = QueryDownloadForm(prefix='query-download-form')
|
||||||
inspect_display_options_form = InspectDisplayOptionsForm(prefix='inspect-display-options-form')
|
inspect_display_options_form = InspectDisplayOptionsForm(
|
||||||
return render_template('corpora/analyse_corpus.html.j2',
|
prefix='inspect-display-options-form')
|
||||||
corpus_id=corpus_id,
|
return render_template(
|
||||||
display_options_form=display_options_form,
|
'corpora/analyse_corpus.html.j2',
|
||||||
query_form=query_form,
|
corpus_id=corpus_id,
|
||||||
query_download_form=query_download_form,
|
display_options_form=display_options_form,
|
||||||
inspect_display_options_form=inspect_display_options_form,
|
query_form=query_form,
|
||||||
title='Corpus analysis')
|
query_download_form=query_download_form,
|
||||||
|
inspect_display_options_form=inspect_display_options_form,
|
||||||
|
title='Corpus analysis')
|
||||||
|
|
||||||
|
|
||||||
@corpora.route('/<int:corpus_id>/delete')
|
@corpora.route('/<int:corpus_id>/delete')
|
||||||
@@ -114,8 +116,8 @@ def add_corpus_file(corpus_id):
|
|||||||
school=add_corpus_file_form.school.data,
|
school=add_corpus_file_form.school.data,
|
||||||
title=add_corpus_file_form.title.data)
|
title=add_corpus_file_form.title.data)
|
||||||
db.session.add(corpus_file)
|
db.session.add(corpus_file)
|
||||||
|
corpus.status = 'unprepared'
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
tasks.edit_corpus_file(corpus_file.id)
|
|
||||||
flash('Corpus file added!')
|
flash('Corpus file added!')
|
||||||
return make_response(
|
return make_response(
|
||||||
{'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)},
|
{'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)},
|
||||||
@@ -181,8 +183,8 @@ def edit_corpus_file(corpus_id, corpus_file_id):
|
|||||||
edit_corpus_file_form.publishing_year.data
|
edit_corpus_file_form.publishing_year.data
|
||||||
corpus_file.school = edit_corpus_file_form.school.data
|
corpus_file.school = edit_corpus_file_form.school.data
|
||||||
corpus_file.title = edit_corpus_file_form.title.data
|
corpus_file.title = edit_corpus_file_form.title.data
|
||||||
|
corpus.status = 'unprepared'
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
tasks.edit_corpus_file(corpus_file_id)
|
|
||||||
flash('Corpus file edited!')
|
flash('Corpus file edited!')
|
||||||
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
|
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
|
||||||
# If no form is submitted or valid, fill out fields with current values
|
# If no form is submitted or valid, fill out fields with current values
|
||||||
@@ -211,9 +213,8 @@ def prepare_corpus(corpus_id):
|
|||||||
if not (corpus.creator == current_user or current_user.is_administrator()):
|
if not (corpus.creator == current_user or current_user.is_administrator()):
|
||||||
abort(403)
|
abort(403)
|
||||||
if corpus.files.all():
|
if corpus.files.all():
|
||||||
corpus.status = 'submitted'
|
tasks.build_corpus(corpus_id)
|
||||||
db.session.commit()
|
flash('Corpus gets build now.')
|
||||||
flash('Corpus marked for preparation!')
|
|
||||||
else:
|
else:
|
||||||
flash('Can not prepare corpus, please add corpus file(s).')
|
flash('Can not build corpus, please add corpus file(s).')
|
||||||
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
|
return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
|
||||||
|
@@ -7,7 +7,6 @@ from werkzeug.utils import secure_filename
|
|||||||
from . import db, logger, login_manager
|
from . import db, logger, login_manager
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import xml.etree.ElementTree as ET
|
|
||||||
|
|
||||||
|
|
||||||
class Permission:
|
class Permission:
|
||||||
@@ -380,28 +379,6 @@ class CorpusFile(db.Model):
|
|||||||
db.session.delete(self)
|
db.session.delete(self)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
|
||||||
def insert_metadata(self):
|
|
||||||
file = os.path.join(current_app.config['NOPAQUE_STORAGE'],
|
|
||||||
self.dir, self.filename)
|
|
||||||
element_tree = ET.parse(file)
|
|
||||||
text_node = element_tree.find('text')
|
|
||||||
# TODO: USE OR
|
|
||||||
text_node.set('address', self.address if self.address else "NULL")
|
|
||||||
text_node.set('author', self.author)
|
|
||||||
text_node.set('booktitle', self.booktitle if self.booktitle else "NULL")
|
|
||||||
text_node.set('chapter', self.chapter if self.chapter else "NULL")
|
|
||||||
text_node.set('editor', self.editor if self.editor else "NULL")
|
|
||||||
text_node.set('institution', self.institution if self.institution else "NULL")
|
|
||||||
text_node.set('journal', self.journal if self.journal else "NULL")
|
|
||||||
text_node.set('pages', self.pages if self.pages else "NULL")
|
|
||||||
text_node.set('publisher', self.publisher if self.publisher else "NULL")
|
|
||||||
text_node.set('publishing_year', str(self.publishing_year))
|
|
||||||
text_node.set('school', self.school if self.school else "NULL")
|
|
||||||
text_node.set('title', self.title)
|
|
||||||
element_tree.write(file)
|
|
||||||
self.corpus.status = 'unprepared'
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
return {'id': self.id,
|
return {'id': self.id,
|
||||||
'address': self.address,
|
'address': self.address,
|
||||||
@@ -447,9 +424,6 @@ class Corpus(db.Model):
|
|||||||
'title': self.title,
|
'title': self.title,
|
||||||
'user_id': self.user_id}
|
'user_id': self.user_id}
|
||||||
|
|
||||||
def build(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def delete(self):
|
def delete(self):
|
||||||
for corpus_file in self.files:
|
for corpus_file in self.files:
|
||||||
db.session.delete(corpus_file)
|
db.session.delete(corpus_file)
|
||||||
|
@@ -5,6 +5,9 @@ networks:
|
|||||||
external:
|
external:
|
||||||
name: reverse-proxy
|
name: reverse-proxy
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
redis-trash1:
|
||||||
|
|
||||||
services:
|
services:
|
||||||
web:
|
web:
|
||||||
depends_on:
|
depends_on:
|
||||||
@@ -52,3 +55,5 @@ services:
|
|||||||
- "/srv/nopaque/db:/var/lib/postgresql/data"
|
- "/srv/nopaque/db:/var/lib/postgresql/data"
|
||||||
redis:
|
redis:
|
||||||
image: redis:5
|
image: redis:5
|
||||||
|
volumes:
|
||||||
|
- "redis-trash1:/data"
|
||||||
|
Loading…
Reference in New Issue
Block a user