mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-03 20:02:47 +00:00 
			
		
		
		
	Process corpus files in task, not in database model
This commit is contained in:
		@@ -1,9 +1,52 @@
 | 
				
			|||||||
 | 
					from .. import db
 | 
				
			||||||
from ..decorators import background
 | 
					from ..decorators import background
 | 
				
			||||||
from ..models import Corpus, CorpusFile
 | 
					from ..models import Corpus, CorpusFile
 | 
				
			||||||
 | 
					import xml.etree.ElementTree as ET
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@background
 | 
				
			||||||
 | 
					def build_corpus(app, corpus_id):
 | 
				
			||||||
 | 
					    with app.app_context():
 | 
				
			||||||
 | 
					        corpus = Corpus.query.get(corpus_id)
 | 
				
			||||||
 | 
					        if corpus is None:
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					        corpus.status = 'File processing'
 | 
				
			||||||
 | 
					        db.session.commit()
 | 
				
			||||||
 | 
					        corpus_dir = os.path.join(app.config['NOPAQUE_STORAGE'],
 | 
				
			||||||
 | 
					                                  str(corpus.user_id), 'corpora',
 | 
				
			||||||
 | 
					                                  str(corpus.id))
 | 
				
			||||||
 | 
					        output_dir = os.path.join(corpus_dir, 'merged')
 | 
				
			||||||
 | 
					        shutil.rmtree(output_dir, ignore_errors=True)
 | 
				
			||||||
 | 
					        os.mkdir(output_dir)
 | 
				
			||||||
 | 
					        master_element_tree = ET.ElementTree(
 | 
				
			||||||
 | 
					            ET.fromstring('<corpus>\n</corpus>'))
 | 
				
			||||||
 | 
					        for corpus_file in corpus.files:
 | 
				
			||||||
 | 
					            file = os.path.join(corpus_dir, corpus_file.filename)
 | 
				
			||||||
 | 
					            element_tree = ET.parse(file)
 | 
				
			||||||
 | 
					            text_node = element_tree.find('text')
 | 
				
			||||||
 | 
					            text_node.set('address', corpus_file.address or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('author', corpus_file.author)
 | 
				
			||||||
 | 
					            text_node.set('booktitle', corpus_file.booktitle or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('chapter', corpus_file.chapter or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('editor', corpus_file.editor or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('institution', corpus_file.institution or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('journal', corpus_file.journal or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('pages', corpus_file.pages or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('publisher', corpus_file.publisher or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('publishing_year', str(corpus_file.publishing_year))
 | 
				
			||||||
 | 
					            text_node.set('school', corpus_file.school or "NULL")
 | 
				
			||||||
 | 
					            text_node.set('title', corpus_file.title)
 | 
				
			||||||
 | 
					            element_tree.write(file)
 | 
				
			||||||
 | 
					            master_element_tree.getroot().insert(1, text_node)
 | 
				
			||||||
 | 
					        output_file = os.path.join(output_dir, 'corpus.vrt')
 | 
				
			||||||
 | 
					        master_element_tree.write(output_file, xml_declaration=True,
 | 
				
			||||||
 | 
					                                  encoding='utf-8')
 | 
				
			||||||
 | 
					        corpus.status = 'submitted'
 | 
				
			||||||
 | 
					        db.session.commit()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@background
 | 
					@background
 | 
				
			||||||
def delete_corpus(app, corpus_id):
 | 
					def delete_corpus(app, corpus_id):
 | 
				
			||||||
    with app.app_context():
 | 
					    with app.app_context():
 | 
				
			||||||
@@ -30,12 +73,3 @@ def delete_corpus_file(app, corpus_file_id):
 | 
				
			|||||||
            pass
 | 
					            pass
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            corpus_file.delete()
 | 
					            corpus_file.delete()
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@background
 | 
					 | 
				
			||||||
def edit_corpus_file(app, corpus_file_id):
 | 
					 | 
				
			||||||
    with app.app_context():
 | 
					 | 
				
			||||||
        corpus_file = CorpusFile.query.get(corpus_file_id)
 | 
					 | 
				
			||||||
        if corpus_file is None:
 | 
					 | 
				
			||||||
            raise Exception('Corpus file {} not found!'.format(corpus_file_id))
 | 
					 | 
				
			||||||
        corpus_file.insert_metadata()
 | 
					 | 
				
			||||||
 
 | 
				
			|||||||
@@ -60,14 +60,16 @@ def analyse_corpus(corpus_id):
 | 
				
			|||||||
    query_form = QueryForm(prefix='query-form',
 | 
					    query_form = QueryForm(prefix='query-form',
 | 
				
			||||||
                           query=request.args.get('query'))
 | 
					                           query=request.args.get('query'))
 | 
				
			||||||
    query_download_form = QueryDownloadForm(prefix='query-download-form')
 | 
					    query_download_form = QueryDownloadForm(prefix='query-download-form')
 | 
				
			||||||
    inspect_display_options_form = InspectDisplayOptionsForm(prefix='inspect-display-options-form')
 | 
					    inspect_display_options_form = InspectDisplayOptionsForm(
 | 
				
			||||||
    return render_template('corpora/analyse_corpus.html.j2',
 | 
					        prefix='inspect-display-options-form')
 | 
				
			||||||
                           corpus_id=corpus_id,
 | 
					    return render_template(
 | 
				
			||||||
                           display_options_form=display_options_form,
 | 
					        'corpora/analyse_corpus.html.j2',
 | 
				
			||||||
                           query_form=query_form,
 | 
					        corpus_id=corpus_id,
 | 
				
			||||||
                           query_download_form=query_download_form,
 | 
					        display_options_form=display_options_form,
 | 
				
			||||||
                           inspect_display_options_form=inspect_display_options_form,
 | 
					        query_form=query_form,
 | 
				
			||||||
                           title='Corpus analysis')
 | 
					        query_download_form=query_download_form,
 | 
				
			||||||
 | 
					        inspect_display_options_form=inspect_display_options_form,
 | 
				
			||||||
 | 
					        title='Corpus analysis')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@corpora.route('/<int:corpus_id>/delete')
 | 
					@corpora.route('/<int:corpus_id>/delete')
 | 
				
			||||||
@@ -114,8 +116,8 @@ def add_corpus_file(corpus_id):
 | 
				
			|||||||
            school=add_corpus_file_form.school.data,
 | 
					            school=add_corpus_file_form.school.data,
 | 
				
			||||||
            title=add_corpus_file_form.title.data)
 | 
					            title=add_corpus_file_form.title.data)
 | 
				
			||||||
        db.session.add(corpus_file)
 | 
					        db.session.add(corpus_file)
 | 
				
			||||||
 | 
					        corpus.status = 'unprepared'
 | 
				
			||||||
        db.session.commit()
 | 
					        db.session.commit()
 | 
				
			||||||
        tasks.edit_corpus_file(corpus_file.id)
 | 
					 | 
				
			||||||
        flash('Corpus file added!')
 | 
					        flash('Corpus file added!')
 | 
				
			||||||
        return make_response(
 | 
					        return make_response(
 | 
				
			||||||
            {'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)},
 | 
					            {'redirect_url': url_for('corpora.corpus', corpus_id=corpus.id)},
 | 
				
			||||||
@@ -181,8 +183,8 @@ def edit_corpus_file(corpus_id, corpus_file_id):
 | 
				
			|||||||
            edit_corpus_file_form.publishing_year.data
 | 
					            edit_corpus_file_form.publishing_year.data
 | 
				
			||||||
        corpus_file.school = edit_corpus_file_form.school.data
 | 
					        corpus_file.school = edit_corpus_file_form.school.data
 | 
				
			||||||
        corpus_file.title = edit_corpus_file_form.title.data
 | 
					        corpus_file.title = edit_corpus_file_form.title.data
 | 
				
			||||||
 | 
					        corpus.status = 'unprepared'
 | 
				
			||||||
        db.session.commit()
 | 
					        db.session.commit()
 | 
				
			||||||
        tasks.edit_corpus_file(corpus_file_id)
 | 
					 | 
				
			||||||
        flash('Corpus file edited!')
 | 
					        flash('Corpus file edited!')
 | 
				
			||||||
        return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
 | 
					        return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
 | 
				
			||||||
    # If no form is submitted or valid, fill out fields with current values
 | 
					    # If no form is submitted or valid, fill out fields with current values
 | 
				
			||||||
@@ -211,9 +213,8 @@ def prepare_corpus(corpus_id):
 | 
				
			|||||||
    if not (corpus.creator == current_user or current_user.is_administrator()):
 | 
					    if not (corpus.creator == current_user or current_user.is_administrator()):
 | 
				
			||||||
        abort(403)
 | 
					        abort(403)
 | 
				
			||||||
    if corpus.files.all():
 | 
					    if corpus.files.all():
 | 
				
			||||||
        corpus.status = 'submitted'
 | 
					        tasks.build_corpus(corpus_id)
 | 
				
			||||||
        db.session.commit()
 | 
					        flash('Corpus gets build now.')
 | 
				
			||||||
        flash('Corpus marked for preparation!')
 | 
					 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        flash('Can not prepare corpus, please add corpus file(s).')
 | 
					        flash('Can not build corpus, please add corpus file(s).')
 | 
				
			||||||
    return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
 | 
					    return redirect(url_for('corpora.corpus', corpus_id=corpus_id))
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -7,7 +7,6 @@ from werkzeug.utils import secure_filename
 | 
				
			|||||||
from . import db, logger, login_manager
 | 
					from . import db, logger, login_manager
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
import xml.etree.ElementTree as ET
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Permission:
 | 
					class Permission:
 | 
				
			||||||
@@ -380,28 +379,6 @@ class CorpusFile(db.Model):
 | 
				
			|||||||
        db.session.delete(self)
 | 
					        db.session.delete(self)
 | 
				
			||||||
        db.session.commit()
 | 
					        db.session.commit()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def insert_metadata(self):
 | 
					 | 
				
			||||||
        file = os.path.join(current_app.config['NOPAQUE_STORAGE'],
 | 
					 | 
				
			||||||
                            self.dir, self.filename)
 | 
					 | 
				
			||||||
        element_tree = ET.parse(file)
 | 
					 | 
				
			||||||
        text_node = element_tree.find('text')
 | 
					 | 
				
			||||||
        # TODO: USE OR
 | 
					 | 
				
			||||||
        text_node.set('address', self.address if self.address else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('author', self.author)
 | 
					 | 
				
			||||||
        text_node.set('booktitle', self.booktitle if self.booktitle else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('chapter', self.chapter if self.chapter else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('editor', self.editor if self.editor else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('institution', self.institution if self.institution else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('journal', self.journal if self.journal else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('pages', self.pages if self.pages else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('publisher', self.publisher if self.publisher else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('publishing_year', str(self.publishing_year))
 | 
					 | 
				
			||||||
        text_node.set('school', self.school if self.school else "NULL")
 | 
					 | 
				
			||||||
        text_node.set('title', self.title)
 | 
					 | 
				
			||||||
        element_tree.write(file)
 | 
					 | 
				
			||||||
        self.corpus.status = 'unprepared'
 | 
					 | 
				
			||||||
        db.session.commit()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def to_dict(self):
 | 
					    def to_dict(self):
 | 
				
			||||||
        return {'id': self.id,
 | 
					        return {'id': self.id,
 | 
				
			||||||
                'address': self.address,
 | 
					                'address': self.address,
 | 
				
			||||||
@@ -447,9 +424,6 @@ class Corpus(db.Model):
 | 
				
			|||||||
                'title': self.title,
 | 
					                'title': self.title,
 | 
				
			||||||
                'user_id': self.user_id}
 | 
					                'user_id': self.user_id}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def build(self):
 | 
					 | 
				
			||||||
        pass
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def delete(self):
 | 
					    def delete(self):
 | 
				
			||||||
        for corpus_file in self.files:
 | 
					        for corpus_file in self.files:
 | 
				
			||||||
            db.session.delete(corpus_file)
 | 
					            db.session.delete(corpus_file)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -5,6 +5,9 @@ networks:
 | 
				
			|||||||
    external:
 | 
					    external:
 | 
				
			||||||
      name: reverse-proxy
 | 
					      name: reverse-proxy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					volumes:
 | 
				
			||||||
 | 
					  redis-trash1:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
services:
 | 
					services:
 | 
				
			||||||
  web:
 | 
					  web:
 | 
				
			||||||
    depends_on:
 | 
					    depends_on:
 | 
				
			||||||
@@ -52,3 +55,5 @@ services:
 | 
				
			|||||||
      - "/srv/nopaque/db:/var/lib/postgresql/data"
 | 
					      - "/srv/nopaque/db:/var/lib/postgresql/data"
 | 
				
			||||||
  redis:
 | 
					  redis:
 | 
				
			||||||
    image: redis:5
 | 
					    image: redis:5
 | 
				
			||||||
 | 
					    volumes:
 | 
				
			||||||
 | 
					      - "redis-trash1:/data"
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user