Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into development

This commit is contained in:
Patrick Jentsch
2020-10-30 10:19:45 +01:00
17 changed files with 485 additions and 108 deletions

View File

@ -8,7 +8,8 @@ from ..events import connected_sessions
from ..models import Corpus, User
import cqi
import math
import logging
import os
import shutil
'''
@ -23,6 +24,29 @@ corpus_analysis_sessions = {}
corpus_analysis_clients = {}
@socketio.on('corpus_create_zip')
@socketio_login_required
def corpus_create_zip(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
# delete old corpus archive if it exists/has been build before
if corpus.archive_file is not None:
if (os.path.isfile(corpus.archive_file)):
os.remove(corpus.archive_file)
root_dir = os.path.join(current_app.config['DATA_DIR'],
str(current_user.id),
'corpora')
base_dir = os.path.join(root_dir, str(corpus.id))
zip_name = corpus.title
zip_path = os.path.join(root_dir, zip_name)
corpus.archive_file = os.path.join(base_dir, zip_name) + '.zip'
db.session.commit()
shutil.make_archive(zip_path,
'zip',
base_dir)
shutil.move(zip_path + '.zip', corpus.archive_file)
socketio.emit('corpus_zip_created', room=request.sid)
@socketio.on('corpus_analysis_init')
@socketio_login_required
def init_corpus_analysis(corpus_id):
@ -125,10 +149,6 @@ def corpus_analysis_query(query):
chunk_start = 0
context = 50
progress = 0
# for attr in corpus.structural_attributes.list():
# if attr.attrs['name'] == 'text':
# text_attr = attr
# logging.warning(results.fdist_1(15, results.attrs['fields']['match'], text_attr))
client.status = 'running'
while chunk_start <= results.attrs['size']:
if client.status == 'abort':

View File

@ -71,6 +71,26 @@ class AddCorpusForm(FlaskForm):
title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
class ImportCorpusForm(FlaskForm):
'''
Form to import a corpus.
'''
description = StringField('Description',
validators=[DataRequired(), Length(1, 255)])
file = FileField('File', validators=[DataRequired()])
submit = SubmitField()
title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
def __init__(self, *args, **kwargs):
super(ImportCorpusForm, self).__init__(*args, **kwargs)
def validate_file(self, field):
if not field.data.filename.lower().endswith('.zip'):
raise ValidationError('File does not have an approved extension: '
'.zip')
field.data.filename = secure_filename(field.data.filename)
class QueryForm(FlaskForm):
'''
Form to submit a query to the server which is executed via cqi-py.

View File

@ -0,0 +1,89 @@
check_zip_contents = ['data/',
'merged/',
'registry/',
'registry/corpus',
'data/corpus/',
'data/corpus/text_editor.avs',
'data/corpus/pos.lexicon',
'data/corpus/simple_pos.huf',
'data/corpus/word.huf',
'data/corpus/text_booktitle.avs',
'data/corpus/word.lexicon.srt',
'data/corpus/word.lexicon.idx',
'data/corpus/simple_pos.crx',
'data/corpus/text_pages.rng',
'data/corpus/simple_pos.crc',
'data/corpus/ner.lexicon',
'data/corpus/lemma.huf',
'data/corpus/text_title.rng',
'data/corpus/text_chapter.avx',
'data/corpus/lemma.lexicon.srt',
'data/corpus/lemma.lexicon.idx',
'data/corpus/text_school.rng',
'data/corpus/text_journal.avs',
'data/corpus/simple_pos.lexicon',
'data/corpus/pos.huf',
'data/corpus/text_editor.avx',
'data/corpus/lemma.crc',
'data/corpus/lemma.lexicon',
'data/corpus/pos.hcd',
'data/corpus/text_title.avx',
'data/corpus/text_institution.avs',
'data/corpus/text_address.avx',
'data/corpus/lemma.corpus.cnt',
'data/corpus/word.crx',
'data/corpus/simple_pos.hcd',
'data/corpus/simple_pos.huf.syn',
'data/corpus/simple_pos.lexicon.srt',
'data/corpus/text_author.avx',
'data/corpus/text_publisher.avs',
'data/corpus/text_chapter.avs',
'data/corpus/ner.corpus.cnt',
'data/corpus/pos.huf.syn',
'data/corpus/text_booktitle.rng',
'data/corpus/lemma.huf.syn',
'data/corpus/pos.corpus.cnt',
'data/corpus/word.lexicon',
'data/corpus/text_publishing_year.avs',
'data/corpus/lemma.hcd',
'data/corpus/text_school.avs',
'data/corpus/text_journal.rng',
'data/corpus/word.corpus.cnt',
'data/corpus/text_school.avx',
'data/corpus/text_journal.avx',
'data/corpus/pos.lexicon.srt',
'data/corpus/text_title.avs',
'data/corpus/word.hcd',
'data/corpus/text_chapter.rng',
'data/corpus/text_address.rng',
'data/corpus/ner.hcd',
'data/corpus/text_publisher.avx',
'data/corpus/text_institution.rng',
'data/corpus/lemma.crx',
'data/corpus/pos.crc',
'data/corpus/text_author.rng',
'data/corpus/text_address.avs',
'data/corpus/pos.lexicon.idx',
'data/corpus/ner.huf',
'data/corpus/ner.huf.syn',
'data/corpus/text_pages.avs',
'data/corpus/text_publishing_year.avx',
'data/corpus/ner.lexicon.idx',
'data/corpus/text.rng',
'data/corpus/word.crc',
'data/corpus/ner.crc',
'data/corpus/text_publisher.rng',
'data/corpus/text_editor.rng',
'data/corpus/text_author.avs',
'data/corpus/s.rng',
'data/corpus/text_publishing_year.rng',
'data/corpus/simple_pos.corpus.cnt',
'data/corpus/simple_pos.lexicon.idx',
'data/corpus/word.huf.syn',
'data/corpus/ner.lexicon.srt',
'data/corpus/text_pages.avx',
'data/corpus/text_booktitle.avx',
'data/corpus/pos.crx',
'data/corpus/ner.crx',
'data/corpus/text_institution.avx',
'merged/corpus.vrt']

View File

@ -5,12 +5,18 @@ from . import corpora
from . import tasks
from .forms import (AddCorpusFileForm, AddCorpusForm, AddQueryResultForm,
EditCorpusFileForm, QueryDownloadForm, QueryForm,
DisplayOptionsForm, InspectDisplayOptionsForm)
DisplayOptionsForm, InspectDisplayOptionsForm,
ImportCorpusForm)
from jsonschema import validate
from .. import db
from ..models import Corpus, CorpusFile, QueryResult
import json
from jsonschema import validate
import os
import shutil
import glob
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from .import_corpus import check_zip_contents
@corpora.route('/add', methods=['GET', 'POST'])
@ -40,6 +46,85 @@ def add_corpus():
title='Add corpus')
@corpora.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
import_corpus_form = ImportCorpusForm()
if import_corpus_form.is_submitted():
if not import_corpus_form.validate():
return make_response(import_corpus_form.errors, 400)
corpus = Corpus(creator=current_user,
description=import_corpus_form.description.data,
status='unprepared',
title=import_corpus_form.title.data)
db.session.add(corpus)
db.session.commit()
dir = os.path.join(current_app.config['DATA_DIR'],
str(corpus.user_id), 'corpora', str(corpus.id))
try:
os.makedirs(dir)
except OSError:
flash('[ERROR]: Could not import corpus!', 'corpus')
corpus.delete()
else:
# Upload zip
archive_file = os.path.join(current_app.config['DATA_DIR'], dir,
import_corpus_form.file.data.filename)
corpus_dir = os.path.dirname(archive_file)
import_corpus_form.file.data.save(archive_file)
# Some checks to verify it is a valid exported corpus
with ZipFile(archive_file, 'r') as zip:
contents = zip.namelist()
if set(check_zip_contents).issubset(contents):
# Unzip
shutil.unpack_archive(archive_file, corpus_dir)
# Register vrt files to corpus
vrts = glob.glob(corpus_dir + '/*.vrt')
for file in vrts:
element_tree = ET.parse(file)
text_node = element_tree.find('text')
corpus_file = CorpusFile(
address=text_node.get('address', 'NULL'),
author=text_node.get('author', 'NULL'),
booktitle=text_node.get('booktitle', 'NULL'),
chapter=text_node.get('chapter', 'NULL'),
corpus=corpus,
dir=dir,
editor=text_node.get('editor', 'NULL'),
filename=os.path.basename(file),
institution=text_node.get('institution', 'NULL'),
journal=text_node.get('journal', 'NULL'),
pages=text_node.get('pages', 'NULL'),
publisher=text_node.get('publisher', 'NULL'),
publishing_year=text_node.get('publishing_year', ''),
school=text_node.get('school', 'NULL'),
title=text_node.get('title', 'NULL'))
db.session.add(corpus_file)
# finish import and got to imported corpus
url = url_for('corpora.corpus', corpus_id=corpus.id)
corpus.status = 'prepared'
db.session.commit()
os.remove(archive_file)
flash('[<a href="{}">{}</a>] imported'.format(url,
corpus.title),
'corpus')
return make_response(
{'redirect_url': url_for('corpora.corpus',
corpus_id=corpus.id)},
201)
else:
# If imported zip is not valid delete corpus and give feedback
corpus.delete()
db.session.commit()
flash('Imported corpus is not valid.', 'error')
return make_response(
{'redirect_url': url_for('corpora.import_corpus')},
201)
return render_template('corpora/import_corpus.html.j2',
import_corpus_form=import_corpus_form,
title='Import Corpus')
@corpora.route('/<int:corpus_id>')
@login_required
def corpus(corpus_id):
@ -59,6 +144,20 @@ def corpus(corpus_id):
title='Corpus')
@corpora.route('/<int:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
corpus = Corpus.query.get_or_404(corpus_id)
if not (corpus.creator == current_user or current_user.is_administrator()):
abort(403)
dir = os.path.dirname(corpus.archive_file)
filename = os.path.basename(corpus.archive_file)
return send_from_directory(directory=dir,
filename=filename,
mimetype='zip',
as_attachment=True)
@corpora.route('/<int:corpus_id>/analyse')
@login_required
def analyse_corpus(corpus_id):