Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git (synced 2025-06-20 21:10:36 +00:00)
Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into development
@@ -8,7 +8,8 @@ from ..events import connected_sessions
from ..models import Corpus, User
import cqi
import math
import logging
import os
import shutil


'''

@@ -23,6 +24,29 @@ corpus_analysis_sessions = {}
corpus_analysis_clients = {}


@socketio.on('corpus_create_zip')
@socketio_login_required
def corpus_create_zip(corpus_id):
    corpus = Corpus.query.get_or_404(corpus_id)
    # delete old corpus archive if it exists/has been built before
    if corpus.archive_file is not None:
        if os.path.isfile(corpus.archive_file):
            os.remove(corpus.archive_file)
    root_dir = os.path.join(current_app.config['DATA_DIR'],
                            str(current_user.id),
                            'corpora')
    base_dir = os.path.join(root_dir, str(corpus.id))
    zip_name = corpus.title
    zip_path = os.path.join(root_dir, zip_name)
    corpus.archive_file = os.path.join(base_dir, zip_name) + '.zip'
    db.session.commit()
    shutil.make_archive(zip_path,
                        'zip',
                        base_dir)
    shutil.move(zip_path + '.zip', corpus.archive_file)
    socketio.emit('corpus_zip_created', room=request.sid)
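For illustration only, not part of the commit: a minimal sketch of how the new 'corpus_create_zip' event could be exercised with Flask-SocketIO's test client. The create_app factory, the 'testing' config name and the corpus id are assumptions, and a real test would first have to log in so that @socketio_login_required passes.

    # Hypothetical smoke test for the 'corpus_create_zip' event (assumed names).
    from app import create_app, socketio        # assumed application layout

    app = create_app('testing')                 # assumed factory/config name
    flask_client = app.test_client()
    # ... authenticate flask_client here so @socketio_login_required passes ...
    sio_client = socketio.test_client(app, flask_test_client=flask_client)

    sio_client.emit('corpus_create_zip', 1)     # 1 = id of an existing corpus
    received = sio_client.get_received()
    assert any(event['name'] == 'corpus_zip_created' for event in received)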


@socketio.on('corpus_analysis_init')
@socketio_login_required
def init_corpus_analysis(corpus_id):

@@ -125,10 +149,6 @@ def corpus_analysis_query(query):
    chunk_start = 0
    context = 50
    progress = 0
    # for attr in corpus.structural_attributes.list():
    # if attr.attrs['name'] == 'text':
    # text_attr = attr
    # logging.warning(results.fdist_1(15, results.attrs['fields']['match'], text_attr))
    client.status = 'running'
    while chunk_start <= results.attrs['size']:
        if client.status == 'abort':
@@ -71,6 +71,26 @@ class AddCorpusForm(FlaskForm):
    title = StringField('Title', validators=[DataRequired(), Length(1, 32)])


class ImportCorpusForm(FlaskForm):
    '''
    Form to import a corpus.
    '''
    description = StringField('Description',
                              validators=[DataRequired(), Length(1, 255)])
    file = FileField('File', validators=[DataRequired()])
    submit = SubmitField()
    title = StringField('Title', validators=[DataRequired(), Length(1, 32)])

    def __init__(self, *args, **kwargs):
        super(ImportCorpusForm, self).__init__(*args, **kwargs)

    def validate_file(self, field):
        if not field.data.filename.lower().endswith('.zip'):
            raise ValidationError('File does not have an approved extension: '
                                  '.zip')
        field.data.filename = secure_filename(field.data.filename)
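For illustration only, not part of the commit: the validate_file hook above rejects any upload whose name does not end in .zip and sanitises the filename that will be stored. A minimal sketch of that behaviour; the create_app factory and the app.corpora.forms import path are assumptions, and CSRF checks are assumed to be disabled in the testing config.

    # Hypothetical check of ImportCorpusForm.validate_file (assumed import paths).
    from werkzeug.datastructures import FileStorage
    from wtforms.validators import ValidationError

    from app import create_app                      # assumed application factory
    from app.corpora.forms import ImportCorpusForm  # assumed module path

    app = create_app('testing')
    with app.test_request_context(method='POST'):
        form = ImportCorpusForm()
        form.file.data = FileStorage(filename='corpus.tar.gz')
        try:
            form.validate_file(form.file)
        except ValidationError as exc:
            print(exc)  # 'File does not have an approved extension: .zip'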


class QueryForm(FlaskForm):
    '''
    Form to submit a query to the server which is executed via cqi-py.

web/app/corpora/import_corpus.py (new file, 89 lines)
@@ -0,0 +1,89 @@
check_zip_contents = ['data/',
                      'merged/',
                      'registry/',
                      'registry/corpus',
                      'data/corpus/',
                      'data/corpus/text_editor.avs',
                      'data/corpus/pos.lexicon',
                      'data/corpus/simple_pos.huf',
                      'data/corpus/word.huf',
                      'data/corpus/text_booktitle.avs',
                      'data/corpus/word.lexicon.srt',
                      'data/corpus/word.lexicon.idx',
                      'data/corpus/simple_pos.crx',
                      'data/corpus/text_pages.rng',
                      'data/corpus/simple_pos.crc',
                      'data/corpus/ner.lexicon',
                      'data/corpus/lemma.huf',
                      'data/corpus/text_title.rng',
                      'data/corpus/text_chapter.avx',
                      'data/corpus/lemma.lexicon.srt',
                      'data/corpus/lemma.lexicon.idx',
                      'data/corpus/text_school.rng',
                      'data/corpus/text_journal.avs',
                      'data/corpus/simple_pos.lexicon',
                      'data/corpus/pos.huf',
                      'data/corpus/text_editor.avx',
                      'data/corpus/lemma.crc',
                      'data/corpus/lemma.lexicon',
                      'data/corpus/pos.hcd',
                      'data/corpus/text_title.avx',
                      'data/corpus/text_institution.avs',
                      'data/corpus/text_address.avx',
                      'data/corpus/lemma.corpus.cnt',
                      'data/corpus/word.crx',
                      'data/corpus/simple_pos.hcd',
                      'data/corpus/simple_pos.huf.syn',
                      'data/corpus/simple_pos.lexicon.srt',
                      'data/corpus/text_author.avx',
                      'data/corpus/text_publisher.avs',
                      'data/corpus/text_chapter.avs',
                      'data/corpus/ner.corpus.cnt',
                      'data/corpus/pos.huf.syn',
                      'data/corpus/text_booktitle.rng',
                      'data/corpus/lemma.huf.syn',
                      'data/corpus/pos.corpus.cnt',
                      'data/corpus/word.lexicon',
                      'data/corpus/text_publishing_year.avs',
                      'data/corpus/lemma.hcd',
                      'data/corpus/text_school.avs',
                      'data/corpus/text_journal.rng',
                      'data/corpus/word.corpus.cnt',
                      'data/corpus/text_school.avx',
                      'data/corpus/text_journal.avx',
                      'data/corpus/pos.lexicon.srt',
                      'data/corpus/text_title.avs',
                      'data/corpus/word.hcd',
                      'data/corpus/text_chapter.rng',
                      'data/corpus/text_address.rng',
                      'data/corpus/ner.hcd',
                      'data/corpus/text_publisher.avx',
                      'data/corpus/text_institution.rng',
                      'data/corpus/lemma.crx',
                      'data/corpus/pos.crc',
                      'data/corpus/text_author.rng',
                      'data/corpus/text_address.avs',
                      'data/corpus/pos.lexicon.idx',
                      'data/corpus/ner.huf',
                      'data/corpus/ner.huf.syn',
                      'data/corpus/text_pages.avs',
                      'data/corpus/text_publishing_year.avx',
                      'data/corpus/ner.lexicon.idx',
                      'data/corpus/text.rng',
                      'data/corpus/word.crc',
                      'data/corpus/ner.crc',
                      'data/corpus/text_publisher.rng',
                      'data/corpus/text_editor.rng',
                      'data/corpus/text_author.avs',
                      'data/corpus/s.rng',
                      'data/corpus/text_publishing_year.rng',
                      'data/corpus/simple_pos.corpus.cnt',
                      'data/corpus/simple_pos.lexicon.idx',
                      'data/corpus/word.huf.syn',
                      'data/corpus/ner.lexicon.srt',
                      'data/corpus/text_pages.avx',
                      'data/corpus/text_booktitle.avx',
                      'data/corpus/pos.crx',
                      'data/corpus/ner.crx',
                      'data/corpus/text_institution.avx',
                      'merged/corpus.vrt']
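For context, not part of the new file: the list above names every data and registry file an exported corpus archive is expected to contain. The import view further down only accepts an upload if this list is a subset of the archive's namelist; as a standalone helper that check could look like this (the app.corpora.import_corpus import path is an assumption):

    # Hypothetical standalone version of the validation done in the import view below.
    from zipfile import ZipFile

    from app.corpora.import_corpus import check_zip_contents  # assumed module path


    def is_valid_corpus_archive(path):
        '''Return True if the zip at `path` contains all expected corpus files.'''
        with ZipFile(path, 'r') as archive:
            return set(check_zip_contents).issubset(archive.namelist())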
@@ -5,12 +5,18 @@ from . import corpora
from . import tasks
from .forms import (AddCorpusFileForm, AddCorpusForm, AddQueryResultForm,
                    EditCorpusFileForm, QueryDownloadForm, QueryForm,
                    DisplayOptionsForm, InspectDisplayOptionsForm)
                    DisplayOptionsForm, InspectDisplayOptionsForm,
                    ImportCorpusForm)
from jsonschema import validate
from .. import db
from ..models import Corpus, CorpusFile, QueryResult
import json
from jsonschema import validate
import os
import shutil
import glob
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from .import_corpus import check_zip_contents


@corpora.route('/add', methods=['GET', 'POST'])

@@ -40,6 +46,85 @@ def add_corpus():
                           title='Add corpus')


@corpora.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
    import_corpus_form = ImportCorpusForm()
    if import_corpus_form.is_submitted():
        if not import_corpus_form.validate():
            return make_response(import_corpus_form.errors, 400)
        corpus = Corpus(creator=current_user,
                        description=import_corpus_form.description.data,
                        status='unprepared',
                        title=import_corpus_form.title.data)
        db.session.add(corpus)
        db.session.commit()
        dir = os.path.join(current_app.config['DATA_DIR'],
                           str(corpus.user_id), 'corpora', str(corpus.id))
        try:
            os.makedirs(dir)
        except OSError:
            flash('[ERROR]: Could not import corpus!', 'corpus')
            corpus.delete()
        else:
            # Upload zip
            archive_file = os.path.join(current_app.config['DATA_DIR'], dir,
                                        import_corpus_form.file.data.filename)
            corpus_dir = os.path.dirname(archive_file)
            import_corpus_form.file.data.save(archive_file)
            # Some checks to verify it is a valid exported corpus
            with ZipFile(archive_file, 'r') as zip:
                contents = zip.namelist()
                if set(check_zip_contents).issubset(contents):
                    # Unzip
                    shutil.unpack_archive(archive_file, corpus_dir)
                    # Register .vrt files to the corpus
                    vrts = glob.glob(corpus_dir + '/*.vrt')
                    for file in vrts:
                        element_tree = ET.parse(file)
                        text_node = element_tree.find('text')
                        corpus_file = CorpusFile(
                            address=text_node.get('address', 'NULL'),
                            author=text_node.get('author', 'NULL'),
                            booktitle=text_node.get('booktitle', 'NULL'),
                            chapter=text_node.get('chapter', 'NULL'),
                            corpus=corpus,
                            dir=dir,
                            editor=text_node.get('editor', 'NULL'),
                            filename=os.path.basename(file),
                            institution=text_node.get('institution', 'NULL'),
                            journal=text_node.get('journal', 'NULL'),
                            pages=text_node.get('pages', 'NULL'),
                            publisher=text_node.get('publisher', 'NULL'),
                            publishing_year=text_node.get('publishing_year', ''),
                            school=text_node.get('school', 'NULL'),
                            title=text_node.get('title', 'NULL'))
                        db.session.add(corpus_file)
                    # Finish the import and go to the imported corpus
                    url = url_for('corpora.corpus', corpus_id=corpus.id)
                    corpus.status = 'prepared'
                    db.session.commit()
                    os.remove(archive_file)
                    flash('[<a href="{}">{}</a>] imported'.format(url,
                                                                  corpus.title),
                          'corpus')
                    return make_response(
                        {'redirect_url': url_for('corpora.corpus',
                                                 corpus_id=corpus.id)},
                        201)
                else:
                    # If the imported zip is not valid, delete the corpus and give feedback
                    corpus.delete()
                    db.session.commit()
                    flash('Imported corpus is not valid.', 'error')
                    return make_response(
                        {'redirect_url': url_for('corpora.import_corpus')},
                        201)
    return render_template('corpora/import_corpus.html.j2',
                           import_corpus_form=import_corpus_form,
                           title='Import Corpus')
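For illustration only, not part of the commit: the new /import route answers a valid multipart POST with a 201 response whose JSON body carries a redirect_url. A minimal sketch with Flask's test client; the create_app factory, the /corpora URL prefix and a disabled CSRF check in the testing config are assumptions, and the client has to be logged in because of @login_required.

    # Hypothetical request against the new import route (assumed names and URL prefix).
    from app import create_app                  # assumed application factory

    client = create_app('testing').test_client()
    # ... log in here; the route is guarded by @login_required ...
    with open('corpus.zip', 'rb') as archive:
        response = client.post('/corpora/import',
                               content_type='multipart/form-data',
                               data={'title': 'My corpus',
                                     'description': 'Re-imported test corpus',
                                     'file': (archive, 'corpus.zip')})
    assert response.status_code == 201
    assert 'redirect_url' in response.get_json()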


@corpora.route('/<int:corpus_id>')
@login_required
def corpus(corpus_id):

@@ -59,6 +144,20 @@ def corpus(corpus_id):
                           title='Corpus')


@corpora.route('/<int:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.creator == current_user or current_user.is_administrator()):
        abort(403)
    dir = os.path.dirname(corpus.archive_file)
    filename = os.path.basename(corpus.archive_file)
    return send_from_directory(directory=dir,
                               filename=filename,
                               mimetype='zip',
                               as_attachment=True)
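For illustration only, not part of the commit: exporting is a two-step workflow. A client first emits 'corpus_create_zip' over Socket.IO (handler further up) and, once 'corpus_zip_created' has arrived, downloads the archive through this route. The download step, reusing the authenticated test client from the import sketch above and an assumed corpus id of 1:

    # Hypothetical download of a previously zipped corpus (assumed corpus id).
    response = client.get('/corpora/1/export')
    assert response.status_code == 200
    assert response.headers['Content-Disposition'].startswith('attachment')
    with open('corpus_export.zip', 'wb') as exported:
        exported.write(response.data)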


@corpora.route('/<int:corpus_id>/analyse')
@login_required
def analyse_corpus(corpus_id):