Add corpus import/export and some fixes etc.

This commit is contained in:
Stephan Porada 2020-10-29 15:20:30 +01:00
parent 21af48bc52
commit 94fa11060e
17 changed files with 492 additions and 113 deletions

View File

@ -8,7 +8,8 @@ from ..events import connected_sessions
from ..models import Corpus, User
import cqi
import math
import logging
import os
import shutil
'''
@ -23,6 +24,29 @@ corpus_analysis_sessions = {}
corpus_analysis_clients = {}
@socketio.on('corpus_create_zip')
@socketio_login_required
def corpus_create_zip(corpus_id):
    '''
    Build a zip archive of a corpus directory and notify the requesting client.

    The archive is first written next to the corpus directory and then moved
    into it. Its final path is stored in ``corpus.archive_file`` once the file
    exists. When done, a ``corpus_zip_created`` event is emitted to the
    requesting socket.

    :param corpus_id: primary key of the corpus that should be archived.
    '''
    corpus = Corpus.query.get_or_404(corpus_id)
    # Apply the same access rule as the export view: only the creator of the
    # corpus or an administrator may trigger an export.
    if not (corpus.creator == current_user
            or current_user.is_administrator()):
        return
    # Delete the old corpus archive if one has been built before. It lives
    # inside the corpus directory and would otherwise be packed into the new
    # archive.
    if (corpus.archive_file is not None
            and os.path.isfile(corpus.archive_file)):
        os.remove(corpus.archive_file)
    # Use the owner of the corpus (not the requesting user) for the path, so
    # that an administrator exporting a foreign corpus hits the right folder.
    root_dir = os.path.join(current_app.config['DATA_DIR'],
                            str(corpus.user_id),
                            'corpora')
    base_dir = os.path.join(root_dir, str(corpus.id))
    # The title is user input: neutralise path separators and leading or
    # trailing dots so that it is always a single, harmless file name.
    zip_name = corpus.title
    for separator in filter(None, (os.sep, os.altsep)):
        zip_name = zip_name.replace(separator, '_')
    zip_name = zip_name.strip('. ') or str(corpus.id)
    # Build the archive outside of base_dir so it does not contain itself.
    zip_path = os.path.join(root_dir, zip_name)
    archive_file = os.path.join(base_dir, zip_name) + '.zip'
    shutil.make_archive(zip_path,
                        'zip',
                        base_dir)
    shutil.move(zip_path + '.zip', archive_file)
    # Only record the archive once it really exists on disk.
    corpus.archive_file = archive_file
    db.session.commit()
    socketio.emit('corpus_zip_created', room=request.sid)
@socketio.on('corpus_analysis_init')
@socketio_login_required
def init_corpus_analysis(corpus_id):
@ -125,10 +149,6 @@ def corpus_analysis_query(query):
chunk_start = 0
context = 50
progress = 0
# for attr in corpus.structural_attributes.list():
# if attr.attrs['name'] == 'text':
# text_attr = attr
# logging.warning(results.fdist_1(15, results.attrs['fields']['match'], text_attr))
client.status = 'running'
while chunk_start <= results.attrs['size']:
if client.status == 'abort':

View File

@ -69,6 +69,26 @@ class AddCorpusForm(FlaskForm):
title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
class ImportCorpusForm(FlaskForm):
    '''
    Upload form used to import a corpus from an exported .zip file.
    '''
    description = StringField(
        'Description',
        validators=[DataRequired(), Length(1, 255)],
    )
    file = FileField('File', validators=[DataRequired()])
    submit = SubmitField()
    title = StringField('Title', validators=[DataRequired(), Length(1, 32)])

    def __init__(self, *args, **kwargs):
        super(ImportCorpusForm, self).__init__(*args, **kwargs)

    def validate_file(self, field):
        '''
        Accept only uploads whose name ends in ``.zip`` (case-insensitive).

        On success the file name of the upload is replaced by the result of
        ``secure_filename``; otherwise a ``ValidationError`` is raised.
        '''
        upload = field.data
        if upload.filename.lower().endswith('.zip'):
            upload.filename = secure_filename(upload.filename)
            return
        raise ValidationError(
            'File does not have an approved extension: .zip')
class QueryForm(FlaskForm):
'''
Form to submit a query to the server which is executed via cqi-py.

View File

@ -0,0 +1,89 @@
# Archive member names that an uploaded corpus zip has to contain. The import
# view accepts an upload only if every entry of this list is present in the
# name list of the zip file (subset check); additional members are allowed.
# Entries ending in '/' are directory members, all others are files.
check_zip_contents = ['data/',
'merged/',
'registry/',
'registry/corpus',
'data/corpus/',
'data/corpus/text_editor.avs',
'data/corpus/pos.lexicon',
'data/corpus/simple_pos.huf',
'data/corpus/word.huf',
'data/corpus/text_booktitle.avs',
'data/corpus/word.lexicon.srt',
'data/corpus/word.lexicon.idx',
'data/corpus/simple_pos.crx',
'data/corpus/text_pages.rng',
'data/corpus/simple_pos.crc',
'data/corpus/ner.lexicon',
'data/corpus/lemma.huf',
'data/corpus/text_title.rng',
'data/corpus/text_chapter.avx',
'data/corpus/lemma.lexicon.srt',
'data/corpus/lemma.lexicon.idx',
'data/corpus/text_school.rng',
'data/corpus/text_journal.avs',
'data/corpus/simple_pos.lexicon',
'data/corpus/pos.huf',
'data/corpus/text_editor.avx',
'data/corpus/lemma.crc',
'data/corpus/lemma.lexicon',
'data/corpus/pos.hcd',
'data/corpus/text_title.avx',
'data/corpus/text_institution.avs',
'data/corpus/text_address.avx',
'data/corpus/lemma.corpus.cnt',
'data/corpus/word.crx',
'data/corpus/simple_pos.hcd',
'data/corpus/simple_pos.huf.syn',
'data/corpus/simple_pos.lexicon.srt',
'data/corpus/text_author.avx',
'data/corpus/text_publisher.avs',
'data/corpus/text_chapter.avs',
'data/corpus/ner.corpus.cnt',
'data/corpus/pos.huf.syn',
'data/corpus/text_booktitle.rng',
'data/corpus/lemma.huf.syn',
'data/corpus/pos.corpus.cnt',
'data/corpus/word.lexicon',
'data/corpus/text_publishing_year.avs',
'data/corpus/lemma.hcd',
'data/corpus/text_school.avs',
'data/corpus/text_journal.rng',
'data/corpus/word.corpus.cnt',
'data/corpus/text_school.avx',
'data/corpus/text_journal.avx',
'data/corpus/pos.lexicon.srt',
'data/corpus/text_title.avs',
'data/corpus/word.hcd',
'data/corpus/text_chapter.rng',
'data/corpus/text_address.rng',
'data/corpus/ner.hcd',
'data/corpus/text_publisher.avx',
'data/corpus/text_institution.rng',
'data/corpus/lemma.crx',
'data/corpus/pos.crc',
'data/corpus/text_author.rng',
'data/corpus/text_address.avs',
'data/corpus/pos.lexicon.idx',
'data/corpus/ner.huf',
'data/corpus/ner.huf.syn',
'data/corpus/text_pages.avs',
'data/corpus/text_publishing_year.avx',
'data/corpus/ner.lexicon.idx',
'data/corpus/text.rng',
'data/corpus/word.crc',
'data/corpus/ner.crc',
'data/corpus/text_publisher.rng',
'data/corpus/text_editor.rng',
'data/corpus/text_author.avs',
'data/corpus/s.rng',
'data/corpus/text_publishing_year.rng',
'data/corpus/simple_pos.corpus.cnt',
'data/corpus/simple_pos.lexicon.idx',
'data/corpus/word.huf.syn',
'data/corpus/ner.lexicon.srt',
'data/corpus/text_pages.avx',
'data/corpus/text_booktitle.avx',
'data/corpus/pos.crx',
'data/corpus/ner.crx',
'data/corpus/text_institution.avx',
'merged/corpus.vrt']

View File

@ -5,12 +5,18 @@ from . import corpora
from . import tasks
from .forms import (AddCorpusFileForm, AddCorpusForm, AddQueryResultForm,
EditCorpusFileForm, QueryDownloadForm, QueryForm,
DisplayOptionsForm, InspectDisplayOptionsForm)
DisplayOptionsForm, InspectDisplayOptionsForm,
ImportCorpusForm)
from jsonschema import validate
from .. import db
from ..models import Corpus, CorpusFile, QueryResult
import json
from jsonschema import validate
import os
import shutil
import glob
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from .import_corpus import check_zip_contents
@corpora.route('/add', methods=['GET', 'POST'])
@ -40,6 +46,85 @@ def add_corpus():
title='Add corpus')
def _reject_corpus_import(corpus):
    '''Discard a corpus whose upload was rejected and redirect to the form.'''
    corpus.delete()
    db.session.commit()
    flash('Imported corpus is not valid.', 'error')
    return make_response(
        {'redirect_url': url_for('corpora.import_corpus')},
        201)


def _import_uploaded_archive(corpus, corpus_dir, upload):
    '''Store, validate and unpack an uploaded archive; return the response.'''
    from zipfile import BadZipFile

    # corpus_dir already starts with DATA_DIR, so it must not be joined in a
    # second time.
    archive_file = os.path.join(corpus_dir, upload.filename)
    upload.save(archive_file)
    # Some checks to verify it is a valid exported corpus. A file that is
    # not a zip archive at all is rejected like an incomplete one.
    try:
        with ZipFile(archive_file, 'r') as archive:
            contents = archive.namelist()
    except BadZipFile:
        return _reject_corpus_import(corpus)
    if not set(check_zip_contents).issubset(contents):
        return _reject_corpus_import(corpus)
    # Name the format explicitly: the stored file name went through
    # secure_filename and is not guaranteed to still end in '.zip'.
    shutil.unpack_archive(archive_file, corpus_dir, 'zip')
    # Read the metadata of every vrt file before any CorpusFile is created,
    # so that a broken file rejects the import as a whole.
    vrt_files = []
    for vrt_path in glob.glob(os.path.join(corpus_dir, '*.vrt')):
        try:
            text_node = ET.parse(vrt_path).find('text')
        except ET.ParseError:
            text_node = None
        if text_node is None:
            return _reject_corpus_import(corpus)
        vrt_files.append((vrt_path, text_node))
    # Register vrt files to corpus
    for vrt_path, text_node in vrt_files:
        corpus_file = CorpusFile(
            address=text_node.get('address', 'NULL'),
            author=text_node.get('author', 'NULL'),
            booktitle=text_node.get('booktitle', 'NULL'),
            chapter=text_node.get('chapter', 'NULL'),
            corpus=corpus,
            dir=corpus_dir,
            editor=text_node.get('editor', 'NULL'),
            filename=os.path.basename(vrt_path),
            institution=text_node.get('institution', 'NULL'),
            journal=text_node.get('journal', 'NULL'),
            pages=text_node.get('pages', 'NULL'),
            publisher=text_node.get('publisher', 'NULL'),
            publishing_year=text_node.get('publishing_year', ''),
            school=text_node.get('school', 'NULL'),
            title=text_node.get('title', 'NULL'))
        db.session.add(corpus_file)
    # Finish the import and go to the imported corpus
    url = url_for('corpora.corpus', corpus_id=corpus.id)
    corpus.status = 'prepared'
    db.session.commit()
    os.remove(archive_file)
    flash('[<a href="{}">{}</a>] imported'.format(url, corpus.title),
          'corpus')
    return make_response({'redirect_url': url}, 201)


@corpora.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
    '''
    Render the corpus import form and process a submitted corpus archive.

    A GET request renders the form. A submitted form that fails validation
    is answered with the form errors and status 400. Otherwise a corpus is
    created, the uploaded zip is stored in the corpus directory and checked
    against ``check_zip_contents``. A valid archive is unpacked, its
    top-level ``*.vrt`` files are registered as corpus files and the response
    carries the URL of the new corpus. An invalid archive removes the corpus
    again and the response points back to the import form.
    '''
    import_corpus_form = ImportCorpusForm()
    if import_corpus_form.is_submitted():
        if not import_corpus_form.validate():
            return make_response(import_corpus_form.errors, 400)
        corpus = Corpus(creator=current_user,
                        description=import_corpus_form.description.data,
                        status='unprepared',
                        title=import_corpus_form.title.data)
        db.session.add(corpus)
        db.session.commit()
        corpus_dir = os.path.join(current_app.config['DATA_DIR'],
                                  str(corpus.user_id),
                                  'corpora',
                                  str(corpus.id))
        try:
            os.makedirs(corpus_dir)
        except OSError:
            flash('[ERROR]: Could not import corpus!', 'corpus')
            corpus.delete()
        else:
            return _import_uploaded_archive(corpus,
                                            corpus_dir,
                                            import_corpus_form.file.data)
    return render_template('corpora/import_corpus.html.j2',
                           import_corpus_form=import_corpus_form,
                           title='Import Corpus')
@corpora.route('/<int:corpus_id>')
@login_required
def corpus(corpus_id):
@ -60,6 +145,20 @@ def corpus(corpus_id):
title='Corpus')
@corpora.route('/<int:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    '''
    Send the zip archive of a corpus as a file download.

    Responds with 404 if the corpus does not exist or no archive has been
    recorded for it, and with 403 if the current user is neither the creator
    of the corpus nor an administrator.
    '''
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.creator == current_user
            or current_user.is_administrator()):
        abort(403)
    # archive_file stays empty until an archive has been built for the
    # corpus; without this guard os.path would fail on None.
    if corpus.archive_file is None:
        abort(404)
    archive_dir, archive_name = os.path.split(corpus.archive_file)
    return send_from_directory(directory=archive_dir,
                               filename=archive_name,
                               mimetype='application/zip',
                               as_attachment=True)
@corpora.route('/<int:corpus_id>/analyse')
@login_required
def analyse_corpus(corpus_id):

View File

@ -26,6 +26,7 @@ def background(f):
@wraps(f)
def wrapped(*args, **kwargs):
kwargs['app'] = current_app._get_current_object()
kwargs['current_user'] = current_user._get_current_object()
thread = socketio.start_background_task(f, *args, **kwargs)
return thread
return wrapped

View File

@ -555,6 +555,7 @@ class Corpus(db.Model):
max_nr_of_tokens = db.Column(db.BigInteger, default=2147483647)
status = db.Column(db.String(16))
title = db.Column(db.String(32))
archive_file = db.Column(db.String(255))
# Relationships
files = db.relationship('CorpusFile', backref='corpus', lazy='dynamic',
cascade='save-update, merge, delete')

View File

@ -34,6 +34,16 @@ main {
height: 19.5px !important;
}
/*
 * Changes the preloader size etc. so that it fits better visually with the
 * chip status indicator of jobs.
 */
.status-spinner {
margin-bottom: -10px;
width: 30px !important;
height: 30px !important;
}
/* flat-interaction addition to show background color */
.flat-interaction {

View File

@ -64,7 +64,6 @@ class ResultsList extends List {
* has been issued by the user.
*/
resetFields() {
this.addToSubResultsStatus = {};
this.subResultsIndexes = {};
}
@ -216,17 +215,21 @@ class ResultsList extends List {
btn.textContent = "add";
}
/**
* Either adds or removes a match to the sub-results. For this it checks
* onclick if the current button has been checked or not. For this the
* function checks if its status in addToSubResultsStatus is either flase or
* true. Adds match to sub-results if status is false if status is true it
* removes it.
* This function is invoked when the users adds or removes a match using the
* add-btn (+ button/or green checkmark) to/from sub-results. When the button
* is clicked the function checks if the current dataIndex ID is already
* saved in subResultsIndexes or not. If it is not the dataIndex will be used
* as a key in subResultsIndexes with the value true. If it is already added
* the entry with the key dataIndex will be deleted from subResultsIndexes.
* Visual feedback (green checkmark if a match has been added etc.) is also
* handled on the basis of the information stored in subResultsIndexes.
*/
addToSubResults(dataIndex, client, tableCall=true) {
let toShowArray;
dataIndex = parseInt(dataIndex);
if (!this.subResultsIndexes[dataIndex]
|| this.subResultsIndexes === undefined) {
|| this.subResultsIndexes[dataIndex] === undefined) {
// add button is activated because status is false or undefined
this.helperActivateAddBtn(event.target);
this.subResultsIndexes[dataIndex] = true;
toShowArray = Object.keys(this.subResultsIndexes).map(index => parseInt(index));
@ -273,7 +276,7 @@ class ResultsList extends List {
this.getHTMLElements(['#query-results-table']);
let container = this.queryResultsTable.querySelector(`[data-index="${dataIndex}"]`);
let tableAddBtn = container.querySelector('.add-btn'); // gets the add button from the list view
if (this.addToSubResultsStatus[dataIndex]) {
if (this.subResultsIndexes[dataIndex]) {
this.helperActivateAddBtn(tableAddBtn);
} else {
this.helperDeactivateAddBtn(tableAddBtn);

View File

@ -281,7 +281,7 @@ function exportFullContextSwitch(resultsList) {
function createFullResults(resultsList, results) {
resultsList.fullResultsCreate.onclick = (event) => {
resultsList.fullResultsCreate.querySelector('i').classList.toggle('hide');
resultsList.fullResultsCreate.innerText = 'Creating...';
resultsList.fullResultsCreate.textContent = 'Creating...';
resultsList.fullResultsCreate.insertAdjacentHTML('afterbegin',
loadingSpinnerHTML);
// .keys() is for a zero based array. I think...
@ -302,7 +302,7 @@ function createSubResults(resultsList, results) {
dataIndexes.push(id);
});
resultsList.subResultsCreate.querySelector('i').classList.toggle('hide');
resultsList.subResultsCreate.innerText = 'Creating...';
resultsList.subResultsCreate.textContent = 'Creating...';
resultsList.subResultsCreate.insertAdjacentHTML('afterbegin',
loadingSpinnerHTML);
// Empty subResultsData so that no previous data is used.

View File

@ -4,7 +4,9 @@
<div class="col s12 m4">
<h3 id="title">{{ corpus.title }}</h3>
<p id="description">{{ corpus.description }}</p>
<div class="active preloader-wrapper small hide" id="progress-indicator">
<span class="chip status white-text hide" id="status"></span>
<div class="active preloader-wrapper small hide status-spinner"
id="progress-indicator">
<div class="spinner-layer spinner-blue-only">
<div class="circle-clipper left">
<div class="circle"></div>
@ -17,7 +19,6 @@
</div>
</div>
</div>
<span class="chip status white-text hide" id="status"></span>
</div>
<div class="col s12 m8">
@ -50,6 +51,7 @@
<div class="card-action right-align">
<a href="{{ url_for('corpora.analyse_corpus', corpus_id=corpus.id) }}" class="btn disabled hide waves-effect waves-light" id="analyze"><i class="material-icons left">search</i>Analyze</a>
<a href="{{ url_for('corpora.prepare_corpus', corpus_id=corpus.id) }}" class="btn disabled hide waves-effect waves-light" id="build"><i class="material-icons left">build</i>Build</a>
<a class="btn hide waves-effect waves-light download" id="corpus_create_zip"><i class="material-icons left">import_export</i>Export Corpus</a>
<a data-target="delete-corpus-modal" class="btn modal-trigger red waves-effect waves-light"><i class="material-icons left">delete</i>Delete</a>
</div>
</div>
@ -109,7 +111,10 @@
</div>
<script type="module">
import {RessourceList} from '../../static/js/nopaque.lists.js';
import {
RessourceList
} from '../../static/js/nopaque.lists.js';
class InformationUpdater {
constructor(corpusId, foreignCorpusFlag) {
this.corpusId = corpusId;
@ -186,6 +191,13 @@
} else {
buildElement.classList.add("disabled", "hide");
}
let downloadBtn = document.querySelector('#corpus_create_zip');
if (status === "prepared") {
downloadBtn.classList.toggle('hide', false);
} else {
downloadBtn.classList.toggle('hide', true);
}
}
}
@ -202,5 +214,24 @@
document.addEventListener("DOMContentLoaded", () => {
corpusFilesList._add({{ corpus_files|tojson|safe }});
});
// Events to handle full corpus download
let downloadBtn = document.querySelector('#corpus_create_zip');
downloadBtn.addEventListener('click', () => {
nopaque.flash('Compressing your corpus', 'corpus')
nopaque.socket.emit('corpus_create_zip', {{ corpus.id }});
downloadBtn.classList.toggle('disabled', true);
});
document.addEventListener('DOMContentLoaded', () => {
nopaque.socket.on('corpus_zip_created', () => {
nopaque.flash('Downloading your corpus', 'corpus');
downloadBtn.classList.toggle('disabled', false);
// Little trick to call the download view after zipping has finished
let fakeBtn = document.createElement('a');
fakeBtn.href = '{{ url_for('corpora.export_corpus',
corpus_id=corpus.id) }}';
fakeBtn.click();
});
});
</script>
{% endblock %}

View File

@ -0,0 +1,46 @@
{% extends "nopaque.html.j2" %}
{% block page_content %}
<div class="col s12 m4">
<p>Fill out the following form to import a corpus.</p>
<a class="waves-effect waves-light btn" href="{{ url_for('main.dashboard') }}"><i class="material-icons left">arrow_back</i>Back to dashboard</a>
</div>
<div class="col s12 m8">
<form class="nopaque-submit-form" data-progress-modal="progress-modal">
<div class="card">
<div class="card-content">
{{ import_corpus_form.hidden_tag() }}
<div class="row">
<div class="col s12 m4">
{{ M.render_field(import_corpus_form.title, data_length='32', material_icon='title') }}
</div>
<div class="col s12 m8">
{{ M.render_field(import_corpus_form.description, data_length='255', material_icon='description') }}
</div>
</div>
<div class="row">
<div class="col s12">
{{ M.render_field(import_corpus_form.file, accept='.zip', placeholder='Choose your exported .zip file') }}
</div>
</div>
</div>
<div class="card-action right-align">
{{ M.render_field(import_corpus_form.submit, material_icon='send') }}
</div>
</div>
</form>
</div>
<div id="progress-modal" class="modal">
<div class="modal-content">
<h4><i class="material-icons prefix">file_upload</i> Uploading file...</h4>
<div class="progress">
<div class="determinate" style="width: 0%"></div>
</div>
</div>
<div class="modal-footer">
<a href="#!" class="modal-close waves-effect waves-light btn red abort-request">Cancel</a>
</div>
</div>
{% endblock %}

View File

@ -6,20 +6,18 @@ result.-->
<h6 style="margin-top: 0px;">Infos</h6>
<div class="divider" style="margin-bottom: 10px;"></div>
<div class="row">
<div class="col s12">
<button id="loading-matches"
class="waves-effect
waves-light
btn-flat
flat-interaction
disabled black-text"
style="color: #000 !important;"
type="submit">
<div class="col s12"
style="height: 39px;
margin-top: 0px;
padding-top: 5px;
padding-left: 1.75rem;">
<span id="loading-matches"
class="black-text">
<i class="material-icons left">dvr</i>
<span id="recieved-match-count"></span>/
<span id="total-match-count"></span>
matches loaded
</button>
</span>
</div>
<div class="col s12">
<div class="progress hide" id="query-progress-bar">

View File

@ -32,7 +32,8 @@
<div class="col s4 m3 l2 right-align">
<span class="chip status white-text"></span>
<div class="active preloader-wrapper small" id="progress-indicator">
<div class="active preloader-wrapper small status-spinner"
id="progress-indicator">
<div class="spinner-layer spinner-blue-only">
<div class="circle-clipper left">
<div class="circle"></div>

View File

@ -37,6 +37,7 @@
<ul class="pagination paginationBottom"></ul>
</div>
<div class="card-action right-align">
<a class="waves-effect waves-light btn" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
<a class="waves-effect waves-light btn" href="{{ url_for('corpora.add_corpus') }}">New corpus<i class="material-icons right">add</i></a>
</div>
</div>

View File

@ -45,7 +45,8 @@
<ul class="pagination paginationBottom"></ul>
</div>
<div class="card-action right-align">
<a class="btn corpus-analysis-color darken waves-effect waves-light" href="{{ url_for('corpora.add_corpus') }}">New corpus<i class="material-icons right">add</i></a>
<a class="waves-effect waves-light btn" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
<a class="waves-effect waves-light btn" href="{{ url_for('corpora.add_corpus') }}">New corpus<i class="material-icons right">add</i></a>
</div>
</div>
</div>

View File

@ -0,0 +1,30 @@
"""empty message
Revision ID: befe5326787e
Revises: ecaf75fece7b
Create Date: 2020-10-16 13:32:09.620960
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = 'befe5326787e'
down_revision = 'ecaf75fece7b'
branch_labels = None
depends_on = None
def upgrade():
    """Add ``archive_file`` to the ``corpora`` table and drop ``archive_dir``."""
    # ### commands auto generated by Alembic - please adjust! ###
    archive_file = sa.Column('archive_file',
                             sa.String(length=255),
                             nullable=True)
    op.add_column('corpora', archive_file)
    op.drop_column('corpora', 'archive_dir')
    # ### end Alembic commands ###
def downgrade():
    """Restore ``archive_dir`` on the ``corpora`` table and drop ``archive_file``."""
    # ### commands auto generated by Alembic - please adjust! ###
    archive_dir = sa.Column('archive_dir',
                            sa.VARCHAR(length=255),
                            autoincrement=False,
                            nullable=True)
    op.add_column('corpora', archive_dir)
    op.drop_column('corpora', 'archive_file')
    # ### end Alembic commands ###

View File

@ -0,0 +1,28 @@
"""empty message
Revision ID: ecaf75fece7b
Revises: c3827cddea6e
Create Date: 2020-10-16 13:31:30.681269
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = 'ecaf75fece7b'
down_revision = 'c3827cddea6e'
branch_labels = None
depends_on = None
def upgrade():
    """Add the nullable ``archive_dir`` column to the ``corpora`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    archive_dir = sa.Column('archive_dir',
                            sa.String(length=255),
                            nullable=True)
    op.add_column('corpora', archive_dir)
    # ### end Alembic commands ###
def downgrade():
    """Drop the ``archive_dir`` column from the ``corpora`` table again."""
    # ### commands auto generated by Alembic - please adjust! ###
    table_name, column_name = 'corpora', 'archive_dir'
    op.drop_column(table_name, column_name)
    # ### end Alembic commands ###