Reimplement corpus import and activate it again

This commit is contained in:
Patrick Jentsch 2022-04-19 11:48:44 +02:00
parent de4a83582d
commit 9d4001f469
4 changed files with 145 additions and 225 deletions

View File

@ -1,89 +0,0 @@
# Archive member names expected inside an exported corpus ZIP file.
# The corpus import view compares this list against ZipFile.namelist() and
# only accepts an upload when every entry listed here is present.
# NOTE(review): the entries look like the registry/data files of a built
# corpus plus the merged VRT file -- confirm against the export code.
check_zip_contents = ['data/',
'merged/',
'registry/',
'registry/corpus',
'data/corpus/',
'data/corpus/text_editor.avs',
'data/corpus/pos.lexicon',
'data/corpus/simple_pos.huf',
'data/corpus/word.huf',
'data/corpus/text_booktitle.avs',
'data/corpus/word.lexicon.srt',
'data/corpus/word.lexicon.idx',
'data/corpus/simple_pos.crx',
'data/corpus/text_pages.rng',
'data/corpus/simple_pos.crc',
'data/corpus/ner.lexicon',
'data/corpus/lemma.huf',
'data/corpus/text_title.rng',
'data/corpus/text_chapter.avx',
'data/corpus/lemma.lexicon.srt',
'data/corpus/lemma.lexicon.idx',
'data/corpus/text_school.rng',
'data/corpus/text_journal.avs',
'data/corpus/simple_pos.lexicon',
'data/corpus/pos.huf',
'data/corpus/text_editor.avx',
'data/corpus/lemma.crc',
'data/corpus/lemma.lexicon',
'data/corpus/pos.hcd',
'data/corpus/text_title.avx',
'data/corpus/text_institution.avs',
'data/corpus/text_address.avx',
'data/corpus/lemma.corpus.cnt',
'data/corpus/word.crx',
'data/corpus/simple_pos.hcd',
'data/corpus/simple_pos.huf.syn',
'data/corpus/simple_pos.lexicon.srt',
'data/corpus/text_author.avx',
'data/corpus/text_publisher.avs',
'data/corpus/text_chapter.avs',
'data/corpus/ner.corpus.cnt',
'data/corpus/pos.huf.syn',
'data/corpus/text_booktitle.rng',
'data/corpus/lemma.huf.syn',
'data/corpus/pos.corpus.cnt',
'data/corpus/word.lexicon',
'data/corpus/text_publishing_year.avs',
'data/corpus/lemma.hcd',
'data/corpus/text_school.avs',
'data/corpus/text_journal.rng',
'data/corpus/word.corpus.cnt',
'data/corpus/text_school.avx',
'data/corpus/text_journal.avx',
'data/corpus/pos.lexicon.srt',
'data/corpus/text_title.avs',
'data/corpus/word.hcd',
'data/corpus/text_chapter.rng',
'data/corpus/text_address.rng',
'data/corpus/ner.hcd',
'data/corpus/text_publisher.avx',
'data/corpus/text_institution.rng',
'data/corpus/lemma.crx',
'data/corpus/pos.crc',
'data/corpus/text_author.rng',
'data/corpus/text_address.avs',
'data/corpus/pos.lexicon.idx',
'data/corpus/ner.huf',
'data/corpus/ner.huf.syn',
'data/corpus/text_pages.avs',
'data/corpus/text_publishing_year.avx',
'data/corpus/ner.lexicon.idx',
'data/corpus/text.rng',
'data/corpus/word.crc',
'data/corpus/ner.crc',
'data/corpus/text_publisher.rng',
'data/corpus/text_editor.rng',
'data/corpus/text_author.avs',
'data/corpus/s.rng',
'data/corpus/text_publishing_year.rng',
'data/corpus/simple_pos.corpus.cnt',
'data/corpus/simple_pos.lexicon.idx',
'data/corpus/word.huf.syn',
'data/corpus/ner.lexicon.srt',
'data/corpus/text_pages.avx',
'data/corpus/text_booktitle.avx',
'data/corpus/pos.crx',
'data/corpus/ner.crx',
'data/corpus/text_institution.avx',
'merged/corpus.vrt']

View File

@ -21,9 +21,9 @@ from .forms import (
EditCorpusFileForm,
ImportCorpusForm
)
from .import_corpus import check_zip_contents
import os
import shutil
import tempfile
import glob
import xml.etree.ElementTree as ET
@ -58,26 +58,10 @@ def add_corpus():
)
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Serve the exported archive of a corpus as a file download.

    The view calls ``abort(503)`` as its very first statement, so the
    export logic below it is not reached at the moment. Were it reached,
    it would look the corpus up (404 if unknown), reject callers who are
    neither the corpus owner nor an administrator with 403, and send
    ``corpus.archive_file`` from the owner's ``corpora`` directory.
    """
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = corpus.user == current_user
    # The administrator check only runs when the caller is not the owner.
    if not is_owner and not current_user.is_administrator():
        abort(403)
    archive_dir = os.path.join(corpus.user.path, 'corpora')
    return send_from_directory(
        directory=archive_dir,
        filename=corpus.archive_file,
        as_attachment=True,
        mimetype='zip'
    )
@bp.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
abort(503)
form = ImportCorpusForm()
form = ImportCorpusForm(prefix='import-corpus-form')
if form.is_submitted():
if not form.validate():
return make_response(form.errors, 400)
@ -87,61 +71,71 @@ def import_corpus():
title=form.title.data
)
db.session.add(corpus)
db.session.flush()
db.session.flush(objects=[corpus])
db.session.refresh(corpus)
try:
os.makedirs(corpus.path)
corpus.makedirs()
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error', category='error')
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
# Upload zip
archive_file = os.path.join(corpus.path, form.file.data.filename)
form.file.data.save(archive_file)
# Some checks to verify it is a valid exported corpus
with ZipFile(archive_file, 'r') as zip:
contents = zip.namelist()
if set(check_zip_contents).issubset(contents):
# Unzip
shutil.unpack_archive(archive_file, corpus.path)
# Register vrt files to corpus
vrts = glob.glob(corpus.path + '/*.vrt')
for file in vrts:
element_tree = ET.parse(file)
# Save the uploaded zip file in a temporary directory
tmp_dir_base = os.path.join(current_app.config['NOPAQUE_DATA_DIR'], 'tmp') # noqa
with tempfile.TemporaryDirectory(dir=tmp_dir_base) as tmp_dir:
archive_file = os.path.join(tmp_dir, 'corpus.zip')
try:
form.archive.data.save(archive_file)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error1', category='error')
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
shutil.unpack_archive(archive_file, extract_dir=tmp_dir)
for vrt_filename in [x for x in os.listdir(tmp_dir) if x.endswith('.vrt')]:
vrt_file = os.path.join(tmp_dir, vrt_filename)
element_tree = ET.parse(vrt_file)
text_node = element_tree.find('text')
corpus_file = CorpusFile(
address=text_node.get('address', 'NULL'),
author=text_node.get('author', 'NULL'),
booktitle=text_node.get('booktitle', 'NULL'),
chapter=text_node.get('chapter', 'NULL'),
author=text_node.get('author'),
corpus=corpus,
editor=text_node.get('editor', 'NULL'),
filename=os.path.basename(file),
institution=text_node.get('institution', 'NULL'),
journal=text_node.get('journal', 'NULL'),
pages=text_node.get('pages', 'NULL'),
publisher=text_node.get('publisher', 'NULL'),
publishing_year=text_node.get('publishing_year', ''),
school=text_node.get('school', 'NULL'),
title=text_node.get('title', 'NULL')
filename=vrt_filename,
mimetype='application/vrt+xml',
publishing_year=int(text_node.get('publishing_year')),
title=text_node.get('title')
)
# Optional bibliographic attributes: copy each one onto the corpus file only
# when the <text> element actually carries it. The test must be for presence
# (`in`); testing for absence would skip every attribute found in the VRT
# file and merely assign None for the missing ones.
for optional_attr in (
    'address', 'booktitle', 'chapter', 'editor', 'institution',
    'journal', 'pages', 'publisher', 'school'
):
    if optional_attr in text_node.attrib:
        setattr(corpus_file, optional_attr, text_node.get(optional_attr))
db.session.add(corpus_file)
# finish import and redirect to imported corpus
corpus.status = CorpusStatus.BUILT
db.session.commit()
os.remove(archive_file)
flash(f'Corpus "{corpus.title}" imported', 'corpus')
return make_response(
{'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
else:
# If imported zip is not valid delete corpus and give feedback
flash(
f'Can\'t import corpus "{corpus.title}": Invalid archive file',
category='error'
)
tasks.delete_corpus(corpus.id)
return make_response({'redirect_url': url_for('.import_corpus')}, 201) # noqa
db.session.flush(objects=[corpus_file])
db.session.refresh(corpus)
current_app.logger.warning(vrt_file)
current_app.logger.warning(corpus_file.path)
try:
shutil.copy2(vrt_file, corpus_file.path)
except Exception as e:
db.session.rollback()
flash('Internal Server Error2', category='error')
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
db.session.commit()
flash(f'Corpus "{corpus.title}" imported', 'corpus')
return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
return render_template(
'corpora/import_corpus.html.j2',
form=form,
@ -173,6 +167,26 @@ def analyse_corpus(corpus_id):
)
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Trigger a build of the given corpus and return to its page.

    Responds with 404 for an unknown corpus and 403 when the caller is
    neither the corpus owner nor an administrator. A corpus without any
    files is not handed to ``tasks.build_corpus``; an error message is
    flashed instead. Either way the user is redirected to the corpus page.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    # Guard clause: nothing to build without at least one corpus file.
    if not corpus.files.all():
        flash(
            f'Can\'t build corpus "{corpus.title}": No corpus file(s)',
            category='error'
        )
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    tasks.build_corpus(corpus_id)
    flash(
        f'Corpus "{corpus.title}" marked for building',
        category='corpus'
    )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
@bp.route('/<hashid:corpus_id>/delete')
@login_required
def delete_corpus(corpus_id):
@ -184,6 +198,73 @@ def delete_corpus(corpus_id):
return redirect(url_for('main.dashboard'))
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Serve the exported archive of a corpus as a file download.

    The view calls ``abort(503)`` as its very first statement, so the
    export logic below it is not reached at the moment. Were it reached,
    it would look the corpus up (404 if unknown), reject callers who are
    neither the corpus owner nor an administrator with 403, and send
    ``corpus.archive_file`` from the owner's ``corpora`` directory.
    """
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = corpus.user == current_user
    # The administrator check only runs when the caller is not the owner.
    if not is_owner and not current_user.is_administrator():
        abort(403)
    archive_dir = os.path.join(corpus.user.path, 'corpora')
    return send_from_directory(
        directory=archive_dir,
        filename=corpus.archive_file,
        as_attachment=True,
        mimetype='zip'
    )
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST']) # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the metadata edit form of one corpus file.

    Responds with 404 when no file with the given id belongs to the given
    corpus and with 403 when the caller is neither the corpus owner nor an
    administrator. A valid submission copies the form values onto the
    file, sets the corpus status to UNPREPARED, commits and redirects to
    the corpus page. Otherwise the form is filled with the stored values
    and the edit page is rendered.
    """
    # Metadata attributes that exist under the same name on both the form
    # and the model; the order matches the order of the assignments.
    metadata_fields = (
        'address', 'author', 'booktitle', 'chapter', 'editor', 'institution',
        'journal', 'pages', 'publisher', 'publishing_year', 'school', 'title'
    )
    corpus_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    is_owner = corpus_file.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(corpus_file, field_name, getattr(form, field_name).data)
        corpus_file.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{corpus_file.filename}" edited', category='corpus') # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(corpus_file, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=corpus_file.corpus,
        corpus_file=corpus_file,
        form=form,
        title='Edit corpus file'
    )
@bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST'])
@login_required
def add_corpus_file(corpus_id):
@ -271,76 +352,4 @@ def download_corpus_file(corpus_id, corpus_file_id):
attachment_filename=corpus_file.filename,
directory=os.path.dirname(corpus_file.path),
filename=os.path.basename(corpus_file.path)
)
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST']) # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the metadata edit form of one corpus file.

    Responds with 404 when no file with the given id belongs to the given
    corpus and with 403 when the caller is neither the corpus owner nor an
    administrator. A valid submission copies the form values onto the
    file, sets the corpus status to UNPREPARED, commits and redirects to
    the corpus page. Otherwise the form is filled with the stored values
    and the edit page is rendered.
    """
    # Metadata attributes that exist under the same name on both the form
    # and the model; the order matches the order of the assignments.
    metadata_fields = (
        'address', 'author', 'booktitle', 'chapter', 'editor', 'institution',
        'journal', 'pages', 'publisher', 'publishing_year', 'school', 'title'
    )
    corpus_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    is_owner = corpus_file.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(corpus_file, field_name, getattr(form, field_name).data)
        corpus_file.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{corpus_file.filename}" edited', category='corpus') # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(corpus_file, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=corpus_file.corpus,
        corpus_file=corpus_file,
        form=form,
        title='Edit corpus file'
    )
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Trigger a build of the given corpus and return to its page.

    Responds with 404 for an unknown corpus and 403 when the caller is
    neither the corpus owner nor an administrator. A corpus without any
    files is not handed to ``tasks.build_corpus``; an error message is
    flashed instead. Either way the user is redirected to the corpus page.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    # Guard clause: nothing to build without at least one corpus file.
    if not corpus.files.all():
        flash(
            f'Can\'t build corpus "{corpus.title}": No corpus file(s)',
            category='error'
        )
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    tasks.build_corpus(corpus_id)
    flash(
        f'Corpus "{corpus.title}" marked for building',
        category='corpus'
    )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
)

View File

@ -31,7 +31,7 @@
</div>
<div class="row">
<div class="col s12">
{{ wtf.render_field(form.file, accept='.zip', placeholder='Choose your exported .zip file') }}
{{ wtf.render_field(form.archive, accept='.zip', placeholder='Choose an exported ZIP archive') }}
</div>
</div>
</div>

View File

@ -40,7 +40,7 @@
<ul class="pagination"></ul>
</div>
<div class="card-action right-align">
<a class="btn disabled waves-effect waves-light" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
<a class="btn waves-effect waves-light" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
<a class="btn waves-effect waves-light" href="{{ url_for('corpora.add_corpus') }}">New corpus<i class="material-icons right">add</i></a>
</div>
</div>