mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-01-24 08:40:33 +00:00
Reimplement corpus import and activate it again
This commit is contained in:
parent
de4a83582d
commit
9d4001f469
@ -1,89 +0,0 @@
|
||||
# Member names expected inside an exported corpus ZIP archive.
# Entries ending in '/' are directory members; the order is kept as-is.
check_zip_contents = [
    'data/',
    'merged/',
    'registry/',
    'registry/corpus',
    'data/corpus/',
    'data/corpus/text_editor.avs',
    'data/corpus/pos.lexicon',
    'data/corpus/simple_pos.huf',
    'data/corpus/word.huf',
    'data/corpus/text_booktitle.avs',
    'data/corpus/word.lexicon.srt',
    'data/corpus/word.lexicon.idx',
    'data/corpus/simple_pos.crx',
    'data/corpus/text_pages.rng',
    'data/corpus/simple_pos.crc',
    'data/corpus/ner.lexicon',
    'data/corpus/lemma.huf',
    'data/corpus/text_title.rng',
    'data/corpus/text_chapter.avx',
    'data/corpus/lemma.lexicon.srt',
    'data/corpus/lemma.lexicon.idx',
    'data/corpus/text_school.rng',
    'data/corpus/text_journal.avs',
    'data/corpus/simple_pos.lexicon',
    'data/corpus/pos.huf',
    'data/corpus/text_editor.avx',
    'data/corpus/lemma.crc',
    'data/corpus/lemma.lexicon',
    'data/corpus/pos.hcd',
    'data/corpus/text_title.avx',
    'data/corpus/text_institution.avs',
    'data/corpus/text_address.avx',
    'data/corpus/lemma.corpus.cnt',
    'data/corpus/word.crx',
    'data/corpus/simple_pos.hcd',
    'data/corpus/simple_pos.huf.syn',
    'data/corpus/simple_pos.lexicon.srt',
    'data/corpus/text_author.avx',
    'data/corpus/text_publisher.avs',
    'data/corpus/text_chapter.avs',
    'data/corpus/ner.corpus.cnt',
    'data/corpus/pos.huf.syn',
    'data/corpus/text_booktitle.rng',
    'data/corpus/lemma.huf.syn',
    'data/corpus/pos.corpus.cnt',
    'data/corpus/word.lexicon',
    'data/corpus/text_publishing_year.avs',
    'data/corpus/lemma.hcd',
    'data/corpus/text_school.avs',
    'data/corpus/text_journal.rng',
    'data/corpus/word.corpus.cnt',
    'data/corpus/text_school.avx',
    'data/corpus/text_journal.avx',
    'data/corpus/pos.lexicon.srt',
    'data/corpus/text_title.avs',
    'data/corpus/word.hcd',
    'data/corpus/text_chapter.rng',
    'data/corpus/text_address.rng',
    'data/corpus/ner.hcd',
    'data/corpus/text_publisher.avx',
    'data/corpus/text_institution.rng',
    'data/corpus/lemma.crx',
    'data/corpus/pos.crc',
    'data/corpus/text_author.rng',
    'data/corpus/text_address.avs',
    'data/corpus/pos.lexicon.idx',
    'data/corpus/ner.huf',
    'data/corpus/ner.huf.syn',
    'data/corpus/text_pages.avs',
    'data/corpus/text_publishing_year.avx',
    'data/corpus/ner.lexicon.idx',
    'data/corpus/text.rng',
    'data/corpus/word.crc',
    'data/corpus/ner.crc',
    'data/corpus/text_publisher.rng',
    'data/corpus/text_editor.rng',
    'data/corpus/text_author.avs',
    'data/corpus/s.rng',
    'data/corpus/text_publishing_year.rng',
    'data/corpus/simple_pos.corpus.cnt',
    'data/corpus/simple_pos.lexicon.idx',
    'data/corpus/word.huf.syn',
    'data/corpus/ner.lexicon.srt',
    'data/corpus/text_pages.avx',
    'data/corpus/text_booktitle.avx',
    'data/corpus/pos.crx',
    'data/corpus/ner.crx',
    'data/corpus/text_institution.avx',
    'merged/corpus.vrt',
]
|
@ -21,9 +21,9 @@ from .forms import (
|
||||
EditCorpusFileForm,
|
||||
ImportCorpusForm
|
||||
)
|
||||
from .import_corpus import check_zip_contents
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import glob
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
@ -58,26 +58,10 @@ def add_corpus():
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send the archive file of a corpus as a download attachment.

    Looks the corpus up by ``corpus_id`` (404 if it does not exist) and
    answers 403 unless the current user owns the corpus or is an
    administrator. The file named by ``corpus.archive_file`` is served from
    the ``corpora`` directory below the owning user's path.
    """
    # NOTE(review): this unconditional abort answers every request with 503,
    # so nothing below it ever runs. Presumably the export is switched off on
    # purpose -- confirm before removing it.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return send_from_directory(
        as_attachment=True,
        directory=os.path.join(corpus.user.path, 'corpora'),
        filename=corpus.archive_file,
        # 'zip' alone is not a valid media type; use the registered one.
        mimetype='application/zip'
    )
|
||||
|
||||
|
||||
@bp.route('/import', methods=['GET', 'POST'])
|
||||
@login_required
|
||||
def import_corpus():
|
||||
abort(503)
|
||||
form = ImportCorpusForm()
|
||||
form = ImportCorpusForm(prefix='import-corpus-form')
|
||||
if form.is_submitted():
|
||||
if not form.validate():
|
||||
return make_response(form.errors, 400)
|
||||
@ -87,61 +71,71 @@ def import_corpus():
|
||||
title=form.title.data
|
||||
)
|
||||
db.session.add(corpus)
|
||||
db.session.flush()
|
||||
db.session.flush(objects=[corpus])
|
||||
db.session.refresh(corpus)
|
||||
try:
|
||||
os.makedirs(corpus.path)
|
||||
corpus.makedirs()
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error', category='error')
|
||||
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
|
||||
# Upload zip
|
||||
archive_file = os.path.join(corpus.path, form.file.data.filename)
|
||||
form.file.data.save(archive_file)
|
||||
# Some checks to verify it is a valid exported corpus
|
||||
with ZipFile(archive_file, 'r') as zip:
|
||||
contents = zip.namelist()
|
||||
if set(check_zip_contents).issubset(contents):
|
||||
# Unzip
|
||||
shutil.unpack_archive(archive_file, corpus.path)
|
||||
# Register vrt files to corpus
|
||||
vrts = glob.glob(corpus.path + '/*.vrt')
|
||||
for file in vrts:
|
||||
element_tree = ET.parse(file)
|
||||
# Save the uploaded zip file in a temporary directory
|
||||
tmp_dir_base = os.path.join(current_app.config['NOPAQUE_DATA_DIR'], 'tmp') # noqa
|
||||
with tempfile.TemporaryDirectory(dir=tmp_dir_base) as tmp_dir:
|
||||
archive_file = os.path.join(tmp_dir, 'corpus.zip')
|
||||
try:
|
||||
form.archive.data.save(archive_file)
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error1', category='error')
|
||||
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
|
||||
shutil.unpack_archive(archive_file, extract_dir=tmp_dir)
|
||||
for vrt_filename in [x for x in os.listdir(tmp_dir) if x.endswith('.vrt')]:
|
||||
vrt_file = os.path.join(tmp_dir, vrt_filename)
|
||||
element_tree = ET.parse(vrt_file)
|
||||
text_node = element_tree.find('text')
|
||||
corpus_file = CorpusFile(
|
||||
address=text_node.get('address', 'NULL'),
|
||||
author=text_node.get('author', 'NULL'),
|
||||
booktitle=text_node.get('booktitle', 'NULL'),
|
||||
chapter=text_node.get('chapter', 'NULL'),
|
||||
author=text_node.get('author'),
|
||||
corpus=corpus,
|
||||
editor=text_node.get('editor', 'NULL'),
|
||||
filename=os.path.basename(file),
|
||||
institution=text_node.get('institution', 'NULL'),
|
||||
journal=text_node.get('journal', 'NULL'),
|
||||
pages=text_node.get('pages', 'NULL'),
|
||||
publisher=text_node.get('publisher', 'NULL'),
|
||||
publishing_year=text_node.get('publishing_year', ''),
|
||||
school=text_node.get('school', 'NULL'),
|
||||
title=text_node.get('title', 'NULL')
|
||||
filename=vrt_filename,
|
||||
mimetype='application/vrt+xml',
|
||||
publishing_year=int(text_node.get('publishing_year')),
|
||||
title=text_node.get('title')
|
||||
)
|
||||
if 'address' not in text_node.attrib:
|
||||
corpus_file.address = text_node.get('address')
|
||||
if 'booktitle' not in text_node.attrib:
|
||||
corpus_file.booktitle = text_node.get('booktitle')
|
||||
if 'chapter' not in text_node.attrib:
|
||||
corpus_file.chapter = text_node.get('chapter')
|
||||
if 'editor' not in text_node.attrib:
|
||||
corpus_file.editor = text_node.get('editor')
|
||||
if 'institution' not in text_node.attrib:
|
||||
corpus_file.institution = text_node.get('institution')
|
||||
if 'journal' not in text_node.attrib:
|
||||
corpus_file.journal = text_node.get('journal')
|
||||
if 'pages' not in text_node.attrib:
|
||||
corpus_file.pages = text_node.get('pages')
|
||||
if 'publisher' not in text_node.attrib:
|
||||
corpus_file.publisher = text_node.get('publisher')
|
||||
if 'school' not in text_node.attrib:
|
||||
corpus_file.school = text_node.get('school')
|
||||
db.session.add(corpus_file)
|
||||
# finish import and redirect to imported corpus
|
||||
corpus.status = CorpusStatus.BUILT
|
||||
db.session.commit()
|
||||
os.remove(archive_file)
|
||||
flash(f'Corpus "{corpus.title}" imported', 'corpus')
|
||||
return make_response(
|
||||
{'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
|
||||
else:
|
||||
# If imported zip is not valid delete corpus and give feedback
|
||||
flash(
|
||||
f'Can\'t import corpus "{corpus.title}": Invalid archive file',
|
||||
category='error'
|
||||
)
|
||||
tasks.delete_corpus(corpus.id)
|
||||
return make_response({'redirect_url': url_for('.import_corpus')}, 201) # noqa
|
||||
db.session.flush(objects=[corpus_file])
|
||||
db.session.refresh(corpus)
|
||||
current_app.logger.warning(vrt_file)
|
||||
current_app.logger.warning(corpus_file.path)
|
||||
try:
|
||||
shutil.copy2(vrt_file, corpus_file.path)
|
||||
except Exception as e:
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error2', category='error')
|
||||
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
|
||||
db.session.commit()
|
||||
flash(f'Corpus "{corpus.title}" imported', 'corpus')
|
||||
return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
|
||||
return render_template(
|
||||
'corpora/import_corpus.html.j2',
|
||||
form=form,
|
||||
@ -173,6 +167,26 @@ def analyse_corpus(corpus_id):
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Hand a corpus to ``tasks.build_corpus`` and return to its page.

    Answers 404 for an unknown ``corpus_id`` and 403 unless the current user
    owns the corpus or is an administrator. A corpus without any files is
    not passed on; an error message is flashed instead. In both cases the
    response is a redirect to the corpus page.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    if not corpus.files.all():
        # Nothing to build from: only report the problem.
        flash(
            f'Can\'t build corpus "{corpus.title}": No corpus file(s)',
            category='error'
        )
    else:
        tasks.build_corpus(corpus_id)
        flash(
            f'Corpus "{corpus.title}" marked for building',
            category='corpus'
        )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/delete')
|
||||
@login_required
|
||||
def delete_corpus(corpus_id):
|
||||
@ -184,6 +198,73 @@ def delete_corpus(corpus_id):
|
||||
return redirect(url_for('main.dashboard'))
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send the archive file of a corpus as a download attachment.

    Looks the corpus up by ``corpus_id`` (404 if it does not exist) and
    answers 403 unless the current user owns the corpus or is an
    administrator. The file named by ``corpus.archive_file`` is served from
    the ``corpora`` directory below the owning user's path.
    """
    # NOTE(review): this unconditional abort answers every request with 503,
    # so nothing below it ever runs. Presumably the export is switched off on
    # purpose -- confirm before removing it.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return send_from_directory(
        as_attachment=True,
        directory=os.path.join(corpus.user.path, 'corpora'),
        filename=corpus.archive_file,
        # 'zip' alone is not a valid media type; use the registered one.
        mimetype='application/zip'
    )
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the edit form for one corpus file's metadata.

    Answers 404 when no file with ``corpus_file_id`` belongs to the corpus
    ``corpus_id`` and 403 unless the current user owns that corpus or is an
    administrator. A valid submission copies the form values onto the file,
    sets the corpus status to ``CorpusStatus.UNPREPARED``, commits and
    redirects to the corpus page. Otherwise the form is filled with the
    file's current values and the edit template is rendered.
    """
    # Attributes that exist under the same name on the form and on the file;
    # they are copied in this order in both directions.
    metadata_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title',
    )
    corpus_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    is_owner = corpus_file.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(corpus_file, field_name, getattr(form, field_name).data)
        corpus_file.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{corpus_file.filename}" edited', category='corpus')  # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(corpus_file, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=corpus_file.corpus,
        corpus_file=corpus_file,
        form=form,
        title='Edit corpus file'
    )
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST'])
|
||||
@login_required
|
||||
def add_corpus_file(corpus_id):
|
||||
@ -271,76 +352,4 @@ def download_corpus_file(corpus_id, corpus_file_id):
|
||||
attachment_filename=corpus_file.filename,
|
||||
directory=os.path.dirname(corpus_file.path),
|
||||
filename=os.path.basename(corpus_file.path)
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the edit form for one corpus file's metadata.

    Answers 404 when no file with ``corpus_file_id`` belongs to the corpus
    ``corpus_id`` and 403 unless the current user owns that corpus or is an
    administrator. A valid submission copies the form values onto the file,
    sets the corpus status to ``CorpusStatus.UNPREPARED``, commits and
    redirects to the corpus page. Otherwise the form is filled with the
    file's current values and the edit template is rendered.
    """
    # Attributes that exist under the same name on the form and on the file;
    # they are copied in this order in both directions.
    metadata_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title',
    )
    corpus_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    is_owner = corpus_file.corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(corpus_file, field_name, getattr(form, field_name).data)
        corpus_file.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{corpus_file.filename}" edited', category='corpus')  # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(corpus_file, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=corpus_file.corpus,
        corpus_file=corpus_file,
        form=form,
        title='Edit corpus file'
    )
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Hand a corpus to ``tasks.build_corpus`` and return to its page.

    Answers 404 for an unknown ``corpus_id`` and 403 unless the current user
    owns the corpus or is an administrator. A corpus without any files is
    not passed on; an error message is flashed instead. In both cases the
    response is a redirect to the corpus page.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    if not corpus.files.all():
        # Nothing to build from: only report the problem.
        flash(
            f'Can\'t build corpus "{corpus.title}": No corpus file(s)',
            category='error'
        )
    else:
        tasks.build_corpus(corpus_id)
        flash(
            f'Corpus "{corpus.title}" marked for building',
            category='corpus'
        )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
|
||||
)
|
@ -31,7 +31,7 @@
|
||||
</div>
|
||||
<div class="row">
|
||||
<div class="col s12">
|
||||
{{ wtf.render_field(form.file, accept='.zip', placeholder='Choose your exported .zip file') }}
|
||||
{{ wtf.render_field(form.archive, accept='.zip', placeholder='Choose an exported ZIP archive') }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -40,7 +40,7 @@
|
||||
<ul class="pagination"></ul>
|
||||
</div>
|
||||
<div class="card-action right-align">
|
||||
<a class="btn disabled waves-effect waves-light" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
|
||||
<a class="btn waves-effect waves-light" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
|
||||
<a class="btn waves-effect waves-light" href="{{ url_for('corpora.add_corpus') }}">New corpus<i class="material-icons right">add</i></a>
|
||||
</div>
|
||||
</div>
|
||||
|
Loading…
x
Reference in New Issue
Block a user