diff --git a/app/corpora/import_corpus.py b/app/corpora/import_corpus.py deleted file mode 100644 index a78f6f26..00000000 --- a/app/corpora/import_corpus.py +++ /dev/null @@ -1,89 +0,0 @@ -check_zip_contents = ['data/', - 'merged/', - 'registry/', - 'registry/corpus', - 'data/corpus/', - 'data/corpus/text_editor.avs', - 'data/corpus/pos.lexicon', - 'data/corpus/simple_pos.huf', - 'data/corpus/word.huf', - 'data/corpus/text_booktitle.avs', - 'data/corpus/word.lexicon.srt', - 'data/corpus/word.lexicon.idx', - 'data/corpus/simple_pos.crx', - 'data/corpus/text_pages.rng', - 'data/corpus/simple_pos.crc', - 'data/corpus/ner.lexicon', - 'data/corpus/lemma.huf', - 'data/corpus/text_title.rng', - 'data/corpus/text_chapter.avx', - 'data/corpus/lemma.lexicon.srt', - 'data/corpus/lemma.lexicon.idx', - 'data/corpus/text_school.rng', - 'data/corpus/text_journal.avs', - 'data/corpus/simple_pos.lexicon', - 'data/corpus/pos.huf', - 'data/corpus/text_editor.avx', - 'data/corpus/lemma.crc', - 'data/corpus/lemma.lexicon', - 'data/corpus/pos.hcd', - 'data/corpus/text_title.avx', - 'data/corpus/text_institution.avs', - 'data/corpus/text_address.avx', - 'data/corpus/lemma.corpus.cnt', - 'data/corpus/word.crx', - 'data/corpus/simple_pos.hcd', - 'data/corpus/simple_pos.huf.syn', - 'data/corpus/simple_pos.lexicon.srt', - 'data/corpus/text_author.avx', - 'data/corpus/text_publisher.avs', - 'data/corpus/text_chapter.avs', - 'data/corpus/ner.corpus.cnt', - 'data/corpus/pos.huf.syn', - 'data/corpus/text_booktitle.rng', - 'data/corpus/lemma.huf.syn', - 'data/corpus/pos.corpus.cnt', - 'data/corpus/word.lexicon', - 'data/corpus/text_publishing_year.avs', - 'data/corpus/lemma.hcd', - 'data/corpus/text_school.avs', - 'data/corpus/text_journal.rng', - 'data/corpus/word.corpus.cnt', - 'data/corpus/text_school.avx', - 'data/corpus/text_journal.avx', - 'data/corpus/pos.lexicon.srt', - 'data/corpus/text_title.avs', - 'data/corpus/word.hcd', - 'data/corpus/text_chapter.rng', - 'data/corpus/text_address.rng', - 'data/corpus/ner.hcd', - 'data/corpus/text_publisher.avx', - 'data/corpus/text_institution.rng', - 'data/corpus/lemma.crx', - 'data/corpus/pos.crc', - 'data/corpus/text_author.rng', - 'data/corpus/text_address.avs', - 'data/corpus/pos.lexicon.idx', - 'data/corpus/ner.huf', - 'data/corpus/ner.huf.syn', - 'data/corpus/text_pages.avs', - 'data/corpus/text_publishing_year.avx', - 'data/corpus/ner.lexicon.idx', - 'data/corpus/text.rng', - 'data/corpus/word.crc', - 'data/corpus/ner.crc', - 'data/corpus/text_publisher.rng', - 'data/corpus/text_editor.rng', - 'data/corpus/text_author.avs', - 'data/corpus/s.rng', - 'data/corpus/text_publishing_year.rng', - 'data/corpus/simple_pos.corpus.cnt', - 'data/corpus/simple_pos.lexicon.idx', - 'data/corpus/word.huf.syn', - 'data/corpus/ner.lexicon.srt', - 'data/corpus/text_pages.avx', - 'data/corpus/text_booktitle.avx', - 'data/corpus/pos.crx', - 'data/corpus/ner.crx', - 'data/corpus/text_institution.avx', - 'merged/corpus.vrt'] diff --git a/app/corpora/routes.py b/app/corpora/routes.py index 44405e7d..b6bf2d01 100644 --- a/app/corpora/routes.py +++ b/app/corpora/routes.py @@ -21,9 +21,9 @@ from .forms import ( EditCorpusFileForm, ImportCorpusForm ) -from .import_corpus import check_zip_contents import os import shutil +import tempfile import glob import xml.etree.ElementTree as ET @@ -58,26 +58,10 @@ def add_corpus(): ) -@bp.route('//export') -@login_required -def export_corpus(corpus_id): - abort(503) - corpus = Corpus.query.get_or_404(corpus_id) - if not (corpus.user == current_user or current_user.is_administrator()): - abort(403) - return send_from_directory( - as_attachment=True, - directory=os.path.join(corpus.user.path, 'corpora'), - filename=corpus.archive_file, - mimetype='zip' - ) - - @bp.route('/import', methods=['GET', 'POST']) @login_required def import_corpus(): - abort(503) - form = ImportCorpusForm() + form = ImportCorpusForm(prefix='import-corpus-form') if form.is_submitted(): if not form.validate(): return make_response(form.errors, 400) @@ -87,61 +71,71 @@ def import_corpus(): title=form.title.data ) db.session.add(corpus) - db.session.flush() + db.session.flush(objects=[corpus]) db.session.refresh(corpus) try: - os.makedirs(corpus.path) + corpus.makedirs() except OSError as e: current_app.logger.error(e) db.session.rollback() flash('Internal Server Error', category='error') return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa - # Upload zip - archive_file = os.path.join(corpus.path, form.file.data.filename) - form.file.data.save(archive_file) - # Some checks to verify it is a valid exported corpus - with ZipFile(archive_file, 'r') as zip: - contents = zip.namelist() - if set(check_zip_contents).issubset(contents): - # Unzip - shutil.unpack_archive(archive_file, corpus.path) - # Register vrt files to corpus - vrts = glob.glob(corpus.path + '/*.vrt') - for file in vrts: - element_tree = ET.parse(file) + # Save the uploaded zip file in a temporary directory + tmp_dir_base = os.path.join(current_app.config['NOPAQUE_DATA_DIR'], 'tmp') # noqa + with tempfile.TemporaryDirectory(dir=tmp_dir_base) as tmp_dir: + archive_file = os.path.join(tmp_dir, 'corpus.zip') + try: + form.archive.data.save(archive_file) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error1', category='error') + return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa + shutil.unpack_archive(archive_file, extract_dir=tmp_dir) + for vrt_filename in [x for x in os.listdir(tmp_dir) if x.endswith('.vrt')]: + vrt_file = os.path.join(tmp_dir, vrt_filename) + element_tree = ET.parse(vrt_file) text_node = element_tree.find('text') corpus_file = CorpusFile( - address=text_node.get('address', 'NULL'), - author=text_node.get('author', 'NULL'), - booktitle=text_node.get('booktitle', 'NULL'), - chapter=text_node.get('chapter', 'NULL'), + author=text_node.get('author'), corpus=corpus, - editor=text_node.get('editor', 'NULL'), - filename=os.path.basename(file), - institution=text_node.get('institution', 'NULL'), - journal=text_node.get('journal', 'NULL'), - pages=text_node.get('pages', 'NULL'), - publisher=text_node.get('publisher', 'NULL'), - publishing_year=text_node.get('publishing_year', ''), - school=text_node.get('school', 'NULL'), - title=text_node.get('title', 'NULL') + filename=vrt_filename, + mimetype='application/vrt+xml', + publishing_year=int(text_node.get('publishing_year')), + title=text_node.get('title') ) + if 'address' not in text_node.attrib: + corpus_file.address = text_node.get('address') + if 'booktitle' not in text_node.attrib: + corpus_file.booktitle = text_node.get('booktitle') + if 'chapter' not in text_node.attrib: + corpus_file.chapter = text_node.get('chapter') + if 'editor' not in text_node.attrib: + corpus_file.editor = text_node.get('editor') + if 'institution' not in text_node.attrib: + corpus_file.institution = text_node.get('institution') + if 'journal' not in text_node.attrib: + corpus_file.journal = text_node.get('journal') + if 'pages' not in text_node.attrib: + corpus_file.pages = text_node.get('pages') + if 'publisher' not in text_node.attrib: + corpus_file.publisher = text_node.get('publisher') + if 'school' not in text_node.attrib: + corpus_file.school = text_node.get('school') db.session.add(corpus_file) - # finish import and redirect to imported corpus - corpus.status = CorpusStatus.BUILT - db.session.commit() - os.remove(archive_file) - flash(f'Corpus "{corpus.title}" imported', 'corpus') - return make_response( - {'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) - else: - # If imported zip is not valid delete corpus and give feedback - flash( - f'Can\'t import corpus "{corpus.title}": Invalid archive file', - category='error' - ) - tasks.delete_corpus(corpus.id) - return make_response({'redirect_url': url_for('.import_corpus')}, 201) # noqa + db.session.flush(objects=[corpus_file]) + db.session.refresh(corpus) + current_app.logger.warning(vrt_file) + current_app.logger.warning(corpus_file.path) + try: + shutil.copy2(vrt_file, corpus_file.path) + except Exception as e: + db.session.rollback() + flash('Internal Server Error2', category='error') + return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa + db.session.commit() + flash(f'Corpus "{corpus.title}" imported', 'corpus') + return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) return render_template( 'corpora/import_corpus.html.j2', form=form, @@ -173,6 +167,26 @@ def analyse_corpus(corpus_id): ) +@bp.route('//build') +@login_required +def build_corpus(corpus_id): + corpus = Corpus.query.get_or_404(corpus_id) + if not (corpus.user == current_user or current_user.is_administrator()): + abort(403) + if corpus.files.all(): + tasks.build_corpus(corpus_id) + flash( + f'Corpus "{corpus.title}" marked for building', + category='corpus' + ) + else: + flash( + f'Can\'t build corpus "{corpus.title}": No corpus file(s)', + category='error' + ) + return redirect(url_for('.corpus', corpus_id=corpus_id)) + + @bp.route('//delete') @login_required def delete_corpus(corpus_id): @@ -184,6 +198,73 @@ def delete_corpus(corpus_id): return redirect(url_for('main.dashboard')) +@bp.route('//export') +@login_required +def export_corpus(corpus_id): + abort(503) + corpus = Corpus.query.get_or_404(corpus_id) + if not (corpus.user == current_user or current_user.is_administrator()): + abort(403) + return send_from_directory( + as_attachment=True, + directory=os.path.join(corpus.user.path, 'corpora'), + filename=corpus.archive_file, + mimetype='zip' + ) + + +@bp.route('//files/', methods=['GET', 'POST']) # noqa +@login_required +def corpus_file(corpus_id, corpus_file_id): + corpus_file = CorpusFile.query.filter( + CorpusFile.corpus_id == corpus_id, + CorpusFile.id == corpus_file_id + ).first_or_404() + if not ( + corpus_file.corpus.user == current_user + or current_user.is_administrator() + ): + abort(403) + form = EditCorpusFileForm(prefix='edit-corpus-file-form') + if form.validate_on_submit(): + corpus_file.address = form.address.data + corpus_file.author = form.author.data + corpus_file.booktitle = form.booktitle.data + corpus_file.chapter = form.chapter.data + corpus_file.editor = form.editor.data + corpus_file.institution = form.institution.data + corpus_file.journal = form.journal.data + corpus_file.pages = form.pages.data + corpus_file.publisher = form.publisher.data + corpus_file.publishing_year = form.publishing_year.data + corpus_file.school = form.school.data + corpus_file.title = form.title.data + corpus_file.corpus.status = CorpusStatus.UNPREPARED + db.session.commit() + flash(f'Corpus file "{corpus_file.filename}" edited', category='corpus') # noqa + return redirect(url_for('.corpus', corpus_id=corpus_id)) + # If no form is submitted or valid, fill out fields with current values + form.address.data = corpus_file.address + form.author.data = corpus_file.author + form.booktitle.data = corpus_file.booktitle + form.chapter.data = corpus_file.chapter + form.editor.data = corpus_file.editor + form.institution.data = corpus_file.institution + form.journal.data = corpus_file.journal + form.pages.data = corpus_file.pages + form.publisher.data = corpus_file.publisher + form.publishing_year.data = corpus_file.publishing_year + form.school.data = corpus_file.school + form.title.data = corpus_file.title + return render_template( + 'corpora/corpus_file.html.j2', + corpus=corpus_file.corpus, + corpus_file=corpus_file, + form=form, + title='Edit corpus file' + ) + + @bp.route('//files/add', methods=['GET', 'POST']) @login_required def add_corpus_file(corpus_id): @@ -271,76 +352,4 @@ def download_corpus_file(corpus_id, corpus_file_id): attachment_filename=corpus_file.filename, directory=os.path.dirname(corpus_file.path), filename=os.path.basename(corpus_file.path) - ) - - -@bp.route('//files/', methods=['GET', 'POST']) # noqa -@login_required -def corpus_file(corpus_id, corpus_file_id): - corpus_file = CorpusFile.query.filter( - CorpusFile.corpus_id == corpus_id, - CorpusFile.id == corpus_file_id - ).first_or_404() - if not ( - corpus_file.corpus.user == current_user - or current_user.is_administrator() - ): - abort(403) - form = EditCorpusFileForm(prefix='edit-corpus-file-form') - if form.validate_on_submit(): - corpus_file.address = form.address.data - corpus_file.author = form.author.data - corpus_file.booktitle = form.booktitle.data - corpus_file.chapter = form.chapter.data - corpus_file.editor = form.editor.data - corpus_file.institution = form.institution.data - corpus_file.journal = form.journal.data - corpus_file.pages = form.pages.data - corpus_file.publisher = form.publisher.data - corpus_file.publishing_year = form.publishing_year.data - corpus_file.school = form.school.data - corpus_file.title = form.title.data - corpus_file.corpus.status = CorpusStatus.UNPREPARED - db.session.commit() - flash(f'Corpus file "{corpus_file.filename}" edited', category='corpus') # noqa - return redirect(url_for('.corpus', corpus_id=corpus_id)) - # If no form is submitted or valid, fill out fields with current values - form.address.data = corpus_file.address - form.author.data = corpus_file.author - form.booktitle.data = corpus_file.booktitle - form.chapter.data = corpus_file.chapter - form.editor.data = corpus_file.editor - form.institution.data = corpus_file.institution - form.journal.data = corpus_file.journal - form.pages.data = corpus_file.pages - form.publisher.data = corpus_file.publisher - form.publishing_year.data = corpus_file.publishing_year - form.school.data = corpus_file.school - form.title.data = corpus_file.title - return render_template( - 'corpora/corpus_file.html.j2', - corpus=corpus_file.corpus, - corpus_file=corpus_file, - form=form, - title='Edit corpus file' - ) - - -@bp.route('//build') -@login_required -def build_corpus(corpus_id): - corpus = Corpus.query.get_or_404(corpus_id) - if not (corpus.user == current_user or current_user.is_administrator()): - abort(403) - if corpus.files.all(): - tasks.build_corpus(corpus_id) - flash( - f'Corpus "{corpus.title}" marked for building', - category='corpus' - ) - else: - flash( - f'Can\'t build corpus "{corpus.title}": No corpus file(s)', - category='error' - ) - return redirect(url_for('.corpus', corpus_id=corpus_id)) + ) \ No newline at end of file diff --git a/app/templates/corpora/import_corpus.html.j2 b/app/templates/corpora/import_corpus.html.j2 index 85db2646..957668e6 100644 --- a/app/templates/corpora/import_corpus.html.j2 +++ b/app/templates/corpora/import_corpus.html.j2 @@ -31,7 +31,7 @@
- {{ wtf.render_field(form.file, accept='.zip', placeholder='Choose your exported .zip file') }} + {{ wtf.render_field(form.archive, accept='.zip', placeholder='Choose an exported ZIP archive') }}
diff --git a/app/templates/main/dashboard.html.j2 b/app/templates/main/dashboard.html.j2 index 00695d69..007f190b 100644 --- a/app/templates/main/dashboard.html.j2 +++ b/app/templates/main/dashboard.html.j2 @@ -40,7 +40,7 @@