"""Background tasks that build merged corpora and delete corpora/files."""

import logging
import os
import shutil
import xml.etree.ElementTree as ET
from datetime import datetime

from .. import db
from ..decorators import background
from ..models import Corpus, CorpusFile

logger = logging.getLogger(__name__)


def _corpus_dir(app, corpus):
    """Return the storage directory of *corpus* below ``NOPAQUE_STORAGE``."""
    return os.path.join(
        app.config['NOPAQUE_STORAGE'],
        str(corpus.user_id),
        'corpora',
        str(corpus.id),
    )


def _text_attributes(corpus_file):
    """Return the (name, value) pairs written onto a file's ``text`` node.

    Optional metadata falls back to the literal string ``"NULL"``; ``author``
    and ``title`` are passed through unchanged. The pairs are listed in the
    order in which they are set on the node.
    """
    return [
        ('address', corpus_file.address or "NULL"),
        ('author', corpus_file.author),
        ('booktitle', corpus_file.booktitle or "NULL"),
        ('chapter', corpus_file.chapter or "NULL"),
        ('editor', corpus_file.editor or "NULL"),
        ('institution', corpus_file.institution or "NULL"),
        ('journal', corpus_file.journal or "NULL"),
        ('pages', corpus_file.pages or "NULL"),
        ('publisher', corpus_file.publisher or "NULL"),
        ('publishing_year', str(corpus_file.publishing_year)),
        ('school', corpus_file.school or "NULL"),
        ('title', corpus_file.title),
    ]


@background
def build_corpus(corpus_id, *args, **kwargs):
    """Merge all files of a corpus into ``merged/corpus.vrt``.

    For every file of the corpus the ``text`` element is annotated with the
    file's metadata, the annotated file is written back in place, and the
    element is added to one merged tree that is written to
    ``<corpus dir>/merged/corpus.vrt``. The corpus status is set to
    ``'File processing'`` at the start and to ``'submitted'`` at the end.

    Args:
        corpus_id: Primary key of the corpus; nothing happens if no such
            corpus exists.
        **kwargs: Must contain ``app``, the application object whose context
            and config are used.

    Raises:
        ValueError: If a corpus file contains no ``text`` element.
    """
    app = kwargs['app']
    with app.app_context():
        corpus = Corpus.query.get(corpus_id)
        if corpus is None:
            return
        corpus.status = 'File processing'
        db.session.commit()

        corpus_dir = _corpus_dir(app, corpus)
        output_dir = os.path.join(corpus_dir, 'merged')
        # Start from a clean output directory. makedirs(exist_ok=True) keeps
        # the build going even if rmtree could not remove everything.
        shutil.rmtree(output_dir, ignore_errors=True)
        os.makedirs(output_dir, exist_ok=True)

        # NOTE(review): the previous literal '\n' contains no element and
        # made ET.fromstring raise ParseError on every call. A <corpus> root
        # is assumed here -- confirm the expected root tag of the .vrt file.
        master_element_tree = ET.ElementTree(
            ET.fromstring('<corpus>\n</corpus>'))
        root = master_element_tree.getroot()

        for corpus_file in corpus.files:
            file_path = os.path.join(corpus_dir, corpus_file.filename)
            element_tree = ET.parse(file_path)
            text_node = element_tree.find('text')
            if text_node is None:
                raise ValueError(
                    'No <text> element found in corpus file {!r}'.format(
                        file_path))
            for name, value in _text_attributes(corpus_file):
                text_node.set(name, value)
            # Persist the annotated metadata in the source file as well.
            element_tree.write(file_path)
            # NOTE(review): insert(1, ...) puts each later node directly
            # behind the first one, so the merged order is first, last, ...,
            # second instead of file order -- confirm whether append() was
            # intended. Kept as is to preserve the produced output.
            root.insert(1, text_node)

        output_file = os.path.join(output_dir, 'corpus.vrt')
        master_element_tree.write(output_file,
                                  xml_declaration=True,
                                  encoding='utf-8')
        corpus.status = 'submitted'
        corpus.last_edited_date = datetime.utcnow()
        db.session.commit()


@background
def delete_corpus(corpus_id, *args, **kwargs):
    """Remove a corpus' storage directory and call ``corpus.delete()``.

    Args:
        corpus_id: Primary key of the corpus; nothing happens if no such
            corpus exists.
        **kwargs: Must contain ``app``, the application object whose context
            and config are used.
    """
    app = kwargs['app']
    with app.app_context():
        corpus = Corpus.query.get(corpus_id)
        if corpus is None:
            return
        # Best effort: errors while removing the directory are ignored.
        shutil.rmtree(_corpus_dir(app, corpus), ignore_errors=True)
        corpus.delete()


@background
def delete_corpus_file(corpus_file_id, *args, **kwargs):
    """Remove a corpus file from disk and call ``corpus_file.delete()``.

    The record is deleted when the file was removed or was already missing.
    If the file exists but cannot be removed, the error is logged and the
    record is kept.

    Args:
        corpus_file_id: Primary key of the corpus file; nothing happens if no
            such file record exists.
        **kwargs: Must contain ``app``, the application object whose context
            and config are used.
    """
    app = kwargs['app']
    with app.app_context():
        corpus_file = CorpusFile.query.get(corpus_file_id)
        if corpus_file is None:
            return
        path = os.path.join(app.config['NOPAQUE_STORAGE'],
                            corpus_file.dir,
                            corpus_file.filename)
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone from disk; still drop the record so it does not
            # become an undeletable orphan.
            pass
        except OSError:
            logger.exception('Could not remove corpus file %s', path)
            return
        corpus_file.delete()