mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-12-27 03:44:19 +00:00
81 lines
3.1 KiB
Python
81 lines
3.1 KiB
Python
from datetime import datetime
|
|
from .. import db
|
|
from ..decorators import background
|
|
from ..models import Corpus, CorpusFile
|
|
import xml.etree.ElementTree as ET
|
|
import os
|
|
import shutil
|
|
|
|
|
|
def _corpus_file_metadata(corpus_file):
    """Return the attributes to stamp on a corpus file's <text> element.

    The optional bibliographic fields fall back to the literal string
    "NULL" when they are empty, the publishing year is converted with
    str(), and author and title are passed through unchanged.
    """
    return {
        'address': corpus_file.address or "NULL",
        'author': corpus_file.author,
        'booktitle': corpus_file.booktitle or "NULL",
        'chapter': corpus_file.chapter or "NULL",
        'editor': corpus_file.editor or "NULL",
        'institution': corpus_file.institution or "NULL",
        'journal': corpus_file.journal or "NULL",
        'pages': corpus_file.pages or "NULL",
        'publisher': corpus_file.publisher or "NULL",
        'publishing_year': str(corpus_file.publishing_year),
        'school': corpus_file.school or "NULL",
        'title': corpus_file.title,
    }


@background
def build_corpus(corpus_id, *args, **kwargs):
    """Merge all files of a corpus into a single ``merged/corpus.vrt``.

    For every file of the corpus the ``<text>`` element is annotated with
    the file's metadata, the annotated file is written back in place, and
    the element is added to a common ``<corpus>`` root which is finally
    written to ``<corpus dir>/merged/corpus.vrt``.

    The corpus status is set to 'File processing' while the merge runs and
    to 'submitted' once the merged file has been written. Nothing happens
    if no corpus with the given id exists.

    :param corpus_id: primary key of the Corpus to build
    :param kwargs: must contain ``app``, the application object whose
        context and ``NOPAQUE_STORAGE`` setting are used
    """
    app = kwargs['app']
    with app.app_context():
        corpus = Corpus.query.get(corpus_id)
        if corpus is None:
            return
        corpus.status = 'File processing'
        db.session.commit()
        corpus_dir = os.path.join(app.config['NOPAQUE_STORAGE'],
                                  str(corpus.user_id), 'corpora',
                                  str(corpus.id))
        output_dir = os.path.join(corpus_dir, 'merged')
        # Start from a clean output directory. makedirs(exist_ok=True) also
        # copes with a missing parent directory and with a directory that
        # rmtree (which ignores errors) could not remove.
        shutil.rmtree(output_dir, ignore_errors=True)
        os.makedirs(output_dir, exist_ok=True)
        master_element_tree = ET.ElementTree(
            ET.fromstring('<corpus>\n</corpus>'))
        master_root = master_element_tree.getroot()
        for corpus_file in corpus.files:
            file_path = os.path.join(corpus_dir, corpus_file.filename)
            element_tree = ET.parse(file_path)
            # NOTE(review): find() only searches direct children of the
            # root and returns None when there is no <text> child; the
            # set() below would then raise and leave the status at
            # 'File processing' - confirm the input files always match.
            text_node = element_tree.find('text')
            for name, value in _corpus_file_metadata(corpus_file).items():
                text_node.set(name, value)
            element_tree.write(file_path)
            # Append instead of insert(1, ...): inserting at a fixed index
            # placed every later text directly behind the first one, so the
            # merged order became file1, fileN, ..., file2.
            master_root.append(text_node)
        output_file = os.path.join(output_dir, 'corpus.vrt')
        master_element_tree.write(output_file, xml_declaration=True,
                                  encoding='utf-8')
        corpus.status = 'submitted'
        corpus.last_edited_date = datetime.utcnow()
        db.session.commit()
|
|
|
|
|
|
@background
def delete_corpus(corpus_id, *args, **kwargs):
    """Remove a corpus' storage directory, then delete the corpus itself.

    The directory ``<NOPAQUE_STORAGE>/<user id>/corpora/<corpus id>`` is
    removed recursively (errors during removal are ignored) before
    ``corpus.delete()`` is called. Nothing happens if no corpus with the
    given id exists.

    :param corpus_id: primary key of the Corpus to delete
    :param kwargs: must contain ``app``, the application object whose
        context and ``NOPAQUE_STORAGE`` setting are used
    """
    app = kwargs['app']
    with app.app_context():
        corpus = Corpus.query.get(corpus_id)
        if corpus is not None:
            storage_root = app.config['NOPAQUE_STORAGE']
            owner_dir = str(corpus.user_id)
            corpus_dir = os.path.join(storage_root, owner_dir, 'corpora',
                                      str(corpus.id))
            shutil.rmtree(corpus_dir, ignore_errors=True)
            corpus.delete()
|
|
|
|
|
|
@background
def delete_corpus_file(corpus_file_id, *args, **kwargs):
    """Delete a corpus file from storage and then delete its record.

    The file ``<NOPAQUE_STORAGE>/<corpus_file.dir>/<corpus_file.filename>``
    is removed first. A file that is already missing counts as removed, so
    the record is still deleted. Any other OS error is logged and the
    record is kept, so the deletion can be retried. Nothing happens if no
    corpus file with the given id exists.

    :param corpus_file_id: primary key of the CorpusFile to delete
    :param kwargs: must contain ``app``, the application object whose
        context, logger and ``NOPAQUE_STORAGE`` setting are used
    """
    app = kwargs['app']
    with app.app_context():
        corpus_file = CorpusFile.query.get(corpus_file_id)
        if corpus_file is None:
            return
        path = os.path.join(app.config['NOPAQUE_STORAGE'], corpus_file.dir,
                            corpus_file.filename)
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone: previously this case was swallowed together
            # with every other error and the record could never be deleted.
            pass
        except OSError:
            # Keep the record while the file still exists on disk, but make
            # the failure visible instead of silently ignoring it.
            app.logger.exception('Could not remove corpus file %s', path)
            return
        corpus_file.delete()
|