mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-01-12 19:20:34 +00:00
200 lines
7.0 KiB
Python
200 lines
7.0 KiB
Python
from datetime import datetime
|
|
from enum import IntEnum
|
|
from flask import current_app, url_for
|
|
from flask_hashids import HashidMixin
|
|
from sqlalchemy.ext.associationproxy import association_proxy
|
|
from pathlib import Path
|
|
import shutil
|
|
import xml.etree.ElementTree as ET
|
|
from app import db
|
|
from app.converters.vrt import normalize_vrt_file
|
|
from app.extensions.nopaque_sqlalchemy_type_decorators import IntEnumColumn
|
|
from .corpus_follower_association import CorpusFollowerAssociation
|
|
|
|
|
|
class CorpusStatus(IntEnum):
    '''
    Enumeration of the states a corpus can be in.

    Members are plain integers (1-9), so they can be compared with and
    constructed from int values.
    '''
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9

    @staticmethod
    def get(corpus_status: 'CorpusStatus | int | str') -> 'CorpusStatus':
        '''
        Coerce ``corpus_status`` into a ``CorpusStatus`` member.

        * a ``str`` is looked up by member name (``KeyError`` if unknown),
        * an ``int`` is looked up by value (``ValueError`` if unknown),
        * an existing member is returned unchanged.

        Raises ``TypeError`` for any other type.
        '''
        if isinstance(corpus_status, str):
            # Lookup by member name, e.g. 'BUILT'.
            return CorpusStatus[corpus_status]
        if isinstance(corpus_status, int):
            # Members are ints themselves; calling the enum with one of its
            # own members hands that very member back, so this single branch
            # covers both "already a CorpusStatus" and "plain int value".
            return CorpusStatus(corpus_status)
        raise TypeError('corpus_status must be CorpusStatus, int, or str')
|
|
|
|
|
|
class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.

    A corpus row belongs to one user (``user_id``), groups a collection of
    ``CorpusFile`` rows and keeps its working data in a directory derived
    from the owning user's path and the corpus id (see ``path``).
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    status = db.Column(
        IntEnumColumn(CorpusStatus),
        default=CorpusStatus.UNPREPARED
    )
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    files = db.relationship(
        'CorpusFile',
        back_populates='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='corpus',
        cascade='all, delete-orphan'
    )
    # Proxy onto the ``follower`` attribute of the association rows; adding
    # an object through the proxy wraps it in a new CorpusFollowerAssociation.
    followers = association_proxy(
        'corpus_follower_associations',
        'follower',
        creator=lambda u: CorpusFollowerAssociation(follower=u)
    )
    user = db.relationship('User', back_populates='corpora')
    # "static" attributes
    max_num_tokens = 2_147_483_647

    def __repr__(self):
        '''Return a short debug representation containing the title.'''
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        '''URL of the ``corpora.analysis`` endpoint for this corpus.'''
        return url_for('corpora.analysis', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        '''Path of this corpus below the owning user's JSON patch path.'''
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self) -> Path:
        '''Directory of this corpus: ``<user.path>/corpora/<id>``.'''
        return self.user.path / 'corpora' / f'{self.id}'

    @property
    def url(self):
        '''URL of the ``corpora.corpus`` endpoint for this corpus.'''
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        '''Hashid of the owning user.'''
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        '''
        Create a corpus together with its directory layout.

        The new object is added to the session, flushed and refreshed so
        that the id-based ``path`` can be computed, then the directories
        ``files``, ``cwb``, ``cwb/data`` and ``cwb/registry`` are created
        below it. Nothing is committed here.

        :param kwargs: keyword arguments forwarded to the ``Corpus``
            constructor.
        :returns: the newly created ``Corpus``.
        :raises OSError: if a directory cannot be created; the error is
            logged and the session is rolled back before re-raising.
        '''
        corpus = Corpus(**kwargs)
        db.session.add(corpus)
        db.session.flush(objects=[corpus])
        db.session.refresh(corpus)
        corpus_files_dir = corpus.path / 'files'
        corpus_cwb_dir = corpus.path / 'cwb'
        corpus_cwb_data_dir = corpus_cwb_dir / 'data'
        corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
        try:
            corpus.path.mkdir()
            corpus_files_dir.mkdir()
            corpus_cwb_dir.mkdir()
            corpus_cwb_data_dir.mkdir()
            corpus_cwb_registry_dir.mkdir()
        except OSError as e:
            # TODO: Potential leftover cleanup
            current_app.logger.error(e)
            db.session.rollback()
            raise
        return corpus

    def build(self):
        '''
        Assemble ``cwb/corpus.vrt`` from all files of this corpus.

        The ``cwb`` directory is removed and recreated, every corpus file
        is normalized to ``cwb/<file id>.norm.vrt``, the root element of
        each normalized file is annotated with the file's metadata and
        appended to a common ``<corpus>`` element, which is finally written
        to ``cwb/corpus.vrt``. On success the status is set to
        ``SUBMITTED``. Nothing is committed here.

        If normalizing a file fails, the error is logged, the status is set
        to ``FAILED`` and the method returns without raising.

        :raises OSError: if the ``cwb`` directories cannot be recreated;
            the status is set to ``FAILED`` before re-raising.
        '''
        corpus_cwb_dir = self.path / 'cwb'
        corpus_cwb_data_dir = corpus_cwb_dir / 'data'
        corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
        try:
            # Start from a clean cwb directory on every build.
            shutil.rmtree(corpus_cwb_dir, ignore_errors=True)
            corpus_cwb_dir.mkdir()
            corpus_cwb_data_dir.mkdir()
            corpus_cwb_registry_dir.mkdir()
        except OSError as e:
            current_app.logger.error(e)
            self.status = CorpusStatus.FAILED
            raise
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            normalized_vrt_path = corpus_cwb_dir / f'{corpus_file.id}.norm.vrt'
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except Exception as e:
                # Deliberately best-effort: mark the corpus as failed and
                # stop, but log the cause and let KeyboardInterrupt /
                # SystemExit propagate instead of swallowing them.
                current_app.logger.error(e)
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            text_element.set('author', corpus_file.author)
            text_element.set('title', corpus_file.title)
            text_element.set(
                'publishing_year',
                f'{corpus_file.publishing_year}'
            )
            text_element.set('address', corpus_file.address or 'NULL')
            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
            text_element.set('chapter', corpus_file.chapter or 'NULL')
            text_element.set('editor', corpus_file.editor or 'NULL')
            text_element.set('institution', corpus_file.institution or 'NULL')
            text_element.set('journal', corpus_file.journal or 'NULL')
            # Formatting None yields the truthy string 'None', so the
            # missing-value check has to happen before stringification.
            pages = '' if corpus_file.pages is None else f'{corpus_file.pages}'
            text_element.set('pages', pages or 'NULL')
            text_element.set('publisher', corpus_file.publisher or 'NULL')
            text_element.set('school', corpus_file.school or 'NULL')
            text_element.tail = '\n'
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            corpus_cwb_dir / 'corpus.vrt',
            encoding='utf-8'
        )
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        '''
        Remove the corpus directory and mark the row for deletion.

        Errors while removing the directory tree are ignored. The deletion
        is only registered with the session; nothing is committed here.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Build a JSON-serializable dict describing this corpus.

        :param backrefs: if true, add the owning user under ``'user'``
            (serialized with ``backrefs=True``).
        :param relationships: if true, add
            ``'corpus_follower_associations'`` and ``'files'`` as dicts
            keyed by the hashid of each related object.
        :returns: dict with the corpus' hashid as ``'id'``, the creation
            date as ISO 8601 string with a trailing ``Z``, the status name
            and the remaining scalar fields.
        '''
        json_serializeable = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'title': self.title,
            'is_public': self.is_public
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['corpus_follower_associations'] = {
                x.hashid: x.to_json_serializeable()
                for x in self.corpus_follower_associations
            }
            json_serializeable['files'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.files
            }
        return json_serializeable
|