Move models into separate modules

This commit is contained in:
Patrick Jentsch
2024-03-05 16:02:23 +01:00
parent cf8c164d60
commit a1e5bd61e0
20 changed files with 1890 additions and 2 deletions

188
app/models/corpus.py Normal file
View File

@ -0,0 +1,188 @@
from datetime import datetime
from enum import IntEnum
from flask import current_app, url_for
from flask_hashids import HashidMixin
from sqlalchemy.ext.associationproxy import association_proxy
from typing import Union
import os
import shutil
import xml.etree.ElementTree as ET
from app import db
from app.converters.vrt import normalize_vrt_file
from app.ext.flask_sqlalchemy import IntEnumColumn
from .corpus_follower_association import CorpusFollowerAssociation
class CorpusStatus(IntEnum):
    '''
    Enumeration of the states a corpus can be in.
    '''
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9

    @staticmethod
    def get(corpus_status: Union['CorpusStatus', int, str]) -> 'CorpusStatus':
        '''
        Coerce a member, a member value or a member name to a CorpusStatus.

        Raises ValueError for an unknown int, KeyError for an unknown name
        and TypeError for any other argument type.
        '''
        if isinstance(corpus_status, str):
            # Look the member up by its name
            return CorpusStatus[corpus_status]
        if isinstance(corpus_status, int):
            # IntEnum members are ints as well; calling the enum with a
            # member simply hands that same member back.
            return CorpusStatus(corpus_status)
        raise TypeError('corpus_status must be CorpusStatus, int, or str')
class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.

    A corpus references one user, has a dynamic collection of corpus files
    and keeps its data in a directory below the user's path (see ``path``).
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    status = db.Column(
        IntEnumColumn(CorpusStatus),
        default=CorpusStatus.UNPREPARED
    )
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    files = db.relationship(
        'CorpusFile',
        back_populates='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='corpus',
        cascade='all, delete-orphan'
    )
    # Appending a user to ``followers`` wraps it in a new association object
    followers = association_proxy(
        'corpus_follower_associations',
        'follower',
        creator=lambda u: CorpusFollowerAssociation(follower=u)
    )
    user = db.relationship('User', back_populates='corpora')
    # "static" attributes
    max_num_tokens = 2_147_483_647  # 2**31 - 1

    def __repr__(self):
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        '''URL of the ``corpora.analysis`` endpoint for this corpus.'''
        return url_for('corpora.analysis', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        '''JSON Patch path of this corpus below its user's path.'''
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self):
        '''Directory of this corpus: ``<user.path>/corpora/<id>``.'''
        return os.path.join(self.user.path, 'corpora', str(self.id))

    @property
    def url(self):
        '''URL of the ``corpora.corpus`` endpoint for this corpus.'''
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        '''Hashid of the user this corpus belongs to.'''
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        '''
        Create a corpus, add it to the session and create its directories.

        The corpus is flushed and refreshed (not committed) before the
        directories are created, because ``path`` is derived from the id.

        :param kwargs: Passed on to the ``Corpus`` constructor.
        :returns: The new corpus.
        :raises OSError: If a directory cannot be created. The error is
            logged and the session rolled back before it is re-raised.
        '''
        corpus = Corpus(**kwargs)
        db.session.add(corpus)
        db.session.flush(objects=[corpus])
        db.session.refresh(corpus)
        try:
            corpus_path = corpus.path
            # Parents come first, os.mkdir does not create intermediates
            for sub_path in (
                (),
                ('files',),
                ('cwb',),
                ('cwb', 'data'),
                ('cwb', 'registry')
            ):
                os.mkdir(os.path.join(corpus_path, *sub_path))
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            # Bare raise keeps the original traceback untouched
            raise
        return corpus

    def build(self):
        '''
        Prepare the ``cwb`` build directory and merge all files into it.

        The ``cwb`` directory is removed and recreated, every corpus file is
        normalized into it and the resulting ``text`` elements (annotated
        with the file's metadata) are collected in ``cwb/corpus.vrt``.

        On success the status is set to ``SUBMITTED``. If normalizing a
        file fails the status is set to ``FAILED`` and the method returns
        without writing ``corpus.vrt``.
        '''
        build_dir = os.path.join(self.path, 'cwb')
        # Start from scratch, leftovers of a previous build are discarded
        shutil.rmtree(build_dir, ignore_errors=True)
        os.mkdir(build_dir)
        os.mkdir(os.path.join(build_dir, 'data'))
        os.mkdir(os.path.join(build_dir, 'registry'))
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        # Metadata that is written as 'NULL' when it is not set
        optional_attributes = (
            'address',
            'booktitle',
            'chapter',
            'editor',
            'institution',
            'journal',
            'pages',
            'publisher',
            'school'
        )
        for corpus_file in self.files:
            normalized_vrt_path = os.path.join(
                build_dir,
                f'{corpus_file.id}.norm.vrt'
            )
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except Exception:
                # Deliberately broad, any normalization problem fails the
                # build. Unlike a bare except this lets KeyboardInterrupt
                # and SystemExit propagate.
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            text_element.set('author', corpus_file.author)
            text_element.set('title', corpus_file.title)
            text_element.set(
                'publishing_year',
                f'{corpus_file.publishing_year}'
            )
            for attribute in optional_attributes:
                value = getattr(corpus_file, attribute)
                # Test the raw value: formatting first would turn None
                # into the truthy string 'None' and skip the fallback.
                text_element.set(
                    attribute,
                    'NULL' if value in (None, '') else f'{value}'
                )
            text_element.tail = '\n'
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            os.path.join(build_dir, 'corpus.vrt'),
            encoding='utf-8'
        )
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        '''
        Remove the corpus directory and mark the corpus for deletion.

        Errors while removing the directory are ignored. The session is not
        committed here.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict representation of the corpus.

        :param backrefs: Also include the owning user under ``user``.
        :param relationships: Also include the follower associations and
            the files, each as a dict keyed by the object's hashid.
        '''
        json_serializeable = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'title': self.title,
            'is_public': self.is_public
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['corpus_follower_associations'] = {
                x.hashid: x.to_json_serializeable()
                for x in self.corpus_follower_associations
            }
            json_serializeable['files'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.files
            }
        return json_serializeable