mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-06-21 05:20:36 +00:00
move models in seperate modules
This commit is contained in:
188
app/models/corpus.py
Normal file
188
app/models/corpus.py
Normal file
@ -0,0 +1,188 @@
|
||||
from datetime import datetime
|
||||
from enum import IntEnum
|
||||
from flask import current_app, url_for
|
||||
from flask_hashids import HashidMixin
|
||||
from sqlalchemy.ext.associationproxy import association_proxy
|
||||
from typing import Union
|
||||
import os
|
||||
import shutil
|
||||
import xml.etree.ElementTree as ET
|
||||
from app import db
|
||||
from app.converters.vrt import normalize_vrt_file
|
||||
from app.ext.flask_sqlalchemy import IntEnumColumn
|
||||
from .corpus_follower_association import CorpusFollowerAssociation
|
||||
|
||||
|
||||
class CorpusStatus(IntEnum):
|
||||
UNPREPARED = 1
|
||||
SUBMITTED = 2
|
||||
QUEUED = 3
|
||||
BUILDING = 4
|
||||
BUILT = 5
|
||||
FAILED = 6
|
||||
STARTING_ANALYSIS_SESSION = 7
|
||||
RUNNING_ANALYSIS_SESSION = 8
|
||||
CANCELING_ANALYSIS_SESSION = 9
|
||||
|
||||
@staticmethod
|
||||
def get(corpus_status: Union['CorpusStatus', int, str]) -> 'CorpusStatus':
|
||||
if isinstance(corpus_status, CorpusStatus):
|
||||
return corpus_status
|
||||
if isinstance(corpus_status, int):
|
||||
return CorpusStatus(corpus_status)
|
||||
if isinstance(corpus_status, str):
|
||||
return CorpusStatus[corpus_status]
|
||||
raise TypeError('corpus_status must be CorpusStatus, int, or str')
|
||||
|
||||
|
||||
class Corpus(HashidMixin, db.Model):
|
||||
'''
|
||||
Class to define a corpus.
|
||||
'''
|
||||
__tablename__ = 'corpora'
|
||||
# Primary key
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
# Foreign keys
|
||||
user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
|
||||
# Fields
|
||||
creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
|
||||
description = db.Column(db.String(255))
|
||||
status = db.Column(
|
||||
IntEnumColumn(CorpusStatus),
|
||||
default=CorpusStatus.UNPREPARED
|
||||
)
|
||||
title = db.Column(db.String(32))
|
||||
num_analysis_sessions = db.Column(db.Integer, default=0)
|
||||
num_tokens = db.Column(db.Integer, default=0)
|
||||
is_public = db.Column(db.Boolean, default=False)
|
||||
# Relationships
|
||||
files = db.relationship(
|
||||
'CorpusFile',
|
||||
back_populates='corpus',
|
||||
lazy='dynamic',
|
||||
cascade='all, delete-orphan'
|
||||
)
|
||||
corpus_follower_associations = db.relationship(
|
||||
'CorpusFollowerAssociation',
|
||||
back_populates='corpus',
|
||||
cascade='all, delete-orphan'
|
||||
)
|
||||
followers = association_proxy(
|
||||
'corpus_follower_associations',
|
||||
'follower',
|
||||
creator=lambda u: CorpusFollowerAssociation(follower=u)
|
||||
)
|
||||
user = db.relationship('User', back_populates='corpora')
|
||||
# "static" attributes
|
||||
max_num_tokens = 2_147_483_647
|
||||
|
||||
def __repr__(self):
|
||||
return f'<Corpus {self.title}>'
|
||||
|
||||
@property
|
||||
def analysis_url(self):
|
||||
return url_for('corpora.analysis', corpus_id=self.id)
|
||||
|
||||
@property
|
||||
def jsonpatch_path(self):
|
||||
return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return os.path.join(self.user.path, 'corpora', str(self.id))
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return url_for('corpora.corpus', corpus_id=self.id)
|
||||
|
||||
@property
|
||||
def user_hashid(self):
|
||||
return self.user.hashid
|
||||
|
||||
@staticmethod
|
||||
def create(**kwargs):
|
||||
corpus = Corpus(**kwargs)
|
||||
db.session.add(corpus)
|
||||
db.session.flush(objects=[corpus])
|
||||
db.session.refresh(corpus)
|
||||
try:
|
||||
os.mkdir(corpus.path)
|
||||
os.mkdir(os.path.join(corpus.path, 'files'))
|
||||
os.mkdir(os.path.join(corpus.path, 'cwb'))
|
||||
os.mkdir(os.path.join(corpus.path, 'cwb', 'data'))
|
||||
os.mkdir(os.path.join(corpus.path, 'cwb', 'registry'))
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
raise e
|
||||
return corpus
|
||||
|
||||
def build(self):
|
||||
build_dir = os.path.join(self.path, 'cwb')
|
||||
shutil.rmtree(build_dir, ignore_errors=True)
|
||||
os.mkdir(build_dir)
|
||||
os.mkdir(os.path.join(build_dir, 'data'))
|
||||
os.mkdir(os.path.join(build_dir, 'registry'))
|
||||
corpus_element = ET.fromstring('<corpus>\n</corpus>')
|
||||
for corpus_file in self.files:
|
||||
normalized_vrt_path = os.path.join(build_dir, f'{corpus_file.id}.norm.vrt')
|
||||
try:
|
||||
normalize_vrt_file(corpus_file.path, normalized_vrt_path)
|
||||
except:
|
||||
self.status = CorpusStatus.FAILED
|
||||
return
|
||||
element_tree = ET.parse(normalized_vrt_path)
|
||||
text_element = element_tree.getroot()
|
||||
text_element.set('author', corpus_file.author)
|
||||
text_element.set('title', corpus_file.title)
|
||||
text_element.set(
|
||||
'publishing_year',
|
||||
f'{corpus_file.publishing_year}'
|
||||
)
|
||||
text_element.set('address', corpus_file.address or 'NULL')
|
||||
text_element.set('booktitle', corpus_file.booktitle or 'NULL')
|
||||
text_element.set('chapter', corpus_file.chapter or 'NULL')
|
||||
text_element.set('editor', corpus_file.editor or 'NULL')
|
||||
text_element.set('institution', corpus_file.institution or 'NULL')
|
||||
text_element.set('journal', corpus_file.journal or 'NULL')
|
||||
text_element.set('pages', f'{corpus_file.pages}' or 'NULL')
|
||||
text_element.set('publisher', corpus_file.publisher or 'NULL')
|
||||
text_element.set('school', corpus_file.school or 'NULL')
|
||||
text_element.tail = '\n'
|
||||
# corpus_element.insert(1, text_element)
|
||||
corpus_element.append(text_element)
|
||||
ET.ElementTree(corpus_element).write(
|
||||
os.path.join(build_dir, 'corpus.vrt'),
|
||||
encoding='utf-8'
|
||||
)
|
||||
self.status = CorpusStatus.SUBMITTED
|
||||
|
||||
def delete(self):
|
||||
shutil.rmtree(self.path, ignore_errors=True)
|
||||
db.session.delete(self)
|
||||
|
||||
def to_json_serializeable(self, backrefs=False, relationships=False):
|
||||
json_serializeable = {
|
||||
'id': self.hashid,
|
||||
'creation_date': f'{self.creation_date.isoformat()}Z',
|
||||
'description': self.description,
|
||||
'max_num_tokens': self.max_num_tokens,
|
||||
'num_analysis_sessions': self.num_analysis_sessions,
|
||||
'num_tokens': self.num_tokens,
|
||||
'status': self.status.name,
|
||||
'title': self.title,
|
||||
'is_public': self.is_public
|
||||
}
|
||||
if backrefs:
|
||||
json_serializeable['user'] = \
|
||||
self.user.to_json_serializeable(backrefs=True)
|
||||
if relationships:
|
||||
json_serializeable['corpus_follower_associations'] = {
|
||||
x.hashid: x.to_json_serializeable()
|
||||
for x in self.corpus_follower_associations
|
||||
}
|
||||
json_serializeable['files'] = {
|
||||
x.hashid: x.to_json_serializeable(relationships=True)
|
||||
for x in self.files
|
||||
}
|
||||
return json_serializeable
|
Reference in New Issue
Block a user