from datetime import datetime
from enum import IntEnum
from pathlib import Path
from typing import Union
import shutil
import xml.etree.ElementTree as ET

from flask import current_app, url_for
from flask_hashids import HashidMixin
from sqlalchemy.ext.associationproxy import association_proxy

from app import db
from app.converters.vrt import normalize_vrt_file
from app.extensions.sqlalchemy import IntEnumColumn
from .corpus_follower_association import CorpusFollowerAssociation


class CorpusStatus(IntEnum):
    '''
    Life cycle states of a corpus, stored as integers.
    '''
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9

    @staticmethod
    def get(corpus_status: Union['CorpusStatus', int, str]) -> 'CorpusStatus':
        '''
        Coerce a value into a CorpusStatus member.

        A CorpusStatus is returned unchanged, an int is looked up by value
        and a str is looked up by member name.

        Raises:
            TypeError: corpus_status is none of the supported types.
            ValueError: corpus_status is an int without a matching member.
            KeyError: corpus_status is a str without a matching member.
        '''
        if isinstance(corpus_status, CorpusStatus):
            return corpus_status
        if isinstance(corpus_status, int):
            return CorpusStatus(corpus_status)
        if isinstance(corpus_status, str):
            return CorpusStatus[corpus_status]
        raise TypeError('corpus_status must be CorpusStatus, int, or str')


class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    status = db.Column(
        IntEnumColumn(CorpusStatus),
        default=CorpusStatus.UNPREPARED
    )
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    files = db.relationship(
        'CorpusFile',
        back_populates='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='corpus',
        cascade='all, delete-orphan'
    )
    followers = association_proxy(
        'corpus_follower_associations',
        'follower',
        creator=lambda u: CorpusFollowerAssociation(follower=u)
    )
    user = db.relationship('User', back_populates='corpora')
    # "static" attributes
    max_num_tokens = 2_147_483_647  # equals 2**31 - 1

    def __repr__(self):
        # The previous implementation returned an empty string, which made
        # instances indistinguishable in logs and debuggers.
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        '''URL of the 'corpora.analysis' endpoint for this corpus.'''
        return url_for('corpora.analysis', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        '''Path of this corpus below the owning user's jsonpatch path.'''
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self) -> Path:
        '''Directory of this corpus: <user path>/corpora/<id>.'''
        return self.user.path / 'corpora' / f'{self.id}'

    @property
    def url(self):
        '''URL of the 'corpora.corpus' endpoint for this corpus.'''
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        '''Hashid of the owning user.'''
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        '''
        Create a corpus and its directory layout.

        The corpus is added to the session and flushed (not committed), then
        the directories 'files', 'cwb', 'cwb/data' and 'cwb/registry' are
        created below the corpus path.

        Args:
            **kwargs: Passed on to the Corpus constructor.

        Returns:
            The new Corpus instance.

        Raises:
            OSError: A directory could not be created. The error is logged
                and the session is rolled back before re-raising.
        '''
        corpus = Corpus(**kwargs)
        db.session.add(corpus)
        # Flush and refresh so that the generated id, which the path
        # property depends on, is available.
        db.session.flush(objects=[corpus])
        db.session.refresh(corpus)
        corpus_files_dir = corpus.path / 'files'
        corpus_cwb_dir = corpus.path / 'cwb'
        corpus_cwb_data_dir = corpus_cwb_dir / 'data'
        corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
        try:
            corpus.path.mkdir()
            corpus_files_dir.mkdir()
            corpus_cwb_dir.mkdir()
            corpus_cwb_data_dir.mkdir()
            corpus_cwb_registry_dir.mkdir()
        except OSError as e:
            # TODO: Potential leftover cleanup
            current_app.logger.error(e)
            db.session.rollback()
            raise
        return corpus

    def build(self):
        '''
        Merge all corpus files into a single 'cwb/corpus.vrt' file.

        The 'cwb' directory is recreated from scratch. Every file of the
        corpus is normalized, its root element is annotated with the file's
        metadata and appended to a common root element, which is finally
        written to 'cwb/corpus.vrt'. On success the status is set to
        SUBMITTED.

        If normalizing a file fails, the error is logged, the status is set
        to FAILED and the method returns without raising.

        Raises:
            OSError: The 'cwb' directories could not be recreated. The
                status is set to FAILED before re-raising.
        '''
        corpus_cwb_dir = self.path / 'cwb'
        corpus_cwb_data_dir = corpus_cwb_dir / 'data'
        corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
        try:
            shutil.rmtree(corpus_cwb_dir, ignore_errors=True)
            corpus_cwb_dir.mkdir()
            corpus_cwb_data_dir.mkdir()
            corpus_cwb_registry_dir.mkdir()
        except OSError as e:
            current_app.logger.error(e)
            self.status = CorpusStatus.FAILED
            raise
        # A root element is required: parsing a bare newline raises
        # xml.etree.ElementTree.ParseError.
        # NOTE(review): the tag name 'corpus' is inferred from the output
        # file name - confirm against the consumer of corpus.vrt.
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            normalized_vrt_path = \
                corpus_cwb_dir / f'{corpus_file.id}.norm.vrt'
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except Exception as e:
                # Deliberately best-effort: mark the corpus as failed
                # instead of propagating, but do not hide the cause.
                current_app.logger.error(e)
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            text_element.set('author', corpus_file.author)
            text_element.set('title', corpus_file.title)
            text_element.set(
                'publishing_year',
                f'{corpus_file.publishing_year}'
            )
            # Optional metadata falls back to the literal 'NULL'.
            text_element.set('address', corpus_file.address or 'NULL')
            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
            text_element.set('chapter', corpus_file.chapter or 'NULL')
            text_element.set('editor', corpus_file.editor or 'NULL')
            text_element.set(
                'institution',
                corpus_file.institution or 'NULL'
            )
            text_element.set('journal', corpus_file.journal or 'NULL')
            # Formatting None yields the truthy string 'None', so None has
            # to be handled before formatting for the fallback to apply.
            pages = corpus_file.pages
            text_element.set(
                'pages',
                ('' if pages is None else f'{pages}') or 'NULL'
            )
            text_element.set('publisher', corpus_file.publisher or 'NULL')
            text_element.set('school', corpus_file.school or 'NULL')
            text_element.tail = '\n'
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            corpus_cwb_dir / 'corpus.vrt',
            encoding='utf-8'
        )
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        '''
        Remove the corpus directory and mark the corpus for deletion.

        Errors while removing the directory are ignored. The database
        deletion is only registered with the session, not committed.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Build a JSON serializable dict representation of the corpus.

        Args:
            backrefs: If true, include the owning user under 'user'.
            relationships: If true, include 'corpus_follower_associations'
                and 'files', each as a dict keyed by hashid.

        Returns:
            A dict with the corpus fields; 'id' is the hashid and 'status'
            is the name of the status member.
        '''
        json_serializeable = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'title': self.title,
            'is_public': self.is_public
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['corpus_follower_associations'] = {
                x.hashid: x.to_json_serializeable()
                for x in self.corpus_follower_associations
            }
            json_serializeable['files'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.files
            }
        return json_serializeable