from app import db, login, mail, socketio
from app.converters.vrt import normalize_vrt_file
from app.email import create_message
from datetime import datetime, timedelta
from enum import Enum, IntEnum
from flask import current_app, url_for
from flask_hashids import HashidMixin
from flask_login import UserMixin
from time import sleep
from tqdm import tqdm
from werkzeug.security import generate_password_hash, check_password_hash
import base64
import json
import jwt
import os
import requests
import shutil
import xml.etree.ElementTree as ET
import yaml


# Upper bound (in seconds) for the outgoing HTTP requests made by this module,
# so that a stalled remote host cannot block the process forever.
_HTTP_TIMEOUT = 30

# NOTE(review): this request is executed at import time; importing the module
# therefore requires network access and fails if the endpoint is unreachable.
TRANSKRIBUS_HTR_MODELS = json.loads(
    requests.get(
        'https://transkribus.eu/TrpServer/rest/models/text',
        params={'docType': 'handwritten'},
        timeout=_HTTP_TIMEOUT
    ).content
)['trpModelMetadata']


##############################################################################
# enums                                                                      #
##############################################################################
# region enums
class CorpusStatus(IntEnum):
    ''' Status values stored in Corpus.status. '''
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9


class JobStatus(IntEnum):
    ''' Status values stored in Job.status. '''
    INITIALIZING = 1
    SUBMITTED = 2
    QUEUED = 3
    RUNNING = 4
    CANCELING = 5
    CANCELED = 6
    COMPLETED = 7
    FAILED = 8


class Permission(IntEnum):
    '''
    Defines User permissions as integers by the power of 2. User permission
    can be evaluated using the bitwise operator &.
    '''
    ADMINISTRATE = 1
    CONTRIBUTE = 2
    USE_API = 4


class UserSettingJobStatusMailNotificationLevel(IntEnum):
    ''' Values of the User.setting_job_status_mail_notification_level field. '''
    NONE = 1
    END = 2
    ALL = 3
# endregion enums


##############################################################################
# mixins                                                                     #
##############################################################################
# region mixins
class FileMixin:
    '''
    Mixin for db.Model classes. All file related models should use this.
    '''
    creation_date = db.Column(db.DateTime, default=datetime.utcnow)
    filename = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime, default=datetime.utcnow)
    mimetype = db.Column(db.String(255))

    def file_mixin_to_dict(self, backrefs=False, relationships=False):
        '''
        Return the mixin's columns as a dict. Dates are rendered as ISO 8601
        strings with a trailing 'Z'. Both parameters are accepted for
        signature parity with the to_dict methods and are not used.
        '''
        return {
            'creation_date': self.creation_date.isoformat() + 'Z',
            'filename': self.filename,
            'last_edited_date': self.last_edited_date.isoformat() + 'Z',
            'mimetype': self.mimetype
        }
# endregion mixins


##############################################################################
# type_decorators                                                            #
##############################################################################
# region type_decorators
class IntEnumColumn(db.TypeDecorator):
    '''
    Integer column that is exposed as a member of the given enum type.
    '''
    impl = db.Integer

    def __init__(self, enum_type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.enum_type = enum_type

    def process_bind_param(self, value, dialect):
        '''
        Convert an enum member (or a plain int that maps to a member) to the
        integer that gets stored. None is passed through as NULL.

        Raises TypeError for any other type and ValueError (from the enum
        lookup) for an int without a matching member.
        '''
        if value is None:
            return None
        if isinstance(value, self.enum_type) and isinstance(value.value, int):
            return value.value
        if isinstance(value, int):
            # Round trip through the enum to reject unknown integers.
            return self.enum_type(value).value
        raise TypeError(
            f'Expected {self.enum_type.__name__} or int, '
            f'got {type(value).__name__}'
        )

    def process_result_value(self, value, dialect):
        ''' Convert the stored integer back to an enum member. '''
        if value is None:
            return None
        return self.enum_type(value)


class ContainerColumn(db.TypeDecorator):
    '''
    String column that stores a JSON serialized container (e.g. list, dict).
    '''
    impl = db.String

    def __init__(self, container_type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.container_type = container_type

    def process_bind_param(self, value, dialect):
        '''
        Serialize a container_type instance to JSON. A str is accepted as is
        when it decodes to a container_type instance. None is passed through
        as NULL.

        Raises TypeError for any other value; a str that is not valid JSON
        raises the decoder's error.
        '''
        if value is None:
            return None
        if isinstance(value, self.container_type):
            return json.dumps(value)
        if (
            isinstance(value, str)
            and isinstance(json.loads(value), self.container_type)
        ):
            return value
        raise TypeError(
            f'Expected {self.container_type.__name__} or a JSON string '
            f'encoding one, got {type(value).__name__}'
        )

    def process_result_value(self, value, dialect):
        ''' Deserialize the stored JSON string. '''
        if value is None:
            return None
        return json.loads(value)
# endregion type_decorators


##############################################################################
# Models                                                                     #
##############################################################################
# region models
class Role(HashidMixin, db.Model):
    ''' A named set of permissions, stored as a bit field. '''
    __tablename__ = 'roles'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Fields
    default = db.Column(db.Boolean, default=False, index=True)
    name = db.Column(db.String(64), unique=True)
    permissions = db.Column(db.Integer)
    # Relationships
    users = db.relationship('User', backref='role', lazy='dynamic')

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The column has no default; start without any permission.
        if self.permissions is None:
            self.permissions = 0

    def __repr__(self):
        return f'<Role {self.name}>'

    def add_permission(self, permission):
        ''' Set the permission bit(s) unless they are already set. '''
        if not self.has_permission(permission):
            self.permissions += permission

    def has_permission(self, permission):
        ''' Return True if all bits of permission are set. '''
        return self.permissions & permission == permission

    def remove_permission(self, permission):
        ''' Clear the permission bit(s) if they are set. '''
        if self.has_permission(permission):
            self.permissions -= permission

    def reset_permissions(self):
        ''' Clear all permissions. '''
        self.permissions = 0

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation. With relationships=True the role's
        users are included, keyed by their hashid.
        '''
        dict_role = {
            'id': self.hashid,
            'default': self.default,
            'name': self.name,
            'permissions': self.permissions
        }
        if relationships:
            dict_role['users'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.users
            }
        return dict_role

    @staticmethod
    def insert_defaults():
        '''
        Create the default roles or reset the permissions of existing ones,
        then commit. The role named 'User' is flagged as the default role.
        '''
        roles = {
            'User': [],
            'API user': [Permission.USE_API],
            'Contributor': [Permission.CONTRIBUTE],
            'Administrator': [
                Permission.ADMINISTRATE,
                Permission.CONTRIBUTE,
                Permission.USE_API
            ]
        }
        default_role_name = 'User'
        for role_name, permissions in roles.items():
            role = Role.query.filter_by(name=role_name).first()
            if role is None:
                role = Role(name=role_name)
            role.reset_permissions()
            for permission in permissions:
                role.add_permission(permission)
            role.default = role.name == default_role_name
            db.session.add(role)
        db.session.commit()


class Token(db.Model):
    ''' Access/refresh token pair that belongs to a user. '''
    __tablename__ = 'tokens'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    access_token = db.Column(db.String(64), nullable=False, index=True)
    access_expiration = db.Column(db.DateTime, nullable=False)
    refresh_token = db.Column(db.String(64), nullable=False, index=True)
    refresh_expiration = db.Column(db.DateTime, nullable=False)

    # Disabled draft of a token generator, kept for reference:
    # def generate(self):
    #     header = {'alg': 'HS256', 'exp': int(time.time()) + expiration}
    #     payload = {'confirm': self.hashid}
    #     return jwt.encode(header, payload, current_app.config['SECRET_KEY'])
    #     self.access_token = secrets.token_urlsafe()
    #     self.access_expiration = datetime.utcnow() + \
    #         timedelta(minutes=current_app.config['ACCESS_TOKEN_MINUTES'])
    #     self.refresh_token = secrets.token_urlsafe()
    #     self.refresh_expiration = datetime.utcnow() + \
    #         timedelta(days=current_app.config['REFRESH_TOKEN_DAYS'])


class User(HashidMixin, UserMixin, db.Model):
    ''' An account that owns corpora, jobs and OCR/HTR models. '''
    __tablename__ = 'users'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    role_id = db.Column(db.Integer, db.ForeignKey('roles.id'))
    # Fields
    confirmed = db.Column(db.Boolean, default=False)
    email = db.Column(db.String(254), unique=True, index=True)
    last_seen = db.Column(db.DateTime(), default=datetime.utcnow)
    member_since = db.Column(db.DateTime(), default=datetime.utcnow)
    password_hash = db.Column(db.String(128))
    token = db.Column(db.String(32), index=True, unique=True)
    token_expiration = db.Column(db.DateTime)
    username = db.Column(db.String(64), unique=True, index=True)
    setting_dark_mode = db.Column(db.Boolean, default=False)
    setting_job_status_mail_notification_level = db.Column(
        IntEnumColumn(UserSettingJobStatusMailNotificationLevel),
        default=UserSettingJobStatusMailNotificationLevel.END
    )
    # Backrefs: role: Role
    # Relationships
    tesseract_ocr_models = db.relationship(
        'TesseractOCRModel',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    transkribus_htr_models = db.relationship(
        'TranskribusHTRModel',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    corpora = db.relationship(
        'Corpus',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    jobs = db.relationship(
        'Job',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if self.role is not None:
            return
        # No role given: the configured admin address gets the Administrator
        # role, everybody else the role flagged as default.
        if self.email == current_app.config['NOPAQUE_ADMIN']:
            self.role = Role.query.filter_by(name='Administrator').first()
        else:
            self.role = Role.query.filter_by(default=True).first()

    def __repr__(self):
        return f'<User {self.username}>'

    @property
    def jsonpatch_path(self):
        return f'/users/{self.hashid}'

    @property
    def password(self):
        raise AttributeError('password is not a readable attribute')

    @password.setter
    def password(self, password):
        # Only the hash is stored.
        self.password_hash = generate_password_hash(password)

    @property
    def path(self):
        ''' Directory of the user below NOPAQUE_DATA_DIR. '''
        return os.path.join(
            current_app.config.get('NOPAQUE_DATA_DIR'), 'users', str(self.id))

    def can(self, permission):
        ''' Return True if the user's role has the given permission. '''
        return self.role.has_permission(permission)

    def confirm_user(self, token):
        '''
        Mark the user as confirmed if token is a valid 'confirm_user' JWT
        issued for this user. Returns True on success, False otherwise. The
        change is added to the session but not committed.
        '''
        try:
            payload = jwt.decode(
                token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'confirm_user':
            return False
        if payload.get('sub') != self.id:
            return False
        self.confirmed = True
        db.session.add(self)
        return True

    def delete(self):
        ''' Remove the user's directory and mark the user for deletion. '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def generate_confirm_user_token(self, expiration=3600):
        '''
        Return a signed JWT for account confirmation that expires after
        expiration seconds.
        '''
        utc_now = datetime.utcnow()
        payload = {
            'exp': utc_now + timedelta(seconds=expiration),
            'iat': utc_now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'confirm_user',
            'sub': self.id
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def generate_password_reset_token(self, expiration=3600):
        '''
        Return a signed JWT for a password reset that expires after
        expiration seconds.
        '''
        utc_now = datetime.utcnow()
        payload = {
            'exp': utc_now + timedelta(seconds=expiration),
            'iat': utc_now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'reset_password',
            'sub': self.id
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def get_token(self, expires_in=3600):
        '''
        Return the user's token. The current token is reused while it stays
        valid for more than 60 seconds; otherwise a new random token that
        expires in expires_in seconds is generated and added to the session.
        '''
        now = datetime.utcnow()
        if self.token and self.token_expiration > now + timedelta(seconds=60):
            return self.token
        self.token = base64.b64encode(os.urandom(24)).decode('utf-8')
        self.token_expiration = now + timedelta(seconds=expires_in)
        db.session.add(self)
        return self.token

    def is_administrator(self):
        return self.can(Permission.ADMINISTRATE)

    def makedirs(self):
        ''' Create the user's directory tree. Raises OSError on failure. '''
        os.mkdir(self.path)
        os.mkdir(os.path.join(self.path, 'tesseract_ocr_models'))
        os.mkdir(os.path.join(self.path, 'corpora'))
        os.mkdir(os.path.join(self.path, 'jobs'))

    def revoke_token(self):
        # Expire the token by moving its expiration into the past.
        self.token_expiration = datetime.utcnow() - timedelta(seconds=1)

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation. backrefs=True adds the role,
        relationships=True adds corpora, jobs and tesseract_ocr_models, each
        keyed by hashid.
        '''
        dict_user = {
            'id': self.hashid,
            'role_id': self.role.hashid,
            'confirmed': self.confirmed,
            'email': self.email,
            'last_seen': self.last_seen.isoformat() + 'Z',
            'member_since': self.member_since.isoformat() + 'Z',
            'username': self.username,
            'settings': {
                'dark_mode': self.setting_dark_mode,
                'job_status_mail_notification_level':
                    self.setting_job_status_mail_notification_level.name
            }
        }
        if backrefs:
            dict_user['role'] = self.role.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            dict_user['corpora'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.corpora
            }
            dict_user['jobs'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.jobs
            }
            dict_user['tesseract_ocr_models'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.tesseract_ocr_models
            }
        return dict_user

    def verify_password(self, password):
        return check_password_hash(self.password_hash, password)

    @staticmethod
    def check_token(token):
        ''' Return the user owning the unexpired token, else None. '''
        user = User.query.filter_by(token=token).first()
        if user is None or user.token_expiration < datetime.utcnow():
            return None
        return user

    @staticmethod
    def insert_defaults():
        '''
        Create the 'nopaque' user and its directories if the user does not
        exist yet. A failing directory creation is logged and rolls the
        insert back.
        '''
        if User.query.filter_by(username='nopaque').first() is not None:
            return
        user = User(username='nopaque')
        db.session.add(user)
        # Flush and refresh so that user.id (needed for the path) is set.
        db.session.flush(objects=[user])
        db.session.refresh(user)
        try:
            user.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
        db.session.commit()

    @staticmethod
    def reset_password(token, new_password):
        '''
        Set new_password for the user named in a valid 'reset_password' JWT.
        Returns True on success, False otherwise. The change is added to the
        session but not committed.
        '''
        try:
            payload = jwt.decode(
                token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'reset_password':
            return False
        user_id = payload.get('sub')
        if user_id is None:
            return False
        user = User.query.get(user_id)
        if user is None:
            return False
        user.password = new_password
        db.session.add(user)
        return True


class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
    ''' A Tesseract OCR model file together with its metadata. '''
    __tablename__ = 'tesseract_ocr_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    description = db.Column(db.String(255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    shared = db.Column(db.Boolean, default=False)
    title = db.Column(db.String(64))
    version = db.Column(db.String(16))
    # Backrefs: user: User

    @property
    def path(self):
        return os.path.join(
            self.user.path,
            'tesseract_ocr_models',
            str(self.id)
        )

    def to_dict(self, backrefs=False, relationships=False):
        ''' Return a dict representation; backrefs=True adds the user. '''
        dict_tesseract_ocr_model = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'shared': self.shared,
            'title': self.title,
            **self.file_mixin_to_dict()
        }
        if backrefs:
            dict_tesseract_ocr_model['user'] = self.user.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            pass
        return dict_tesseract_ocr_model

    @staticmethod
    def insert_defaults():
        '''
        Synchronize the models listed in TesseractOCRModel.defaults.yml
        (located next to this file) with the database. Existing entries,
        matched by title and version, get their metadata updated. Missing
        entries are created for the 'nopaque' user and their model file is
        downloaded from the entry's url.

        Raises requests.HTTPError if a download answers with an error status.
        '''
        user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'TesseractOCRModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            defaults = yaml.safe_load(f)
        for m in defaults:
            model = TesseractOCRModel.query.filter_by(
                title=m['title'],
                version=m['version']
            ).first()
            if model is not None:
                model.compatible_service_versions = \
                    m['compatible_service_versions']
                model.description = m['description']
                model.publisher = m['publisher']
                model.publisher_url = m['publisher_url']
                model.publishing_url = m['publishing_url']
                model.publishing_year = m['publishing_year']
                model.shared = True
                model.title = m['title']
                model.version = m['version']
                continue
            model = TesseractOCRModel(
                compatible_service_versions=m['compatible_service_versions'],
                description=m['description'],
                publisher=m['publisher'],
                publisher_url=m['publisher_url'],
                publishing_url=m['publishing_url'],
                publishing_year=m['publishing_year'],
                shared=True,
                title=m['title'],
                user=user,
                version=m['version']
            )
            db.session.add(model)
            # Flush and refresh so that model.id (used for the filename and
            # path) is set.
            db.session.flush(objects=[model])
            db.session.refresh(model)
            model.filename = f'{model.id}.traineddata'
            r = requests.get(m['url'], stream=True, timeout=_HTTP_TIMEOUT)
            # Do not store an HTTP error page as model file.
            r.raise_for_status()
            # The header is missing for e.g. chunked responses; tqdm accepts
            # total=None and then shows progress without a total.
            content_length = r.headers.get('Content-Length')
            pbar = tqdm(
                desc=f'{model.title} ({model.filename})',
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                total=None if content_length is None else int(content_length)
            )
            pbar.clear()
            with open(model.path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        pbar.update(len(chunk))
                        f.write(chunk)
            pbar.close()
        db.session.commit()


class TranskribusHTRModel(HashidMixin, db.Model):
    ''' Reference to a HTR model that is hosted by Transkribus. '''
    __tablename__ = 'transkribus_htr_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    shared = db.Column(db.Boolean, default=False)
    transkribus_model_id = db.Column(db.Integer)
    transkribus_name = db.Column(db.String(64))
    # Backrefs: user: User

    def to_dict(self, backrefs=False, relationships=False):
        ''' Return a dict representation; backrefs=True adds the user. '''
        dict_tesseract_ocr_model = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'shared': self.shared,
            'transkribus_model_id': self.transkribus_model_id,
        }
        if backrefs:
            dict_tesseract_ocr_model['user'] = \
                self.user.to_dict(backrefs=True, relationships=False)
        if relationships:
            pass
        return dict_tesseract_ocr_model

    @staticmethod
    def insert_defaults():
        '''
        Make sure every entry of TRANSKRIBUS_HTR_MODELS has a shared database
        record, owned by the 'nopaque' user when newly created.
        '''
        user = User.query.filter_by(username='nopaque').first()
        # models = [
        #     m for m in TRANSKRIBUS_HTR_MODELS if True
        #     and 'creator' in m and m['creator'] == 'Transkribus Team'
        #     and 'docType' in m and m['docType'] == 'handwritten'
        # ]
        for m in TRANSKRIBUS_HTR_MODELS:
            model = TranskribusHTRModel.query.filter_by(
                transkribus_model_id=m['modelId']
            ).first()
            if model is not None:
                model.shared = True
                model.transkribus_model_id = m['modelId']
                continue
            model = TranskribusHTRModel(
                shared=True,
                transkribus_model_id=m['modelId'],
                user=user,
            )
            db.session.add(model)
        db.session.commit()


class JobInput(FileMixin, HashidMixin, db.Model):
    ''' An input file of a job. '''
    __tablename__ = 'job_inputs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Backrefs: job: Job

    def __repr__(self):
        return f'<JobInput {self.filename}>'

    @property
    def download_url(self):
        return url_for(
            'jobs.download_job_input',
            job_id=self.job.id,
            job_input_id=self.id
        )

    @property
    def jsonpatch_path(self):
        return f'{self.job.jsonpatch_path}/inputs/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.job.path, 'inputs', str(self.id))

    def to_dict(self, backrefs=False, relationships=False):
        ''' Return a dict representation; backrefs=True adds the job. '''
        dict_job_input = {
            'id': self.hashid,
            'job_id': self.job.hashid,
            'download_url': self.download_url,
            'url': self.url,
            **self.file_mixin_to_dict()
        }
        if backrefs:
            dict_job_input['job'] = self.job.to_dict(
                backrefs=True, relationships=False)
        return dict_job_input

    @property
    def url(self):
        # Links to the job page, anchored at this input.
        return url_for(
            'jobs.job',
            job_id=self.job_id,
            _anchor=f'job-{self.job.hashid}-input-{self.hashid}'
        )

    @property
    def user_hashid(self):
        return self.job.user.hashid

    @property
    def user_id(self):
        return self.job.user_id


class JobResult(FileMixin, HashidMixin, db.Model):
    ''' A result file of a job. '''
    __tablename__ = 'job_results'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Fields
    description = db.Column(db.String(255))
    # Backrefs: job: Job

    def __repr__(self):
        return f'<JobResult {self.filename}>'

    @property
    def download_url(self):
        return url_for(
            'jobs.download_job_result',
            job_id=self.job_id,
            job_result_id=self.id
        )

    @property
    def jsonpatch_path(self):
        return f'{self.job.jsonpatch_path}/results/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.job.path, 'results', str(self.id))

    def to_dict(self, backrefs=False, relationships=False):
        ''' Return a dict representation; backrefs=True adds the job. '''
        dict_job_result = {
            'id': self.hashid,
            'job_id': self.job.hashid,
            'description': self.description,
            'download_url': self.download_url,
            'url': self.url,
            **self.file_mixin_to_dict(
                backrefs=backrefs, relationships=relationships)
        }
        if backrefs:
            dict_job_result['job'] = self.job.to_dict(
                backrefs=True, relationships=False)
        return dict_job_result

    @property
    def url(self):
        # Links to the job page, anchored at this result.
        return url_for(
            'jobs.job',
            job_id=self.job_id,
            _anchor=f'job-{self.job.hashid}-result-{self.hashid}'
        )

    @property
    def user_hashid(self):
        return self.job.user.hashid

    @property
    def user_id(self):
        return self.job.user_id


class Job(HashidMixin, db.Model):
    '''
    Class to define Jobs.
    '''
    __tablename__ = 'jobs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    end_date = db.Column(db.DateTime())
    service = db.Column(db.String(64))
    service_args = db.Column(ContainerColumn(dict, 255))
    service_version = db.Column(db.String(16))
    status = db.Column(
        IntEnumColumn(JobStatus),
        default=JobStatus.INITIALIZING
    )
    title = db.Column(db.String(32))
    # Backrefs: user: User
    # Relationships
    inputs = db.relationship(
        'JobInput',
        backref='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    results = db.relationship(
        'JobResult',
        backref='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __repr__(self):
        return f'<Job {self.title}>'

    @property
    def jsonpatch_path(self):
        return f'{self.user.jsonpatch_path}/jobs/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.user.path, 'jobs', str(self.id))

    @property
    def url(self):
        return url_for('jobs.job', job_id=self.id)

    @property
    def user_hashid(self):
        return self.user.hashid

    def delete(self):
        '''
        Delete the job and its inputs and results from the database.

        A job that is neither completed nor failed is set to CANCELING first;
        the call then polls the database once per second and blocks until
        the status reads CANCELED (it never does so itself).
        '''
        if self.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:
            self.status = JobStatus.CANCELING
            db.session.commit()
            while self.status != JobStatus.CANCELED:
                # In case the daemon handled a job in any way
                if self.status != JobStatus.CANCELING:
                    self.status = JobStatus.CANCELING
                    db.session.commit()
                sleep(1)
                db.session.refresh(self)
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def makedirs(self):
        ''' Create the job's directory tree. Raises OSError on failure. '''
        os.mkdir(self.path)
        os.mkdir(os.path.join(self.path, 'inputs'))
        os.mkdir(os.path.join(self.path, 'pipeline_data'))
        os.mkdir(os.path.join(self.path, 'results'))

    def restart(self):
        '''
        Restart a job - only if the status is complete or failed
        '''
        if self.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:
            raise Exception('Could not restart job: status is not "completed/failed"')  # noqa
        shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
        # NOTE(review): makedirs() creates 'pipeline_data', whereas
        # 'pyflow.data' is removed here - confirm the directory name.
        shutil.rmtree(
            os.path.join(self.path, 'pyflow.data'),
            ignore_errors=True
        )
        for result in self.results:
            db.session.delete(result)
        self.end_date = None
        self.status = JobStatus.SUBMITTED

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation. backrefs=True adds the user,
        relationships=True adds inputs and results keyed by hashid.
        '''
        dict_job = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'creation_date': self.creation_date.isoformat() + 'Z',
            'description': self.description,
            'end_date': (
                None if self.end_date is None
                else f'{self.end_date.isoformat()}Z'
            ),
            'service': self.service,
            'service_args': self.service_args,
            'service_version': self.service_version,
            'status': self.status.name,
            'title': self.title,
            'url': self.url
        }
        if backrefs:
            dict_job['user'] = self.user.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            dict_job['inputs'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.inputs
            }
            dict_job['results'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.results
            }
        return dict_job


class CorpusFile(FileMixin, HashidMixin, db.Model):
    ''' A file of a corpus together with its bibliographic metadata. '''
    __tablename__ = 'corpus_files'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
    # Fields
    address = db.Column(db.String(255))
    author = db.Column(db.String(255))
    booktitle = db.Column(db.String(255))
    chapter = db.Column(db.String(255))
    editor = db.Column(db.String(255))
    institution = db.Column(db.String(255))
    journal = db.Column(db.String(255))
    pages = db.Column(db.String(255))
    publisher = db.Column(db.String(255))
    publishing_year = db.Column(db.Integer)
    school = db.Column(db.String(255))
    title = db.Column(db.String(255))
    # Backrefs: corpus: Corpus

    @property
    def download_url(self):
        return url_for(
            'corpora.download_corpus_file',
            corpus_id=self.corpus_id,
            corpus_file_id=self.id
        )

    @property
    def jsonpatch_path(self):
        return f'{self.corpus.jsonpatch_path}/files/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.corpus.path, 'files', str(self.id))

    @property
    def url(self):
        return url_for(
            'corpora.corpus_file',
            corpus_id=self.corpus_id,
            corpus_file_id=self.id
        )

    @property
    def user_hashid(self):
        return self.corpus.user.hashid

    @property
    def user_id(self):
        return self.corpus.user_id

    def delete(self):
        '''
        Remove the file from disk (a failure is only logged), mark the record
        for deletion and reset the corpus status to UNPREPARED.
        '''
        try:
            os.remove(self.path)
        except OSError:
            current_app.logger.error(
                f'Removing {self.path} led to an OSError!'
            )
        db.session.delete(self)
        self.corpus.status = CorpusStatus.UNPREPARED

    def to_dict(self, backrefs=False, relationships=False):
        ''' Return a dict representation; backrefs=True adds the corpus. '''
        dict_corpus_file = {
            'id': self.hashid,
            'corpus_id': self.corpus.hashid,
            'download_url': self.download_url,
            'url': self.url,
            'address': self.address,
            'author': self.author,
            'booktitle': self.booktitle,
            'chapter': self.chapter,
            'editor': self.editor,
            'institution': self.institution,
            'journal': self.journal,
            'pages': self.pages,
            'publisher': self.publisher,
            'publishing_year': self.publishing_year,
            'school': self.school,
            'title': self.title,
            **self.file_mixin_to_dict(
                backrefs=backrefs, relationships=relationships)
        }
        if backrefs:
            dict_corpus_file['corpus'] = self.corpus.to_dict(
                backrefs=True, relationships=False)
        return dict_corpus_file


class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime(), default=datetime.utcnow)
    status = db.Column(
        IntEnumColumn(CorpusStatus),
        default=CorpusStatus.UNPREPARED
    )
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    # Backrefs: user: User
    # Relationships
    files = db.relationship(
        'CorpusFile',
        backref='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    # "static" attributes
    max_num_tokens = 2_147_483_647

    def __repr__(self):
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        return url_for('corpora.analyse_corpus', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.user.path, 'corpora', str(self.id))

    @property
    def url(self):
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        return self.user.hashid

    def build(self):
        '''
        Normalize every corpus file, annotate the root element of each
        normalized file with the file's metadata and write all of them into
        cwb/corpus.vrt. On success the status is set to SUBMITTED; if a file
        cannot be normalized the status is set to FAILED and nothing is
        written.
        '''
        # An XML document needs a root element to be parseable.
        # NOTE(review): the tag name 'corpus' is inferred from the variable
        # and output file names - confirm against the consumer of corpus.vrt.
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            normalized_vrt_path = os.path.join(
                self.path,
                'cwb',
                f'{corpus_file.id}.norm.vrt'
            )
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except Exception:
                # Keep the cause visible instead of failing silently.
                current_app.logger.exception(
                    f'Normalizing {corpus_file.path} failed'
                )
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            # Unset optional metadata is written as the string 'NULL'.
            text_element.set('address', corpus_file.address or 'NULL')
            text_element.set('author', corpus_file.author)
            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
            text_element.set('chapter', corpus_file.chapter or 'NULL')
            text_element.set('editor', corpus_file.editor or 'NULL')
            text_element.set('institution', corpus_file.institution or 'NULL')
            text_element.set('journal', corpus_file.journal or 'NULL')
            text_element.set('pages', corpus_file.pages or 'NULL')
            text_element.set('publisher', corpus_file.publisher or 'NULL')
            text_element.set(
                'publishing_year',
                str(corpus_file.publishing_year)
            )
            text_element.set('school', corpus_file.school or 'NULL')
            text_element.set('title', corpus_file.title)
            text_element.tail = '\n'
            # corpus_element.insert(1, text_element)
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            os.path.join(self.path, 'cwb', 'corpus.vrt'),
            encoding='utf-8'
        )
        self.last_edited_date = datetime.utcnow()
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        ''' Remove the corpus directory and mark the corpus for deletion. '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def makedirs(self):
        ''' Create the corpus directory tree. Raises OSError on failure. '''
        os.mkdir(self.path)
        os.mkdir(os.path.join(self.path, 'files'))
        os.mkdir(os.path.join(self.path, 'cwb'))
        os.mkdir(os.path.join(self.path, 'cwb', 'data'))
        os.mkdir(os.path.join(self.path, 'cwb', 'registry'))

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation. backrefs=True adds the user,
        relationships=True adds the files keyed by hashid.
        '''
        dict_corpus = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'analysis_url': self.analysis_url,
            'url': self.url,
            'creation_date': self.creation_date.isoformat() + 'Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'last_edited_date': self.last_edited_date.isoformat() + 'Z',
            'title': self.title
        }
        if backrefs:
            dict_corpus['user'] = self.user.to_dict(
                backrefs=True,
                relationships=False
            )
        if relationships:
            dict_corpus['files'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.files
            }
        return dict_corpus
# endregion models


##############################################################################
# event_handlers                                                             #
##############################################################################
# region event_handlers
@db.event.listens_for(Corpus, 'after_delete')
@db.event.listens_for(CorpusFile, 'after_delete')
@db.event.listens_for(Job, 'after_delete')
@db.event.listens_for(JobInput, 'after_delete')
@db.event.listens_for(JobResult, 'after_delete')
def ressource_after_delete(mapper, connection, ressource):
    '''
    Emit a JSON Patch 'remove' operation for the deleted resource to the
    rooms of the owning user. The patch is sent twice, as 'users.patch' to
    room 'users.<hashid>' and as 'PATCH' to room '/users/<hashid>'.
    '''
    jsonpatch = [{'op': 'remove', 'path': ressource.jsonpatch_path}]
    room = f'users.{ressource.user_hashid}'
    socketio.emit('users.patch', jsonpatch, room=room)
    room = f'/users/{ressource.user_hashid}'
    socketio.emit('PATCH', jsonpatch, room=room)


@db.event.listens_for(Corpus, 'after_insert')
@db.event.listens_for(CorpusFile, 'after_insert')
@db.event.listens_for(Job, 'after_insert')
@db.event.listens_for(JobInput, 'after_insert')
@db.event.listens_for(JobResult, 'after_insert')
def ressource_after_insert_handler(mapper, connection, ressource):
    '''
    Emit a JSON Patch 'add' operation with the new resource's dict
    representation to the room of the owning user.
    '''
    value = ressource.to_dict(backrefs=False, relationships=False)
    # Every relationship is represented by an empty dict.
    for attr in mapper.relationships:
        value[attr.key] = {}
    jsonpatch = [
        {'op': 'add', 'path': ressource.jsonpatch_path, 'value': value}
    ]
    room = f'/users/{ressource.user_hashid}'
    socketio.emit('PATCH', jsonpatch, room=room)


@db.event.listens_for(Corpus, 'after_update')
@db.event.listens_for(CorpusFile, 'after_update')
@db.event.listens_for(Job, 'after_update')
@db.event.listens_for(JobInput, 'after_update')
@db.event.listens_for(JobResult, 'after_update')
def ressource_after_update_handler(mapper, connection, ressource):
    '''
    Emit one JSON Patch 'replace' operation per changed, non-relationship
    attribute to the room of the owning user. Nothing is emitted when no
    attribute changed.
    '''
    jsonpatch = []
    for attr in db.inspect(ressource).attrs:
        if attr.key in mapper.relationships:
            continue
        if not attr.load_history().has_changes():
            continue
        # Render values the same way the to_dict methods do.
        if isinstance(attr.value, datetime):
            value = attr.value.isoformat() + 'Z'
        elif isinstance(attr.value, Enum):
            value = attr.value.name
        else:
            value = attr.value
        jsonpatch.append(
            {
                'op': 'replace',
                'path': f'{ressource.jsonpatch_path}/{attr.key}',
                'value': value
            }
        )
    if jsonpatch:
        room = f'/users/{ressource.user_hashid}'
        socketio.emit('PATCH', jsonpatch, room=room)


@db.event.listens_for(Job, 'after_update')
def job_after_update_handler(mapper, connection, job):
    '''
    Send a status notification mail to the job's owner when the job status
    changed, honoring the owner's notification level: NONE never sends, END
    sends only for COMPLETED or FAILED, any other level always sends.
    '''
    notification_level = job.user.setting_job_status_mail_notification_level
    for attr in db.inspect(job).attrs:
        if attr.key != 'status':
            continue
        if not attr.load_history().has_changes():
            return
        if notification_level == UserSettingJobStatusMailNotificationLevel.NONE:  # noqa
            return
        if notification_level == UserSettingJobStatusMailNotificationLevel.END:  # noqa
            if job.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:
                return
        msg = create_message(
            job.user.email,
            f'Status update for your Job "{job.title}"',
            'tasks/email/notification',
            job=job
        )
        mail.send(msg)
# endregion event_handlers


##############################################################################
# misc                                                                       #
##############################################################################
# region misc
@login.user_loader
def load_user(user_id):
    '''
    Return the user for the given session id, or None if the id is not a
    valid integer or no such user exists.
    '''
    try:
        primary_key = int(user_id)
    except (TypeError, ValueError):
        # A malformed id must yield None instead of an exception.
        return None
    return User.query.get(primary_key)
# endregion misc