nopaque/app/models.py

1816 lines
62 KiB
Python
Raw Normal View History

from datetime import datetime, timedelta
from enum import Enum, IntEnum
from flask import abort, current_app, url_for
2021-12-13 12:20:01 +01:00
from flask_hashids import HashidMixin
from flask_login import UserMixin
2023-01-23 11:50:12 +01:00
from sqlalchemy.ext.associationproxy import association_proxy
from time import sleep
from tqdm import tqdm
2023-02-23 13:05:04 +01:00
from typing import Union
from werkzeug.security import generate_password_hash, check_password_hash
2022-09-02 13:07:30 +02:00
from werkzeug.utils import secure_filename
import json
2022-07-18 17:10:09 +02:00
import jwt
import os
2023-03-29 14:32:52 +02:00
import re
import requests
2022-09-02 13:24:14 +02:00
import secrets
import shutil
import xml.etree.ElementTree as ET
import yaml
2022-09-02 13:07:30 +02:00
from app import db, hashids, login, mail, socketio
from app.converters.vrt import normalize_vrt_file
from app.email import create_message
##############################################################################
# enums #
##############################################################################
# region enums
class CorpusStatus(IntEnum):
    '''
    Status values a corpus can take, stored as integers.
    '''
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9

    @staticmethod
    def get(corpus_status: Union['CorpusStatus', int, str]) -> 'CorpusStatus':
        '''
        Coerce a member, an integer value or a member name to a
        CorpusStatus.

        Raises TypeError for any other argument type. Unknown integers and
        unknown names fail inside the Enum lookup itself (ValueError and
        KeyError respectively).
        '''
        if isinstance(corpus_status, CorpusStatus):
            resolved = corpus_status
        elif isinstance(corpus_status, int):
            # Lookup by value
            resolved = CorpusStatus(corpus_status)
        elif isinstance(corpus_status, str):
            # Lookup by member name
            resolved = CorpusStatus[corpus_status]
        else:
            raise TypeError('corpus_status must be CorpusStatus, int, or str')
        return resolved
class JobStatus(IntEnum):
    '''
    Status values a job can take, stored as integers.
    '''
    INITIALIZING = 1
    SUBMITTED = 2
    QUEUED = 3
    RUNNING = 4
    CANCELING = 5
    CANCELED = 6
    COMPLETED = 7
    FAILED = 8

    @staticmethod
    def get(job_status: Union['JobStatus', int, str]) -> 'JobStatus':
        '''
        Coerce a member, an integer value or a member name to a JobStatus.

        Raises TypeError for any other argument type. Unknown integers and
        unknown names fail inside the Enum lookup itself (ValueError and
        KeyError respectively).
        '''
        if isinstance(job_status, JobStatus):
            resolved = job_status
        elif isinstance(job_status, int):
            # Lookup by value
            resolved = JobStatus(job_status)
        elif isinstance(job_status, str):
            # Lookup by member name
            resolved = JobStatus[job_status]
        else:
            raise TypeError('job_status must be JobStatus, int, or str')
        return resolved
class Permission(IntEnum):
    '''
    User permissions. Every member is a distinct power of 2, so a set of
    permissions fits into one integer and can be tested with the bitwise
    operator &.
    '''
    ADMINISTRATE = 1
    CONTRIBUTE = 2
    USE_API = 4

    @staticmethod
    def get(permission: Union['Permission', int, str]) -> 'Permission':
        '''
        Coerce a member, an integer value or a member name to a Permission.

        Raises TypeError for any other argument type. Unknown integers and
        unknown names fail inside the Enum lookup itself (ValueError and
        KeyError respectively).
        '''
        if isinstance(permission, Permission):
            resolved = permission
        elif isinstance(permission, int):
            # Lookup by value
            resolved = Permission(permission)
        elif isinstance(permission, str):
            # Lookup by member name
            resolved = Permission[permission]
        else:
            raise TypeError('permission must be Permission, int, or str')
        return resolved
class UserSettingJobStatusMailNotificationLevel(IntEnum):
    # Values for a user's job status mail notification setting.
    # NOTE(review): presumably NONE = no mails, END = only when a job
    # ends, ALL = every status change - confirm against the mail sender.
    NONE = 1
    END = 2
    ALL = 3
2022-12-13 15:01:04 +01:00
class ProfilePrivacySettings(IntEnum):
    '''
    Profile privacy flags. Every member is a distinct power of 2, so a
    combination of flags fits into one integer.
    '''
    SHOW_EMAIL = 1
    SHOW_LAST_SEEN = 2
    SHOW_MEMBER_SINCE = 4

    @staticmethod
    def get(profile_privacy_setting: Union['ProfilePrivacySettings', int, str]) -> 'ProfilePrivacySettings':
        '''
        Coerce a member, an integer value or a member name to a
        ProfilePrivacySettings member.

        Raises TypeError for any other argument type. Unknown integers and
        unknown names fail inside the Enum lookup itself (ValueError and
        KeyError respectively).
        '''
        if isinstance(profile_privacy_setting, ProfilePrivacySettings):
            resolved = profile_privacy_setting
        elif isinstance(profile_privacy_setting, int):
            # Lookup by value
            resolved = ProfilePrivacySettings(profile_privacy_setting)
        elif isinstance(profile_privacy_setting, str):
            # Lookup by member name
            resolved = ProfilePrivacySettings[profile_privacy_setting]
        else:
            raise TypeError('profile_privacy_setting must be ProfilePrivacySettings, int, or str')
        return resolved
2023-02-20 10:40:33 +01:00
class CorpusFollowerPermission(IntEnum):
    '''
    Permissions a corpus follower can hold. Every member is a distinct
    power of 2, so a set of permissions fits into one integer.
    '''
    VIEW = 1
    MANAGE_FILES = 2
    MANAGE_FOLLOWERS = 4
    MANAGE_CORPUS = 8

    @staticmethod
    def get(corpus_follower_permission: Union['CorpusFollowerPermission', int, str]) -> 'CorpusFollowerPermission':
        '''
        Coerce a member, an integer value or a member name to a
        CorpusFollowerPermission.

        Raises TypeError for any other argument type. Unknown integers and
        unknown names fail inside the Enum lookup itself (ValueError and
        KeyError respectively).
        '''
        if isinstance(corpus_follower_permission, CorpusFollowerPermission):
            resolved = corpus_follower_permission
        elif isinstance(corpus_follower_permission, int):
            # Lookup by value
            resolved = CorpusFollowerPermission(corpus_follower_permission)
        elif isinstance(corpus_follower_permission, str):
            # Lookup by member name
            resolved = CorpusFollowerPermission[corpus_follower_permission]
        else:
            raise TypeError('corpus_follower_permission must be CorpusFollowerPermission, int, or str')
        return resolved
# endregion enums
##############################################################################
# mixins #
##############################################################################
# region mixins
2021-11-30 16:22:16 +01:00
class FileMixin:
    '''
    Mixin for db.Model classes that represent a stored file. It contributes
    the shared columns, their JSON representation and a factory that
    persists an uploaded file. Using classes must provide a `path`
    attribute, because `create` saves the file there.
    '''
    creation_date = db.Column(db.DateTime, default=datetime.utcnow)
    filename = db.Column(db.String(255))
    mimetype = db.Column(db.String(255))

    def file_mixin_to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return the mixin columns as a JSON serializable dict. The creation
        date is rendered in ISO format with a trailing 'Z'. Both keyword
        arguments are accepted but not used.
        '''
        created = self.creation_date.isoformat()
        return {
            'creation_date': f'{created}Z',
            'filename': self.filename,
            'mimetype': self.mimetype
        }

    @classmethod
    def create(cls, file_storage, **kwargs):
        '''
        Create a database object for `file_storage` and save the file to
        the object's path.

        `filename` and `mimetype` may be overridden via keyword arguments,
        otherwise they are taken from `file_storage`. All remaining keyword
        arguments are passed to the model constructor. If saving fails, the
        error is logged, the session is rolled back and the error is
        re-raised.
        '''
        raw_filename = kwargs.pop('filename', file_storage.filename)
        mimetype = kwargs.pop('mimetype', file_storage.mimetype)
        instance = cls(
            filename=secure_filename(raw_filename),
            mimetype=mimetype,
            **kwargs
        )
        session = db.session
        session.add(instance)
        # Flush and refresh before saving, so the object has its database
        # generated values when the path is computed
        session.flush(objects=[instance])
        session.refresh(instance)
        try:
            file_storage.save(instance.path)
        except (AttributeError, OSError) as error:
            current_app.logger.error(error)
            db.session.rollback()
            raise
        return instance
# endregion mixins
2021-11-30 16:22:16 +01:00
##############################################################################
# type_decorators #
##############################################################################
# region type_decorators
class IntEnumColumn(db.TypeDecorator):
    '''
    Column type that stores members of an integer valued Enum as plain
    integers and converts them back to Enum members when loading.
    '''
    impl = db.Integer

    def __init__(self, enum_type, *args, **kwargs):
        '''
        `enum_type` is the Enum class handled by this column. All other
        arguments are passed on to the underlying type.
        '''
        super().__init__(*args, **kwargs)
        self.enum_type = enum_type

    def process_bind_param(self, value, dialect):
        '''
        Convert an Enum member, an integer value or a member name to the
        integer that gets stored.

        None is passed through, so NULL can be stored. Any other type
        raises TypeError. Unknown integers and unknown names fail inside
        the Enum lookup itself (ValueError and KeyError respectively).
        '''
        if value is None:
            return None
        if isinstance(value, self.enum_type) and isinstance(value.value, int):
            return value.value
        if isinstance(value, int):
            # Round trip through the Enum to reject invalid values
            return self.enum_type(value).value
        if isinstance(value, str):
            return self.enum_type[value].value
        # This exception was returned instead of raised before, which
        # handed the exception object to the database driver as the value
        raise TypeError(
            f'value must be {self.enum_type.__name__}, int, or str'
        )

    def process_result_value(self, value, dialect):
        '''
        Convert a stored integer back to a member of `enum_type`. A NULL
        column is returned as None.
        '''
        if value is None:
            return None
        return self.enum_type(value)
class ContainerColumn(db.TypeDecorator):
    '''
    Column type that stores a JSON serializable container (e.g. a list) as
    a JSON string and parses it again when loading.
    '''
    impl = db.String

    def __init__(self, container_type, *args, **kwargs):
        '''
        `container_type` is the Python type values must have. All other
        arguments are passed on to the underlying type.
        '''
        super().__init__(*args, **kwargs)
        self.container_type = container_type

    def process_bind_param(self, value, dialect):
        '''
        Convert a container, or a JSON string that encodes one, to the
        JSON string that gets stored.

        None is passed through, so NULL can be stored. Any other value
        raises TypeError. A string that is not valid JSON fails inside
        json.loads.
        '''
        if value is None:
            return None
        if isinstance(value, self.container_type):
            return json.dumps(value)
        if isinstance(value, str) and isinstance(json.loads(value), self.container_type):
            # Already serialized, store it unchanged
            return value
        # This exception was returned instead of raised before, which
        # handed the exception object to the database driver as the value
        raise TypeError(
            f'value must be {self.container_type.__name__} '
            f'or a JSON str encoding a {self.container_type.__name__}'
        )

    def process_result_value(self, value, dialect):
        '''
        Parse the stored JSON string. A NULL column is returned as None
        instead of failing inside json.loads.
        '''
        if value is None:
            return None
        return json.loads(value)
# endregion type_decorators
##############################################################################
# Models #
##############################################################################
# region models
2021-11-30 16:22:16 +01:00
class Role(HashidMixin, db.Model):
    '''
    A named set of user permissions. The permissions are kept as a bit
    field made of Permission values.
    '''
    __tablename__ = 'roles'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Fields
    name = db.Column(db.String(64), unique=True)
    default = db.Column(db.Boolean, default=False, index=True)
    permissions = db.Column(db.Integer, default=0)
    # Relationships
    users = db.relationship('User', back_populates='role', lazy='dynamic')

    def __repr__(self):
        return f'<Role {self.name}>'

    def has_permission(self, permission: Union[Permission, int, str]):
        '''Tell whether the bit of `permission` is set for this role.'''
        bit = Permission.get(permission).value
        return self.permissions & bit == bit

    def add_permission(self, permission: Union[Permission, int, str]):
        '''Set the bit of `permission`, unless it is set already.'''
        wanted = Permission.get(permission)
        if self.has_permission(wanted):
            return
        self.permissions += wanted.value

    def remove_permission(self, permission: Union[Permission, int, str]):
        '''Clear the bit of `permission`, if it is set.'''
        unwanted = Permission.get(permission)
        if not self.has_permission(unwanted):
            return
        self.permissions -= unwanted.value

    def reset_permissions(self):
        '''Remove all permissions from this role.'''
        self.permissions = 0

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this role. Permissions are
        listed by name. With `relationships` the users of the role are
        included, keyed by their hashid. `backrefs` adds nothing here.
        '''
        serialized = {
            'id': self.hashid,
            'default': self.default,
            'name': self.name
        }
        granted = []
        for perm in Permission:
            if self.has_permission(perm.value):
                granted.append(perm.name)
        serialized['permissions'] = granted
        if relationships:
            users = {}
            for user in self.users:
                users[user.hashid] = user.to_json_serializeable(relationships=True)
            serialized['users'] = users
        return serialized

    @staticmethod
    def insert_defaults():
        '''
        Create or update the predefined roles and commit. Permissions of
        existing roles are reset to the predefined ones and 'User' is
        flagged as the default role.
        '''
        default_role_name = 'User'
        predefined = {
            'User': [],
            'API user': [Permission.USE_API],
            'Contributor': [Permission.CONTRIBUTE],
            'Administrator': [
                Permission.ADMINISTRATE,
                Permission.CONTRIBUTE,
                Permission.USE_API
            ],
            'System user': []
        }
        for role_name in predefined:
            role = Role.query.filter_by(name=role_name).first()
            if role is None:
                role = Role(name=role_name)
            role.reset_permissions()
            for permission in predefined[role_name]:
                role.add_permission(permission)
            role.default = role.name == default_role_name
            db.session.add(role)
        db.session.commit()
2022-12-15 09:53:19 +01:00
2022-09-02 13:24:14 +02:00
class Token(HashidMixin, db.Model):
    '''
    A pair of access and refresh token that belongs to a user.

    HashidMixin is part of the bases because to_json_serializeable reads
    `self.hashid`; without it that call raised an AttributeError.
    '''
    __tablename__ = 'tokens'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    access_token = db.Column(db.String(64), index=True)
    access_expiration = db.Column(db.DateTime)
    refresh_token = db.Column(db.String(64), index=True)
    refresh_expiration = db.Column(db.DateTime)
    # Relationships
    user = db.relationship('User', back_populates='tokens')

    def expire(self):
        '''Set both expiration dates to the current UTC time.'''
        self.access_expiration = datetime.utcnow()
        self.refresh_expiration = datetime.utcnow()

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this token. Expiration dates are
        rendered in ISO format with a trailing 'Z', or as None if unset.
        With `backrefs` the owning user is included. `relationships` adds
        nothing here.
        '''
        json_serializeable = {
            'id': self.hashid,
            'access_token': self.access_token,
            'access_expiration': (
                None if self.access_expiration is None
                else f'{self.access_expiration.isoformat()}Z'
            ),
            'refresh_token': self.refresh_token,
            'refresh_expiration': (
                None if self.refresh_expiration is None
                else f'{self.refresh_expiration.isoformat()}Z'
            )
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable

    @staticmethod
    def clean():
        '''
        Remove any tokens that have been expired for more than a day.
        This method does not commit the session.
        '''
        yesterday = datetime.utcnow() - timedelta(days=1)
        Token.query.filter(Token.refresh_expiration < yesterday).delete()
2022-12-15 09:53:19 +01:00
2022-12-05 09:40:02 +01:00
class Avatar(HashidMixin, FileMixin, db.Model):
    '''
    The avatar file of a user.
    '''
    __tablename__ = 'avatars'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Relationships
    user = db.relationship('User', back_populates='avatar')

    @property
    def path(self):
        '''File system location of the avatar inside the user's directory.'''
        user_directory = self.user.path
        return os.path.join(user_directory, 'avatar')

    def delete(self):
        '''
        Remove the avatar file and mark the database object for deletion.
        A failing file removal is only logged.
        '''
        try:
            os.remove(self.path)
        except OSError as error:
            current_app.logger.error(error)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this avatar. With `backrefs`
        the owning user is included. `relationships` adds nothing here.
        '''
        serialized = {'id': self.hashid}
        serialized.update(self.file_mixin_to_json_serializeable())
        if backrefs:
            serialized['user'] = self.user.to_json_serializeable(backrefs=True)
        return serialized
2023-01-12 16:26:33 +01:00
2023-02-23 13:05:04 +01:00
class CorpusFollowerRole(HashidMixin, db.Model):
    '''
    A named set of corpus follower permissions. The permissions are kept
    as a bit field made of CorpusFollowerPermission values.
    '''
    __tablename__ = 'corpus_follower_roles'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Fields
    name = db.Column(db.String(64), unique=True)
    default = db.Column(db.Boolean, default=False, index=True)
    permissions = db.Column(db.Integer, default=0)
    # Relationships
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='role'
    )

    def __repr__(self):
        return f'<CorpusFollowerRole {self.name}>'

    def has_permission(self, permission: Union[CorpusFollowerPermission, int, str]):
        '''Tell whether the bit of `permission` is set for this role.'''
        perm = CorpusFollowerPermission.get(permission)
        return self.permissions & perm.value == perm.value

    def add_permission(self, permission: Union[CorpusFollowerPermission, int, str]):
        '''Set the bit of `permission`, unless it is set already.'''
        perm = CorpusFollowerPermission.get(permission)
        if not self.has_permission(perm):
            self.permissions += perm.value

    def remove_permission(self, permission: Union[CorpusFollowerPermission, int, str]):
        '''Clear the bit of `permission`, if it is set.'''
        perm = CorpusFollowerPermission.get(permission)
        if self.has_permission(perm):
            self.permissions -= perm.value

    def reset_permissions(self):
        '''Remove all permissions from this role.'''
        self.permissions = 0

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this role. Permissions are
        listed by name. With `relationships` the corpus follower
        associations using this role are included, keyed by their hashid.
        `backrefs` adds nothing here.
        '''
        json_serializeable = {
            'id': self.hashid,
            'default': self.default,
            'name': self.name,
            'permissions': [
                x.name
                for x in CorpusFollowerPermission
                if self.has_permission(x)
            ]
        }
        if backrefs:
            pass
        if relationships:
            # The relationship attribute is named in the plural. The
            # singular name used before does not exist and raised an
            # AttributeError. The output key is kept as it was.
            json_serializeable['corpus_follower_association'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.corpus_follower_associations
            }
        return json_serializeable

    @staticmethod
    def insert_defaults():
        '''
        Create or update the predefined roles and commit. Permissions of
        existing roles are reset to the predefined ones and 'Viewer' is
        flagged as the default role.
        '''
        roles = {
            'Anonymous': [],
            'Viewer': [
                CorpusFollowerPermission.VIEW
            ],
            'Contributor': [
                CorpusFollowerPermission.VIEW,
                CorpusFollowerPermission.MANAGE_FILES
            ],
            'Administrator': [
                CorpusFollowerPermission.VIEW,
                CorpusFollowerPermission.MANAGE_FILES,
                CorpusFollowerPermission.MANAGE_FOLLOWERS,
                CorpusFollowerPermission.MANAGE_CORPUS
            ]
        }
        default_role_name = 'Viewer'
        for role_name, permissions in roles.items():
            role = CorpusFollowerRole.query.filter_by(name=role_name).first()
            if role is None:
                role = CorpusFollowerRole(name=role_name)
            role.reset_permissions()
            for permission in permissions:
                role.add_permission(permission)
            role.default = role.name == default_role_name
            db.session.add(role)
        db.session.commit()
class CorpusFollowerAssociation(HashidMixin, db.Model):
    '''
    Association object that links a following user to a corpus and holds
    the role the follower has for that corpus.
    '''
    __tablename__ = 'corpus_follower_associations'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
    follower_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    role_id = db.Column(db.Integer, db.ForeignKey('corpus_follower_roles.id'))
    # Relationships
    corpus = db.relationship(
        'Corpus',
        back_populates='corpus_follower_associations'
    )
    follower = db.relationship(
        'User',
        back_populates='corpus_follower_associations'
    )
    role = db.relationship(
        'CorpusFollowerRole',
        back_populates='corpus_follower_associations'
    )

    def __init__(self, **kwargs):
        '''
        Create the association. If no `role` is passed, the role flagged
        as default is looked up and used.
        '''
        role_given = 'role' in kwargs
        if not role_given:
            default_role = CorpusFollowerRole.query.filter_by(default=True).first()
            kwargs['role'] = default_role
        super().__init__(**kwargs)

    def __repr__(self):
        follower, role, corpus = self.follower, self.role, self.corpus
        return f'<CorpusFollowerAssociation {follower!r} ~ {role!r} ~ {corpus!r}>'

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict with the corpus (including its
        backrefs), the follower and the role. Both keyword arguments are
        accepted but add nothing here.
        '''
        serialized = {'id': self.hashid}
        serialized['corpus'] = self.corpus.to_json_serializeable(backrefs=True)
        serialized['follower'] = self.follower.to_json_serializeable()
        serialized['role'] = self.role.to_json_serializeable()
        return serialized
2021-11-30 16:22:16 +01:00
class User(HashidMixin, UserMixin, db.Model):
    '''
    A user account: credentials, profile data, privacy settings and the
    relationships to the objects a user owns or follows.
    '''
    __tablename__ = 'users'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    role_id = db.Column(db.Integer, db.ForeignKey('roles.id'))
    # Fields
    email = db.Column(db.String(254), index=True, unique=True)
    username = db.Column(db.String(64), index=True, unique=True)
    username_pattern = re.compile(r'^[A-Za-zÄÖÜäöüß0-9_.]*$')
    password_hash = db.Column(db.String(128))
    confirmed = db.Column(db.Boolean, default=False)
    terms_of_use_accepted = db.Column(db.Boolean, default=False)
    member_since = db.Column(db.DateTime(), default=datetime.utcnow)
    setting_job_status_mail_notification_level = db.Column(
        IntEnumColumn(UserSettingJobStatusMailNotificationLevel),
        default=UserSettingJobStatusMailNotificationLevel.END
    )
    last_seen = db.Column(db.DateTime())
    full_name = db.Column(db.String(64))
    about_me = db.Column(db.String(256))
    location = db.Column(db.String(64))
    website = db.Column(db.String(128))
    organization = db.Column(db.String(128))
    is_public = db.Column(db.Boolean, default=False)
    # Bit field made of ProfilePrivacySettings values
    profile_privacy_settings = db.Column(db.Integer(), default=0)
    # Relationships
    avatar = db.relationship(
        'Avatar',
        back_populates='user',
        cascade='all, delete-orphan',
        uselist=False
    )
    corpora = db.relationship(
        'Corpus',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='follower',
        cascade='all, delete-orphan'
    )
    followed_corpora = association_proxy(
        'corpus_follower_associations',
        'corpus',
        creator=lambda c: CorpusFollowerAssociation(corpus=c)
    )
    jobs = db.relationship(
        'Job',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    role = db.relationship(
        'Role',
        back_populates='users'
    )
    spacy_nlp_pipeline_models = db.relationship(
        'SpaCyNLPPipelineModel',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    tesseract_ocr_pipeline_models = db.relationship(
        'TesseractOCRPipelineModel',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    tokens = db.relationship(
        'Token',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __init__(self, **kwargs):
        '''
        Create the user. If no `role` is passed, the 'Administrator' role
        is used when the email equals the configured NOPAQUE_ADMIN,
        otherwise the role flagged as default.
        '''
        if 'role' not in kwargs:
            # .get() keeps users without an email from raising a KeyError
            kwargs['role'] = (
                Role.query.filter_by(name='Administrator').first()
                if kwargs.get('email') == current_app.config['NOPAQUE_ADMIN']
                else Role.query.filter_by(default=True).first()
            )
        super().__init__(**kwargs)

    def __repr__(self):
        return f'<User {self.username}>'

    @property
    def jsonpatch_path(self):
        '''Path of this user in JSON patch documents.'''
        return f'/users/{self.hashid}'

    @property
    def password(self):
        '''Write only attribute, reading raises an AttributeError.'''
        raise AttributeError('password is not a readable attribute')

    @password.setter
    def password(self, password):
        self.password_hash = generate_password_hash(password)

    @property
    def path(self):
        '''Directory of this user below the configured NOPAQUE_DATA_DIR.'''
        return os.path.join(
            current_app.config.get('NOPAQUE_DATA_DIR'), 'users', str(self.id))

    @staticmethod
    def create(**kwargs):
        '''
        Create a user, flush it to obtain its id and create the user's
        directory tree. If a directory cannot be created, the error is
        logged, the session is rolled back and the error is re-raised.
        '''
        user = User(**kwargs)
        db.session.add(user)
        db.session.flush(objects=[user])
        db.session.refresh(user)
        try:
            os.mkdir(user.path)
            os.mkdir(os.path.join(user.path, 'spacy_nlp_pipeline_models'))
            os.mkdir(os.path.join(user.path, 'tesseract_ocr_pipeline_models'))
            os.mkdir(os.path.join(user.path, 'corpora'))
            os.mkdir(os.path.join(user.path, 'jobs'))
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        return user

    @staticmethod
    def insert_defaults():
        '''
        Make sure the 'nopaque' user exists and has the 'System user'
        role, then commit.
        '''
        nopaque_user = User.query.filter_by(username='nopaque').first()
        system_user_role = Role.query.filter_by(name='System user').first()
        if nopaque_user is None:
            nopaque_user = User.create(
                username='nopaque',
                role=system_user_role
            )
            db.session.add(nopaque_user)
        elif nopaque_user.role != system_user_role:
            nopaque_user.role = system_user_role
        db.session.commit()

    @staticmethod
    def reset_password(token, new_password):
        '''
        Set `new_password` for the user named in a reset password token.
        Returns False if the token is invalid, has another purpose or
        names an unknown user, otherwise True. The session is not
        committed here.
        '''
        try:
            payload = jwt.decode(
                token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'User.reset_password':
            return False
        user_hashid = payload.get('sub')
        user_id = hashids.decode(user_hashid)
        user = User.query.get(user_id)
        if user is None:
            return False
        user.password = new_password
        db.session.add(user)
        return True

    @staticmethod
    def verify_access_token(access_token, refresh_token=None):
        '''
        Return the user that owns a not yet expired access token, or None.
        The user's last seen date is updated and committed. Users with the
        'System user' role are never returned. `refresh_token` is accepted
        but not used.
        '''
        token = Token.query.filter(Token.access_token == access_token).first()
        if token is not None:
            if token.access_expiration > datetime.utcnow():
                token.user.ping()
                db.session.commit()
                if token.user.role.name != 'System user':
                    return token.user

    @staticmethod
    def verify_refresh_token(refresh_token, access_token):
        '''
        Return the token matching both values if its refresh part has not
        expired yet, otherwise None.
        '''
        token = Token.query.filter((Token.refresh_token == refresh_token) & (Token.access_token == access_token)).first()
        if token is not None:
            if token.refresh_expiration > datetime.utcnow():
                return token
            # someone tried to refresh with an expired token
            # revoke all tokens from this user as a precaution
            token.user.revoke_auth_tokens()
            db.session.commit()

    def can(self, permission):
        '''Tell whether the user's role grants `permission`.'''
        return self.role is not None and self.role.has_permission(permission)

    def confirm(self, confirmation_token):
        '''
        Mark the user as confirmed if `confirmation_token` is a valid
        confirmation token issued for this user. Returns True on success,
        otherwise False. The session is not committed here.
        '''
        try:
            payload = jwt.decode(
                confirmation_token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'user.confirm':
            return False
        if payload.get('sub') != self.hashid:
            return False
        self.confirmed = True
        db.session.add(self)
        return True

    def delete(self):
        '''
        Remove the user's directory (errors are ignored) and mark the
        database object for deletion.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def generate_auth_token(self):
        '''
        Return a new Token for this user. The access part expires after
        15 minutes, the refresh part after 7 days.
        '''
        return Token(
            access_token=secrets.token_urlsafe(),
            access_expiration=datetime.utcnow() + timedelta(minutes=15),
            refresh_token=secrets.token_urlsafe(),
            refresh_expiration=datetime.utcnow() + timedelta(days=7),
            user=self
        )

    def generate_confirm_token(self, expiration=3600):
        '''
        Return a signed confirmation token for this user that is valid
        for `expiration` seconds.
        '''
        now = datetime.utcnow()
        payload = {
            'exp': now + timedelta(seconds=expiration),
            'iat': now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'user.confirm',
            'sub': self.hashid
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def generate_reset_password_token(self, expiration=3600):
        '''
        Return a signed reset password token for this user that is valid
        for `expiration` seconds.
        '''
        now = datetime.utcnow()
        payload = {
            'exp': now + timedelta(seconds=expiration),
            'iat': now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'User.reset_password',
            'sub': self.hashid
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def is_administrator(self):
        '''Tell whether the user has the ADMINISTRATE permission.'''
        return self.can(Permission.ADMINISTRATE)

    def ping(self):
        '''Set the last seen date to the current UTC time.'''
        self.last_seen = datetime.utcnow()

    def revoke_auth_tokens(self):
        '''Mark all tokens of this user for deletion.'''
        for token in self.tokens:
            db.session.delete(token)

    def verify_password(self, password):
        '''
        Tell whether `password` matches the stored hash. Users with the
        'System user' role and users without a stored hash never verify.
        '''
        if self.role is not None and self.role.name == 'System user':
            return False
        if self.password_hash is None:
            # No password set, e.g. an account created without one
            return False
        return check_password_hash(self.password_hash, password)

    #region Profile Privacy settings
    def has_profile_privacy_setting(self, setting):
        '''Tell whether the bit of `setting` is set for this user.'''
        s = ProfilePrivacySettings.get(setting)
        return self.profile_privacy_settings & s.value == s.value

    def add_profile_privacy_setting(self, setting):
        '''Set the bit of `setting`, unless it is set already.'''
        s = ProfilePrivacySettings.get(setting)
        if not self.has_profile_privacy_setting(s):
            self.profile_privacy_settings += s.value

    def remove_profile_privacy_setting(self, setting):
        '''Clear the bit of `setting`, if it is set.'''
        s = ProfilePrivacySettings.get(setting)
        if self.has_profile_privacy_setting(s):
            self.profile_privacy_settings -= s.value

    def reset_profile_privacy_settings(self):
        '''Clear all profile privacy settings.'''
        self.profile_privacy_settings = 0
    #endregion Profile Privacy settings

    def follow_corpus(self, corpus, role=None):
        '''
        Follow `corpus` with `role`, or with the role flagged as default
        if none is passed. If the corpus is followed already, only the
        role of the existing association is updated.
        '''
        if role is None:
            cfr = CorpusFollowerRole.query.filter_by(default=True).first()
        else:
            cfr = role
        if self.is_following_corpus(corpus):
            cfa = CorpusFollowerAssociation.query.filter_by(corpus=corpus, follower=self).first()
            if cfa.role != cfr:
                cfa.role = cfr
        else:
            cfa = CorpusFollowerAssociation(corpus=corpus, role=cfr, follower=self)
            db.session.add(cfa)

    def unfollow_corpus(self, corpus):
        '''Stop following `corpus`. Does nothing if it is not followed.'''
        if not self.is_following_corpus(corpus):
            return
        self.followed_corpora.remove(corpus)

    def is_following_corpus(self, corpus):
        '''Tell whether `corpus` is among the followed corpora.'''
        return corpus in self.followed_corpora

    def generate_follow_corpus_token(self, corpus_hashid, role_name, expiration=7):
        '''
        Return a signed token that lets its holder follow the corpus
        identified by `corpus_hashid` with the role named `role_name`.

        `expiration` is the validity period in days.
        NOTE(review): the unit is assumed from the default of 7 - confirm.
        '''
        now = datetime.utcnow()
        payload = {
            # 'exp' has to be a point in time. The bare number used before
            # was read as 7 seconds after the Unix epoch, so every token
            # was expired from the start.
            'exp': now + timedelta(days=expiration),
            'iat': now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'User.follow_corpus',
            'role_name': role_name,
            'sub': corpus_hashid
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def follow_corpus_by_token(self, token):
        '''
        Follow the corpus named in a follow corpus token with the role
        named in it. Returns False if the token is invalid, has another
        purpose or names an unknown role, otherwise True. An unknown
        corpus aborts via get_or_404.
        '''
        try:
            payload = jwt.decode(
                token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'role_name', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'User.follow_corpus':
            return False
        corpus_hashid = payload.get('sub')
        corpus_id = hashids.decode(corpus_hashid)
        corpus = Corpus.query.get_or_404(corpus_id)
        # get_or_404 aborts for a missing corpus, this check is only a
        # defensive guard
        if corpus is None:
            return False
        role_name = payload.get('role_name')
        role = CorpusFollowerRole.query.filter_by(name=role_name).first()
        if role is None:
            return False
        self.follow_corpus(corpus, role)
        return True

    def to_json_serializeable(self, backrefs=False, relationships=False, filter_by_privacy_settings=False):
        '''
        Return a JSON serializable dict of this user.

        With `backrefs` the role is included. With `relationships` the
        corpus follower associations, corpora, jobs and pipeline models
        are included, each keyed by hashid. With
        `filter_by_privacy_settings` the email, last seen and member since
        entries are removed unless the matching privacy setting is set.
        '''
        json_serializeable = {
            'id': self.hashid,
            'confirmed': self.confirmed,
            'avatar': url_for('users.user_avatar', user_id=self.id),
            'email': self.email,
            'last_seen': (
                None if self.last_seen is None
                else f'{self.last_seen.isoformat()}Z'
            ),
            'member_since': f'{self.member_since.isoformat()}Z',
            'username': self.username,
            'full_name': self.full_name,
            'about_me': self.about_me,
            'website': self.website,
            'location': self.location,
            'organization': self.organization,
            'job_status_mail_notification_level': \
                self.setting_job_status_mail_notification_level.name,
            'profile_privacy_settings': {
                'is_public': self.is_public,
                'show_email': self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_EMAIL),
                'show_last_seen': self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_LAST_SEEN),
                'show_member_since': self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_MEMBER_SINCE)
            }
        }
        if backrefs:
            json_serializeable['role'] = \
                self.role.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['corpus_follower_associations'] = {
                x.hashid: x.to_json_serializeable()
                for x in self.corpus_follower_associations
            }
            json_serializeable['corpora'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.corpora
            }
            json_serializeable['jobs'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.jobs
            }
            json_serializeable['tesseract_ocr_pipeline_models'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.tesseract_ocr_pipeline_models
            }
            json_serializeable['spacy_nlp_pipeline_models'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.spacy_nlp_pipeline_models
            }
        if filter_by_privacy_settings:
            if not self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_EMAIL):
                json_serializeable.pop('email')
            if not self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_LAST_SEEN):
                json_serializeable.pop('last_seen')
            if not self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_MEMBER_SINCE):
                json_serializeable.pop('member_since')
        return json_serializeable
2021-11-30 16:22:16 +01:00
2022-12-15 09:53:19 +01:00
2022-10-12 10:23:05 +02:00
class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
    '''
    Class to define Tesseract OCR pipeline models.

    Each row holds the metadata of one model file that belongs to a user.
    The file itself is stored at `path`.
    '''
    __tablename__ = 'tesseract_ocr_pipeline_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    title = db.Column(db.String(64))
    description = db.Column(db.String(255))
    version = db.Column(db.String(16))
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    user = db.relationship('User', back_populates='tesseract_ocr_pipeline_models')

    @property
    def path(self):
        ''' File system path of the model file (inside the owner's directory). '''
        return os.path.join(
            self.user.path,
            'tesseract_ocr_pipeline_models',
            str(self.id)
        )

    @property
    def jsonpatch_path(self):
        ''' JSON Patch path of this model below the owner's JSON Patch path. '''
        return f'{self.user.jsonpatch_path}/tesseract_ocr_pipeline_models/{self.hashid}'

    @property
    def url(self):
        ''' URL of the contributions view for this model. '''
        return url_for(
            'contributions.tesseract_ocr_pipeline_model',
            tesseract_ocr_pipeline_model_id=self.id
        )

    @property
    def user_hashid(self):
        ''' Hashid of the owning user. '''
        return self.user.hashid

    @staticmethod
    def insert_defaults():
        '''
        Insert or refresh the models listed in
        TesseractOCRPipelineModel.defaults.yml (located next to this module).

        A model that already exists (same title and version) only gets its
        metadata refreshed. Every other entry is inserted for the user named
        "nopaque" and its file is downloaded from the entry's url.

        Raises requests.HTTPError if a download is answered with an HTTP
        error status.
        '''
        nopaque_user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'TesseractOCRPipelineModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            defaults = yaml.safe_load(f)
        # Keys that are copied verbatim from a defaults entry to the model
        metadata_keys = (
            'compatible_service_versions',
            'description',
            'publisher',
            'publisher_url',
            'publishing_url',
            'publishing_year',
            'title',
            'version'
        )
        for m in defaults:
            model = TesseractOCRPipelineModel.query.filter_by(
                title=m['title'],
                version=m['version']
            ).first()
            if model is not None:
                # Known model: refresh the metadata, keep the existing file
                for key in metadata_keys:
                    setattr(model, key, m[key])
                model.is_public = True
                continue
            model = TesseractOCRPipelineModel(
                is_public=True,
                user=nopaque_user,
                **{key: m[key] for key in metadata_keys}
            )
            db.session.add(model)
            # Flush and refresh so that model.id (used for filename and path)
            # is available before the download starts
            db.session.flush(objects=[model])
            db.session.refresh(model)
            model.filename = f'{model.id}.traineddata'
            with requests.get(m['url'], stream=True) as r:
                # Fail loudly instead of storing an HTTP error page as model
                r.raise_for_status()
                # The header is optional; without it the total is unknown
                content_length = r.headers.get('Content-Length')
                pbar = tqdm(
                    desc=f'{model.title} ({model.filename})',
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    total=(
                        None if content_length is None
                        else int(content_length)
                    )
                )
                pbar.clear()
                try:
                    with open(model.path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                pbar.update(len(chunk))
                                f.write(chunk)
                finally:
                    pbar.close()
        db.session.commit()

    def delete(self):
        '''
        Remove the model file and mark the model for deletion in the session.

        A failing file removal is only logged; the database deletion is
        scheduled anyway. The session is not committed here.
        '''
        try:
            os.remove(self.path)
        except OSError as e:
            current_app.logger.error(e)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the model.

        backrefs: also include the owning user under the key "user".
        relationships: accepted for a uniform signature; this model has no
            relationships to serialize.
        '''
        json_serializeable = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'is_public': self.is_public,
            'title': self.title,
            'version': self.version,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        return json_serializeable
2022-10-13 15:05:54 +02:00
class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
    '''
    Class to define spaCy NLP pipeline models.

    Each row holds the metadata of one model file that belongs to a user.
    The file itself is stored at `path`.
    '''
    __tablename__ = 'spacy_nlp_pipeline_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    title = db.Column(db.String(64))
    description = db.Column(db.String(255))
    version = db.Column(db.String(16))
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    pipeline_name = db.Column(db.String(64))
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    user = db.relationship('User', back_populates='spacy_nlp_pipeline_models')

    @property
    def path(self):
        ''' File system path of the model file (inside the owner's directory). '''
        return os.path.join(
            self.user.path,
            'spacy_nlp_pipeline_models',
            str(self.id)
        )

    @property
    def jsonpatch_path(self):
        ''' JSON Patch path of this model below the owner's JSON Patch path. '''
        return f'{self.user.jsonpatch_path}/spacy_nlp_pipeline_models/{self.hashid}'

    @property
    def url(self):
        ''' URL of the contributions view for this model. '''
        return url_for(
            'contributions.spacy_nlp_pipeline_model',
            spacy_nlp_pipeline_model_id=self.id
        )

    @property
    def user_hashid(self):
        ''' Hashid of the owning user. '''
        return self.user.hashid

    @staticmethod
    def insert_defaults():
        '''
        Insert or refresh the models listed in
        SpaCyNLPPipelineModel.defaults.yml (located next to this module).

        A model that already exists (same title and version) only gets its
        metadata refreshed. Every other entry is inserted for the user named
        "nopaque" and its file is downloaded from the entry's url.

        Raises requests.HTTPError if a download is answered with an HTTP
        error status.
        '''
        nopaque_user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'SpaCyNLPPipelineModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            defaults = yaml.safe_load(f)
        # Keys that are copied verbatim from a defaults entry to the model
        metadata_keys = (
            'compatible_service_versions',
            'description',
            'publisher',
            'publisher_url',
            'publishing_url',
            'publishing_year',
            'title',
            'version',
            'pipeline_name'
        )
        for m in defaults:
            model = SpaCyNLPPipelineModel.query.filter_by(
                title=m['title'],
                version=m['version']
            ).first()
            if model is not None:
                # Known model: refresh the metadata, keep the existing file
                for key in metadata_keys:
                    setattr(model, key, m[key])
                model.is_public = True
                continue
            model = SpaCyNLPPipelineModel(
                is_public=True,
                user=nopaque_user,
                **{key: m[key] for key in metadata_keys}
            )
            db.session.add(model)
            # Flush and refresh so that model.id (used for the path) is
            # available before the download starts
            db.session.flush(objects=[model])
            db.session.refresh(model)
            # The filename is the last path segment of the download url
            model.filename = m['url'].split('/')[-1]
            with requests.get(m['url'], stream=True) as r:
                # Fail loudly instead of storing an HTTP error page as model
                r.raise_for_status()
                # The header is optional; without it the total is unknown
                content_length = r.headers.get('Content-Length')
                pbar = tqdm(
                    desc=f'{model.title} ({model.filename})',
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    total=(
                        None if content_length is None
                        else int(content_length)
                    )
                )
                pbar.clear()
                try:
                    with open(model.path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                pbar.update(len(chunk))
                                f.write(chunk)
                finally:
                    pbar.close()
        db.session.commit()

    def delete(self):
        '''
        Remove the model file and mark the model for deletion in the session.

        A failing file removal is only logged; the database deletion is
        scheduled anyway. The session is not committed here.
        '''
        try:
            os.remove(self.path)
        except OSError as e:
            current_app.logger.error(e)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the model.

        backrefs: also include the owning user under the key "user".
        relationships: accepted for a uniform signature; this model has no
            relationships to serialize.
        '''
        json_serializeable = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'pipeline_name': self.pipeline_name,
            'is_public': self.is_public,
            'title': self.title,
            'version': self.version,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        return json_serializeable
2022-09-02 13:07:30 +02:00
2021-11-30 16:22:16 +01:00
class JobInput(FileMixin, HashidMixin, db.Model):
    '''
    Class to define input files of a job.
    '''
    __tablename__ = 'job_inputs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Relationships
    job = db.relationship('Job', back_populates='inputs')

    def __repr__(self):
        return f'<JobInput {self.filename}>'

    @property
    def content_url(self):
        ''' URL of the download view for this input file. '''
        endpoint = 'jobs.download_job_input'
        return url_for(endpoint, job_id=self.job.id, job_input_id=self.id)

    @property
    def jsonpatch_path(self):
        ''' JSON Patch path of this input below the job's JSON Patch path. '''
        return f'{self.job.jsonpatch_path}/inputs/{self.hashid}'

    @property
    def path(self):
        ''' File system path of the input file (inside the job directory). '''
        inputs_dir = os.path.join(self.job.path, 'inputs')
        return os.path.join(inputs_dir, str(self.id))

    @property
    def url(self):
        ''' URL of the job view, anchored at this input. '''
        anchor = f'job-{self.job.hashid}-input-{self.hashid}'
        return url_for('jobs.job', job_id=self.job_id, _anchor=anchor)

    @property
    def user_hashid(self):
        ''' Hashid of the user that owns the job. '''
        return self.job.user.hashid

    @property
    def user_id(self):
        ''' Id of the user that owns the job. '''
        return self.job.user.id

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the input.

        backrefs: also include the job under the key "job".
        relationships: accepted for a uniform signature; nothing to add.
        '''
        serialized = {'id': self.hashid}
        serialized.update(self.file_mixin_to_json_serializeable())
        if backrefs:
            serialized['job'] = self.job.to_json_serializeable(backrefs=True)
        return serialized
2022-09-02 13:07:30 +02:00
2019-10-16 16:52:05 +02:00
2021-11-30 16:22:16 +01:00
class JobResult(FileMixin, HashidMixin, db.Model):
    '''
    Class to define result files of a job.
    '''
    __tablename__ = 'job_results'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Fields
    description = db.Column(db.String(255))
    # Relationships
    job = db.relationship('Job', back_populates='results')

    def __repr__(self):
        return f'<JobResult {self.filename}>'

    @property
    def download_url(self):
        ''' URL of the download view for this result file. '''
        endpoint = 'jobs.download_job_result'
        return url_for(endpoint, job_id=self.job_id, job_result_id=self.id)

    @property
    def jsonpatch_path(self):
        ''' JSON Patch path of this result below the job's JSON Patch path. '''
        return f'{self.job.jsonpatch_path}/results/{self.hashid}'

    @property
    def path(self):
        ''' File system path of the result file (inside the job directory). '''
        results_dir = os.path.join(self.job.path, 'results')
        return os.path.join(results_dir, str(self.id))

    @property
    def url(self):
        ''' URL of the job view, anchored at this result. '''
        anchor = f'job-{self.job.hashid}-result-{self.hashid}'
        return url_for('jobs.job', job_id=self.job_id, _anchor=anchor)

    @property
    def user_hashid(self):
        ''' Hashid of the user that owns the job. '''
        return self.job.user.hashid

    @property
    def user_id(self):
        ''' Id of the user that owns the job. '''
        return self.job.user.id

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the result.

        backrefs: also include the job under the key "job".
        relationships: only forwarded to the file mixin serialization.
        '''
        serialized = {
            'id': self.hashid,
            'description': self.description
        }
        serialized.update(
            self.file_mixin_to_json_serializeable(
                backrefs=backrefs,
                relationships=relationships
            )
        )
        if backrefs:
            serialized['job'] = self.job.to_json_serializeable(backrefs=True)
        return serialized
2022-09-02 13:07:30 +02:00
2019-10-17 13:26:20 +02:00
2021-11-30 16:22:16 +01:00
class Job(HashidMixin, db.Model):
    '''
    Class to define a job, i.e. one run of a service for a user, together
    with its input and result files.
    '''
    __tablename__ = 'jobs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    end_date = db.Column(db.DateTime())
    service = db.Column(db.String(64))
    service_args = db.Column(ContainerColumn(dict, 255))
    service_version = db.Column(db.String(16))
    status = db.Column(
        IntEnumColumn(JobStatus),
        default=JobStatus.INITIALIZING
    )
    title = db.Column(db.String(32))
    # Relationships
    inputs = db.relationship(
        'JobInput',
        back_populates='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    results = db.relationship(
        'JobResult',
        back_populates='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    user = db.relationship('User', back_populates='jobs')

    def __repr__(self):
        return f'<Job {self.title}>'

    @property
    def jsonpatch_path(self):
        ''' JSON Patch path of this job below the owner's JSON Patch path. '''
        return f'{self.user.jsonpatch_path}/jobs/{self.hashid}'

    @property
    def path(self):
        ''' File system path of the job directory (inside the owner's directory). '''
        jobs_dir = os.path.join(self.user.path, 'jobs')
        return os.path.join(jobs_dir, str(self.id))

    @property
    def url(self):
        ''' URL of the job view. '''
        return url_for('jobs.job', job_id=self.id)

    @property
    def user_hashid(self):
        ''' Hashid of the owning user. '''
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        '''
        Create a job from kwargs, add it to the session and create its
        directory layout.

        The session is flushed (not committed) to obtain the job id. If a
        directory cannot be created, the error is logged, the session is
        rolled back and the OSError is re-raised.
        '''
        new_job = Job(**kwargs)
        db.session.add(new_job)
        db.session.flush(objects=[new_job])
        db.session.refresh(new_job)
        try:
            os.mkdir(new_job.path)
            for subdir in ('inputs', 'pipeline_data', 'results'):
                os.mkdir(os.path.join(new_job.path, subdir))
        except OSError as error:
            current_app.logger.error(error)
            db.session.rollback()
            raise error
        return new_job

    def delete(self):
        '''
        Delete the job directory and mark the job for deletion.

        A job that is neither completed nor failed is first set to CANCELING;
        the method then polls once per second until the status reads
        CANCELED. That status is not set in this method (the inline comment
        attributes such changes to the daemon).
        '''
        finished_states = [JobStatus.COMPLETED, JobStatus.FAILED]
        if self.status not in finished_states:
            self.status = JobStatus.CANCELING
            db.session.commit()
            while self.status != JobStatus.CANCELED:
                # In case the daemon handled a job in any way
                if self.status != JobStatus.CANCELING:
                    self.status = JobStatus.CANCELING
                    db.session.commit()
                sleep(1)
                db.session.refresh(self)
        try:
            shutil.rmtree(self.path)
        except OSError as error:
            current_app.logger.error(error)
            db.session.rollback()
            raise error
        db.session.delete(self)

    def restart(self):
        '''
        Restart a failed job.

        Removes the "results" and "pyflow.data" directories, schedules all
        results for deletion and resets the job to SUBMITTED. Raises an
        Exception if the job status is not FAILED.
        '''
        if self.status != JobStatus.FAILED:
            raise Exception('Job status is not "failed"')
        for dirname in ('results', 'pyflow.data'):
            shutil.rmtree(os.path.join(self.path, dirname), ignore_errors=True)
        for job_result in self.results:
            db.session.delete(job_result)
        self.end_date = None
        self.status = JobStatus.SUBMITTED

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the job.

        backrefs: also include the owning user under the key "user".
        relationships: also include the inputs and results, keyed by hashid.
        '''
        if self.end_date is None:
            end_date = None
        else:
            end_date = f'{self.end_date.isoformat()}Z'
        serialized = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'end_date': end_date,
            'service': self.service,
            'service_args': self.service_args,
            'service_version': self.service_version,
            'status': self.status.name,
            'title': self.title
        }
        if backrefs:
            serialized['user'] = self.user.to_json_serializeable(backrefs=True)
        if relationships:
            for key, children in (
                ('inputs', self.inputs),
                ('results', self.results)
            ):
                serialized[key] = {
                    child.hashid: child.to_json_serializeable(relationships=True)
                    for child in children
                }
        return serialized
2019-08-05 16:45:38 +02:00
2021-11-30 16:22:16 +01:00
class CorpusFile(FileMixin, HashidMixin, db.Model):
    '''
    Class to define a file of a corpus together with its bibliographic
    metadata.
    '''
    __tablename__ = 'corpus_files'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
    # Fields
    author = db.Column(db.String(255))
    description = db.Column(db.String(255))
    publishing_year = db.Column(db.Integer)
    title = db.Column(db.String(255))
    address = db.Column(db.String(255))
    booktitle = db.Column(db.String(255))
    chapter = db.Column(db.String(255))
    editor = db.Column(db.String(255))
    institution = db.Column(db.String(255))
    journal = db.Column(db.String(255))
    pages = db.Column(db.String(255))
    publisher = db.Column(db.String(255))
    school = db.Column(db.String(255))
    # Relationships
    corpus = db.relationship('Corpus', back_populates='files')

    @property
    def download_url(self):
        ''' URL of the download view for this corpus file. '''
        endpoint = 'corpora.download_corpus_file'
        return url_for(
            endpoint,
            corpus_id=self.corpus_id,
            corpus_file_id=self.id
        )

    @property
    def jsonpatch_path(self):
        ''' JSON Patch path of this file below the corpus' JSON Patch path. '''
        return f'{self.corpus.jsonpatch_path}/files/{self.hashid}'

    @property
    def path(self):
        ''' File system path of the file (inside the corpus directory). '''
        files_dir = os.path.join(self.corpus.path, 'files')
        return os.path.join(files_dir, str(self.id))

    @property
    def url(self):
        ''' URL of the corpus file view. '''
        endpoint = 'corpora.corpus_file'
        return url_for(
            endpoint,
            corpus_id=self.corpus_id,
            corpus_file_id=self.id
        )

    @property
    def user_hashid(self):
        ''' Hashid of the user that owns the corpus. '''
        return self.corpus.user.hashid

    @property
    def user_id(self):
        ''' Id of the user that owns the corpus. '''
        return self.corpus.user_id

    def delete(self):
        '''
        Remove the file from disk, mark it for deletion in the session and
        reset the corpus status to UNPREPARED.

        A failing file removal is only logged.
        '''
        try:
            os.remove(self.path)
        except OSError as error:
            current_app.logger.error(error)
        db.session.delete(self)
        self.corpus.status = CorpusStatus.UNPREPARED

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the corpus file.

        backrefs: also include the corpus under the key "corpus".
        relationships: only forwarded to the file mixin serialization.
        '''
        serialized = {
            'id': self.hashid,
            'address': self.address,
            'author': self.author,
            'description': self.description,
            'booktitle': self.booktitle,
            'chapter': self.chapter,
            'editor': self.editor,
            'institution': self.institution,
            'journal': self.journal,
            'pages': self.pages,
            'publisher': self.publisher,
            'publishing_year': self.publishing_year,
            'school': self.school,
            'title': self.title
        }
        serialized.update(
            self.file_mixin_to_json_serializeable(
                backrefs=backrefs,
                relationships=relationships
            )
        )
        if backrefs:
            serialized['corpus'] = \
                self.corpus.to_json_serializeable(backrefs=True)
        return serialized
2021-11-30 16:22:16 +01:00
2022-12-15 09:53:19 +01:00
2021-11-30 16:22:16 +01:00
class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.

    A corpus belongs to a user, consists of corpus files and can be followed
    by other users via corpus follower associations.
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    status = db.Column(
        IntEnumColumn(CorpusStatus),
        default=CorpusStatus.UNPREPARED
    )
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    files = db.relationship(
        'CorpusFile',
        back_populates='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='corpus',
        cascade='all, delete-orphan'
    )
    # Appending a user to followers creates the association object
    followers = association_proxy(
        'corpus_follower_associations',
        'follower',
        creator=lambda u: CorpusFollowerAssociation(follower=u)
    )
    user = db.relationship('User', back_populates='corpora')
    # "static" attributes
    max_num_tokens = 2_147_483_647

    def __repr__(self):
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        ''' URL of the analysis view of this corpus. '''
        return url_for('corpora.analysis', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        ''' JSON Patch path of this corpus below the owner's JSON Patch path. '''
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self):
        ''' File system path of the corpus directory (inside the owner's directory). '''
        return os.path.join(self.user.path, 'corpora', str(self.id))

    @property
    def url(self):
        ''' URL of the corpus view. '''
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        ''' Hashid of the owning user. '''
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        '''
        Create a corpus from kwargs, add it to the session and create its
        directory layout.

        The session is flushed (not committed) to obtain the corpus id. If a
        directory cannot be created, the error is logged, the session is
        rolled back and the OSError is re-raised.
        '''
        corpus = Corpus(**kwargs)
        db.session.add(corpus)
        db.session.flush(objects=[corpus])
        db.session.refresh(corpus)
        try:
            os.mkdir(corpus.path)
            os.mkdir(os.path.join(corpus.path, 'files'))
            os.mkdir(os.path.join(corpus.path, 'cwb'))
            os.mkdir(os.path.join(corpus.path, 'cwb', 'data'))
            os.mkdir(os.path.join(corpus.path, 'cwb', 'registry'))
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        return corpus

    def build(self):
        '''
        (Re)create the "cwb" build directory and write all corpus files into
        a single "corpus.vrt" file.

        Every corpus file is normalized with normalize_vrt_file, its root
        element gets the bibliographic metadata as attributes and is appended
        to one <corpus> element. On success the status is set to SUBMITTED.
        If normalizing a file fails, the error is logged, the status is set
        to FAILED and the build stops.
        '''
        build_dir = os.path.join(self.path, 'cwb')
        shutil.rmtree(build_dir, ignore_errors=True)
        os.mkdir(build_dir)
        os.mkdir(os.path.join(build_dir, 'data'))
        os.mkdir(os.path.join(build_dir, 'registry'))
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            normalized_vrt_path = os.path.join(
                build_dir,
                f'{corpus_file.id}.norm.vrt'
            )
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except Exception as e:
                # Exception (not a bare except) so that KeyboardInterrupt and
                # SystemExit still propagate; log the cause of the failure
                current_app.logger.error(e)
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            # NOTE(review): author and title are set without a fallback,
            # presumably because they are always provided - confirm
            text_element.set('author', corpus_file.author)
            text_element.set('title', corpus_file.title)
            text_element.set(
                'publishing_year',
                f'{corpus_file.publishing_year}'
            )
            text_element.set('address', corpus_file.address or 'NULL')
            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
            text_element.set('chapter', corpus_file.chapter or 'NULL')
            text_element.set('editor', corpus_file.editor or 'NULL')
            text_element.set('institution', corpus_file.institution or 'NULL')
            text_element.set('journal', corpus_file.journal or 'NULL')
            # Apply the fallback before formatting: f'{None}' is the truthy
            # string 'None', so formatting first never yields 'NULL'
            text_element.set('pages', f'{corpus_file.pages or "NULL"}')
            text_element.set('publisher', corpus_file.publisher or 'NULL')
            text_element.set('school', corpus_file.school or 'NULL')
            text_element.tail = '\n'
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            os.path.join(build_dir, 'corpus.vrt'),
            encoding='utf-8'
        )
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        '''
        Remove the corpus directory (errors are ignored) and mark the corpus
        for deletion in the session.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the corpus.

        backrefs: also include the owning user under the key "user".
        relationships: also include the corpus follower associations and the
            files, keyed by hashid.
        '''
        json_serializeable = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'title': self.title,
            'is_public': self.is_public
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['corpus_follower_associations'] = {
                x.hashid: x.to_json_serializeable()
                for x in self.corpus_follower_associations
            }
            json_serializeable['files'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.files
            }
        return json_serializeable
# endregion models
##############################################################################
# event_handlers #
##############################################################################
# region event_handlers
@db.event.listens_for(Corpus, 'after_delete')
@db.event.listens_for(CorpusFile, 'after_delete')
@db.event.listens_for(Job, 'after_delete')
@db.event.listens_for(JobInput, 'after_delete')
@db.event.listens_for(JobResult, 'after_delete')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_delete')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_delete')
def resource_after_delete(mapper, connection, resource):
    '''
    Emit a JSON Patch "remove" operation for a deleted resource to the
    Socket.IO room of the user that owns it.
    '''
    remove_operation = {'op': 'remove', 'path': resource.jsonpatch_path}
    socketio.emit(
        'PATCH',
        [remove_operation],
        room=f'/users/{resource.user_hashid}'
    )
@db.event.listens_for(CorpusFollowerAssociation, 'after_delete')
def cfa_after_delete_handler(mapper, connection, cfa):
    '''
    Emit a JSON Patch "remove" operation for a deleted corpus follower
    association to the Socket.IO room of the corpus owner.
    '''
    corpus = cfa.corpus
    jsonpatch_path = f'/users/{corpus.user.hashid}/corpora/{corpus.hashid}/corpus_follower_associations/{cfa.hashid}'
    remove_operation = {'op': 'remove', 'path': jsonpatch_path}
    socketio.emit(
        'PATCH',
        [remove_operation],
        room=f'/users/{corpus.user.hashid}'
    )
@db.event.listens_for(Corpus, 'after_insert')
@db.event.listens_for(CorpusFile, 'after_insert')
@db.event.listens_for(Job, 'after_insert')
@db.event.listens_for(JobInput, 'after_insert')
@db.event.listens_for(JobResult, 'after_insert')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_insert')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_insert')
def resource_after_insert_handler(mapper, connection, resource):
    '''
    Emit a JSON Patch "add" operation for an inserted resource to the
    Socket.IO room of the user that owns it.

    The value is the serialized resource in which every relationship of the
    mapper is set to an empty dict.
    '''
    serialized = resource.to_json_serializeable()
    serialized.update(
        {relationship.key: {} for relationship in mapper.relationships}
    )
    add_operation = {
        'op': 'add',
        'path': resource.jsonpatch_path,
        'value': serialized
    }
    socketio.emit(
        'PATCH',
        [add_operation],
        room=f'/users/{resource.user_hashid}'
    )
@db.event.listens_for(CorpusFollowerAssociation, 'after_insert')
def cfa_after_insert_handler(mapper, connection, cfa):
    '''
    Emit a JSON Patch "add" operation for an inserted corpus follower
    association to the Socket.IO room of the corpus owner.
    '''
    serialized = cfa.to_json_serializeable()
    corpus = cfa.corpus
    jsonpatch_path = f'/users/{corpus.user.hashid}/corpora/{corpus.hashid}/corpus_follower_associations/{cfa.hashid}'
    add_operation = {
        'op': 'add',
        'path': jsonpatch_path,
        'value': serialized
    }
    socketio.emit(
        'PATCH',
        [add_operation],
        room=f'/users/{corpus.user.hashid}'
    )
@db.event.listens_for(Corpus, 'after_update')
@db.event.listens_for(CorpusFile, 'after_update')
@db.event.listens_for(Job, 'after_update')
@db.event.listens_for(JobInput, 'after_update')
@db.event.listens_for(JobResult, 'after_update')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_update')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_update')
def resource_after_update_handler(mapper, connection, resource):
    '''
    Emit one JSON Patch "replace" operation per changed, non-relationship
    attribute of an updated resource to the Socket.IO room of its owner.

    datetime values are sent as ISO strings with a trailing "Z", Enum values
    by their name. Nothing is emitted when no attribute changed.
    '''
    operations = []
    for attr in db.inspect(resource).attrs:
        # Relationships and unchanged attributes are not part of the patch
        if (
            attr.key in mapper.relationships
            or not attr.load_history().has_changes()
        ):
            continue
        value = attr.value
        if isinstance(value, datetime):
            value = f'{value.isoformat()}Z'
        elif isinstance(value, Enum):
            value = value.name
        operations.append(
            {
                'op': 'replace',
                'path': f'{resource.jsonpatch_path}/{attr.key}',
                'value': value
            }
        )
    if not operations:
        return
    socketio.emit(
        'PATCH',
        operations,
        room=f'/users/{resource.user_hashid}'
    )
@db.event.listens_for(Job, 'after_update')
def job_after_update_handler(mapper, connection, job):
    '''
    Send a status notification mail to the job owner when the job status
    changed.

    No mail is sent if the owner's notification level is NONE, or if it is
    END and the new status is neither COMPLETED nor FAILED.
    '''
    status_attr = next(
        (attr for attr in db.inspect(job).attrs if attr.key == 'status'),
        None
    )
    if status_attr is None or not status_attr.load_history().has_changes():
        return
    notification_level = job.user.setting_job_status_mail_notification_level
    if notification_level == UserSettingJobStatusMailNotificationLevel.NONE:
        return
    if (
        notification_level == UserSettingJobStatusMailNotificationLevel.END
        and job.status not in [JobStatus.COMPLETED, JobStatus.FAILED]
    ):
        return
    msg = create_message(
        job.user.email,
        f'Status update for your Job "{job.title}"',
        'tasks/email/notification',
        job=job
    )
    mail.send(msg)
# endregion event_handlers
2019-11-04 15:06:54 +01:00
2019-08-22 09:35:23 +02:00
##############################################################################
# misc #
##############################################################################
# region misc
@login.user_loader
def load_user(user_id):
    '''
    Flask-Login user loader: return the User with the given id.

    Returns None if user_id cannot be converted to an integer or if the
    query finds no user, so that an invalid id does not raise.
    '''
    try:
        primary_key = int(user_id)
    except (TypeError, ValueError):
        return None
    return User.query.get(primary_key)
# endregion misc