2021-09-14 10:52:23 +00:00
|
|
|
from datetime import datetime, timedelta
|
2022-02-08 11:26:20 +00:00
|
|
|
from enum import IntEnum
|
2020-12-03 14:13:24 +00:00
|
|
|
from flask import current_app, url_for
|
2021-12-13 11:20:01 +00:00
|
|
|
from flask_hashids import HashidMixin
|
2021-09-15 10:31:53 +00:00
|
|
|
from flask_login import UserMixin
|
2019-08-22 07:35:23 +00:00
|
|
|
from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer
|
2020-07-09 13:07:43 +00:00
|
|
|
from time import sleep
|
2022-02-03 11:39:16 +00:00
|
|
|
from tqdm import tqdm
|
2019-07-05 12:47:35 +00:00
|
|
|
from werkzeug.security import generate_password_hash, check_password_hash
|
2021-12-13 11:20:01 +00:00
|
|
|
from . import db, login
|
2021-09-14 10:52:23 +00:00
|
|
|
import base64
|
2022-02-03 11:39:16 +00:00
|
|
|
import json
|
2020-07-09 13:07:43 +00:00
|
|
|
import os
|
2022-02-03 11:39:16 +00:00
|
|
|
import requests
|
2020-07-09 13:07:43 +00:00
|
|
|
import shutil
|
2022-02-03 11:39:16 +00:00
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
import yaml
|
2019-07-05 12:47:35 +00:00
|
|
|
|
|
|
|
|
2022-02-08 11:26:20 +00:00
|
|
|
class CorpusStatus(IntEnum):
    '''
    Integer-valued states a corpus can be in.

    The members are plain integers (IntEnum), so they compare equal to the
    numeric values listed here.
    '''
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9
|
|
|
|
|
|
|
|
|
|
|
|
class JobStatus(IntEnum):
    '''
    Integer-valued states a job can be in.

    The members are plain integers (IntEnum), so they compare equal to the
    numeric values listed here.
    '''
    INITIALIZING = 1
    SUBMITTED = 2
    QUEUED = 3
    RUNNING = 4
    CANCELING = 5
    CANCELED = 6
    COMPLETED = 7
    FAILED = 8
|
|
|
|
|
|
|
|
|
|
|
|
class JobStatusMailNotificationLevel(IntEnum):
    '''
    Integer-valued choices for a user's job status mail notification
    setting.
    '''
    NONE = 1
    END = 2
    ALL = 3
|
|
|
|
|
|
|
|
|
|
|
|
class Permission(IntEnum):
    '''
    User permissions, each one a distinct power of 2.

    Because every member occupies its own bit, several permissions can be
    combined in a single integer and an individual permission can be tested
    with the bitwise operator &.
    '''
    ADMINISTRATE = 1 << 2  # 4
    CONTRIBUTE = 1 << 1  # 2
    USE_API = 1 << 0  # 1
|
2021-11-30 15:22:16 +00:00
|
|
|
|
|
|
|
|
|
|
|
class FileMixin:
    '''
    Mixin that adds file related columns to a model.

    It contributes the columns creation_date, filename, last_edited_date
    and mimetype and a helper that serializes exactly these four values.
    '''
    # Both date columns are filled with datetime.utcnow when a row is created.
    creation_date = db.Column(db.DateTime, default=datetime.utcnow)
    filename = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime, default=datetime.utcnow)
    mimetype = db.Column(db.String(255))

    def file_mixin_to_dict(self, backrefs=False, relationships=False):
        '''
        Return the mixin's columns as a dictionary.

        The two dates are rendered as ISO 8601 strings with a trailing 'Z'.
        The backrefs and relationships arguments are accepted but not used.
        '''
        created = self.creation_date.isoformat() + 'Z'
        edited = self.last_edited_date.isoformat() + 'Z'
        serialized = {}
        serialized['creation_date'] = created
        serialized['filename'] = self.filename
        serialized['last_edited_date'] = edited
        serialized['mimetype'] = self.mimetype
        return serialized
|
|
|
|
|
|
|
|
|
|
|
|
class Role(HashidMixin, db.Model):
    '''
    A named set of permissions that can be assigned to users.

    The permissions of a role are stored in a single integer that is used
    as a bit field; has_permission shows how one permission is tested.
    '''
    __tablename__ = 'roles'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Fields
    default = db.Column(db.Boolean, default=False, index=True)
    name = db.Column(db.String(64), unique=True)
    permissions = db.Column(db.Integer)
    # Relationships
    users = db.relationship('User', backref='role', lazy='dynamic')

    def __init__(self, **kwargs):
        '''
        Create a role; a role without explicit permissions starts at 0.
        '''
        super().__init__(**kwargs)
        # The permissions column has no default, so normalize None to "no
        # permissions" to keep the bitwise operations below working.
        if self.permissions is None:
            self.permissions = 0

    def __repr__(self):
        return f'<Role {self.name}>'

    def add_permission(self, permission):
        '''
        Grant the given permission bit(s) to this role.

        A bitwise OR is used so that granting an already granted (or a
        partially granted multi-bit) permission can never corrupt other
        bits, which an arithmetic addition could.
        '''
        self.permissions |= permission

    def has_permission(self, permission):
        '''
        Return True if every bit of permission is set for this role.
        '''
        return self.permissions & permission == permission

    def remove_permission(self, permission):
        '''
        Revoke the given permission bit(s) from this role.

        The bits are cleared with a bitwise AND; bits that are not set stay
        untouched.
        '''
        self.permissions &= ~permission

    def reset_permissions(self):
        '''
        Revoke all permissions of this role.
        '''
        self.permissions = 0

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Serialize the role into a dictionary.

        With relationships=True the result additionally contains the key
        'users', mapping the hashid of each user of this role to the
        user's own dictionary representation. The backrefs argument is
        accepted but not used.
        '''
        dict_role = {
            'id': self.hashid,
            'default': self.default,
            'name': self.name,
            'permissions': self.permissions
        }
        if relationships:
            # This has to be an assignment. The previous
            # "dict_role['users']: {...}" was only an annotation, which is
            # never evaluated inside a function, so the key was never set.
            dict_role['users'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.users
            }
        return dict_role

    @staticmethod
    def insert_defaults():
        '''
        Create or update the default roles and commit them.

        Existing roles (looked up by name) are reset and get exactly the
        permissions listed here; the role named 'User' is flagged as the
        default role.
        '''
        roles = {
            'User': [],
            'API user': [Permission.USE_API],
            'Contributor': [Permission.CONTRIBUTE],
            'Administrator': [
                Permission.ADMINISTRATE,
                Permission.CONTRIBUTE,
                Permission.USE_API
            ]
        }
        default_role_name = 'User'
        for role_name, permissions in roles.items():
            role = Role.query.filter_by(name=role_name).first()
            if role is None:
                role = Role(name=role_name)
            role.reset_permissions()
            for permission in permissions:
                role.add_permission(permission)
            role.default = role.name == default_role_name
            db.session.add(role)
        db.session.commit()
|
|
|
|
|
2019-07-05 12:47:35 +00:00
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
class User(HashidMixin, UserMixin, db.Model):
    '''
    A user account together with its settings, tokens and owned objects.

    A user belongs to one role (backref "role") and owns Tesseract OCR
    models, corpora and jobs, all of which are deleted with the user
    (cascade 'all, delete-orphan').
    '''
    __tablename__ = 'users'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    role_id = db.Column(db.Integer, db.ForeignKey('roles.id'))
    # Fields
    confirmed = db.Column(db.Boolean, default=False)
    email = db.Column(db.String(254), unique=True, index=True)
    last_seen = db.Column(db.DateTime(), default=datetime.utcnow)
    member_since = db.Column(db.DateTime(), default=datetime.utcnow)
    password_hash = db.Column(db.String(128))
    token = db.Column(db.String(32), index=True, unique=True)
    token_expiration = db.Column(db.DateTime)
    username = db.Column(db.String(64), unique=True, index=True)
    setting_dark_mode = db.Column(db.Boolean, default=False)
    # Raw integer behind the setting_job_status_mail_notification_level
    # property; the database column itself carries the shorter name. The
    # default 2 is the value of JobStatusMailNotificationLevel.END.
    setting_job_status_mail_notification_level_enum_value = db.Column(
        'setting_job_status_mail_notification_level',
        db.Integer,
        default=2
    )
    # Backrefs: role: Role
    # Relationships
    tesseract_ocr_models = db.relationship(
        'TesseractOCRModel',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    corpora = db.relationship(
        'Corpus',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    jobs = db.relationship(
        'Job',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __init__(self, **kwargs):
        '''
        Create a user and assign a role if none was passed.

        The user whose email equals the NOPAQUE_ADMIN configuration value
        gets the 'Administrator' role, everybody else the role flagged as
        default.
        '''
        super().__init__(**kwargs)
        if self.role is not None:
            return
        if self.email == current_app.config['NOPAQUE_ADMIN']:
            self.role = Role.query.filter_by(name='Administrator').first()
        else:
            self.role = Role.query.filter_by(default=True).first()

    def __repr__(self):
        return f'<User {self.username}>'

    @property
    def jsonpatch_path(self):
        '''
        Path of this user in the form '/users/<hashid>'.
        '''
        return f'/users/{self.hashid}'

    @property
    def password(self):
        '''
        Write-only attribute; reading it raises an AttributeError.
        '''
        raise AttributeError('password is not a readable attribute')

    @password.setter
    def password(self, password):
        # Only the hash of the password is kept.
        self.password_hash = generate_password_hash(password)

    @property
    def path(self):
        '''
        Directory of this user: <NOPAQUE_DATA_DIR>/users/<id>.
        '''
        return os.path.join(
            current_app.config.get('NOPAQUE_DATA_DIR'), 'users', str(self.id))

    @property
    def setting_job_status_mail_notification_level(self):
        '''
        The notification setting as JobStatusMailNotificationLevel member.
        '''
        return JobStatusMailNotificationLevel(
            self.setting_job_status_mail_notification_level_enum_value
        )

    @setting_job_status_mail_notification_level.setter
    def setting_job_status_mail_notification_level(self, enum_member):
        '''
        Store the integer value of the given enum member.

        Raises a TypeError if enum_member is not a
        JobStatusMailNotificationLevel member.
        '''
        if not isinstance(enum_member, JobStatusMailNotificationLevel):
            # The error has to be raised. Returning the exception object
            # (as done before) is ignored by a property setter, so invalid
            # values were silently dropped.
            raise TypeError(
                'enum_member must be a JobStatusMailNotificationLevel member'
            )
        self.setting_job_status_mail_notification_level_enum_value = \
            enum_member.value

    def can(self, permission):
        '''
        Return True if the role of this user has the given permission.
        '''
        return self.role.has_permission(permission)

    def confirm(self, token):
        '''
        Mark the user as confirmed if token is a valid confirmation token.

        Returns False if the token cannot be verified or was issued for a
        different user, True otherwise. The session is not committed here.
        '''
        s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
        try:
            data = s.loads(token.encode('utf-8'))
        except BadSignature:
            return False
        if data.get('confirm') != self.hashid:
            return False
        self.confirmed = True
        db.session.add(self)
        return True

    def delete(self):
        '''
        Remove the user's directory and mark the user for deletion.

        Errors while removing the directory are ignored. The session is not
        committed here.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def generate_confirmation_token(self, expiration=3600):
        '''
        Return a signed token carrying {'confirm': <hashid>} as a string.

        expiration is handed to the serializer as the token lifetime.
        '''
        s = TimedJSONWebSignatureSerializer(
            current_app.config['SECRET_KEY'], expiration)
        return s.dumps({'confirm': self.hashid}).decode('utf-8')

    def generate_reset_token(self, expiration=3600):
        '''
        Return a signed token carrying {'reset': <hashid>} as a string.

        expiration is handed to the serializer as the token lifetime.
        '''
        s = TimedJSONWebSignatureSerializer(
            current_app.config['SECRET_KEY'], expiration)
        return s.dumps({'reset': self.hashid}).decode('utf-8')

    def get_token(self, expires_in=3600):
        '''
        Return the user's token, creating a new one if necessary.

        The stored token is reused as long as it stays valid for more than
        another 60 seconds. Otherwise a new random token that expires after
        expires_in seconds is generated. The session is not committed here.
        '''
        now = datetime.utcnow()
        # A token without an expiration date cannot be judged and is
        # replaced instead of failing on the comparison with None.
        if (
            self.token
            and self.token_expiration is not None
            and self.token_expiration > now + timedelta(seconds=60)
        ):
            return self.token
        # 24 random bytes result in exactly 32 base64 characters, which is
        # the length of the token column.
        self.token = base64.b64encode(os.urandom(24)).decode('utf-8')
        self.token_expiration = now + timedelta(seconds=expires_in)
        db.session.add(self)
        return self.token

    def is_administrator(self):
        '''
        Return True if the user has the ADMINISTRATE permission.
        '''
        return self.can(Permission.ADMINISTRATE)

    def makedirs(self):
        '''
        Create the user's directory and its three sub directories.

        os.mkdir raises an OSError, e.g. if a directory already exists.
        '''
        os.mkdir(self.path)
        os.mkdir(os.path.join(self.path, 'tesseract_ocr_models'))
        os.mkdir(os.path.join(self.path, 'corpora'))
        os.mkdir(os.path.join(self.path, 'jobs'))

    def revoke_token(self):
        '''
        Invalidate the current token by moving its expiration to the past.
        '''
        self.token_expiration = datetime.utcnow() - timedelta(seconds=1)

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Serialize the user into a dictionary.

        backrefs=True adds the serialized role under 'role'.
        relationships=True adds 'corpora', 'jobs' and
        'tesseract_ocr_models', each mapping hashids to the serialized
        objects.
        '''
        dict_user = {
            'id': self.hashid,
            'role_id': self.role.hashid,
            'confirmed': self.confirmed,
            'email': self.email,
            'last_seen': self.last_seen.isoformat() + 'Z',
            'member_since': self.member_since.isoformat() + 'Z',
            'username': self.username,
            'settings': {
                'dark_mode': self.setting_dark_mode,
                'job_status_mail_notification_level':
                    self.setting_job_status_mail_notification_level.name
            }
        }
        if backrefs:
            dict_user['role'] = self.role.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            dict_user['corpora'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.corpora
            }
            dict_user['jobs'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.jobs
            }
            dict_user['tesseract_ocr_models'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.tesseract_ocr_models
            }
        return dict_user

    def verify_password(self, password):
        '''
        Return True if password matches the stored password hash.
        '''
        return check_password_hash(self.password_hash, password)

    @staticmethod
    def check_token(token):
        '''
        Return the user owning the given, unexpired token or None.
        '''
        user = User.query.filter_by(token=token).first()
        if user is None:
            return None
        # A missing expiration date is treated like an expired token
        # instead of failing on the comparison with None.
        if (
            user.token_expiration is None
            or user.token_expiration < datetime.utcnow()
        ):
            return None
        return user

    @staticmethod
    def insert_defaults():
        '''
        Create the user 'nopaque' and its directories if it is missing.

        If the directories cannot be created, the error is logged and the
        pending user is rolled back, so the final commit stores nothing.
        '''
        if User.query.filter_by(username='nopaque').first() is not None:
            return
        user = User(username='nopaque')
        db.session.add(user)
        # Flush and refresh so that the id is available; the directory path
        # is derived from it.
        db.session.flush(objects=[user])
        db.session.refresh(user)
        try:
            user.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
        db.session.commit()

    @staticmethod
    def reset_password(token, new_password):
        '''
        Set a new password for the user identified by a reset token.

        Returns False if the token cannot be verified or no user is found,
        True otherwise. The session is not committed here.
        '''
        s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
        try:
            data = s.loads(token.encode('utf-8'))
        except BadSignature:
            return False
        # NOTE(review): generate_reset_token stores the hashid under
        # 'reset', while query.get looks up by primary key (an integer
        # column). This presumably needs the hashid to be decoded first -
        # TODO confirm against the flask_hashids API.
        user = User.query.get(data.get('reset'))
        if user is None:
            return False
        user.password = new_password
        db.session.add(user)
        return True
|
2019-09-09 14:17:59 +00:00
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
|
2022-02-03 11:39:16 +00:00
|
|
|
class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
    '''
    A Tesseract OCR model file owned by a user, with descriptive metadata.
    '''
    __tablename__ = 'tesseract_ocr_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    # compatible_service_versions holds a JSON formatted string (see
    # to_dict and insert_defaults).
    compatible_service_versions = db.Column(db.String(255))
    description = db.Column(db.String(255))
    publisher = db.Column(db.String(128))
    publishing_year = db.Column(db.Integer)
    title = db.Column(db.String(64))
    version = db.Column(db.String(16))
    # Backrefs: user: User

    @property
    def path(self):
        '''
        File path of the model: <user path>/tesseract_ocr_models/<id>.
        '''
        return os.path.join(
            self.user.path,
            'tesseract_ocr_models',
            str(self.id)
        )

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Serialize the model into a dictionary.

        compatible_service_versions is decoded from its JSON string.
        backrefs=True adds the serialized owner under 'user'; the model has
        no relationships, so relationships=True adds nothing.
        '''
        compatible_service_versions = json.loads(self.compatible_service_versions)  # noqa
        dict_tesseract_ocr_model = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'compatible_service_versions': compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publishing_year': self.publishing_year,
            'title': self.title,
            **self.file_mixin_to_dict()
        }
        if backrefs:
            dict_tesseract_ocr_model['user'] = self.user.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            pass
        return dict_tesseract_ocr_model

    @staticmethod
    def _download(url, destination, description):
        '''
        Stream url into the file destination while showing a progress bar.

        The response, the progress bar and the file are closed even if the
        download fails. An HTTP error status raises an exception instead of
        storing the error document as a model file.
        '''
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # Not every server sends a Content-Length header; tqdm accepts
            # None as an unknown total.
            content_length = r.headers.get('Content-Length')
            total = None if content_length is None else int(content_length)
            with tqdm(
                desc=description,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                total=total
            ) as pbar:
                pbar.clear()
                with open(destination, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            pbar.update(len(chunk))
                            f.write(chunk)

    @staticmethod
    def insert_defaults():
        '''
        Insert the models listed in TesseractOCRModel.defaults.yml.

        The YAML file is read from the directory of this module. Models
        that already exist (same title and version) are skipped; all others
        are created for the user 'nopaque' and their files are downloaded
        from the configured url. The session is committed once at the end.
        '''
        user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'TesseractOCRModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            # An empty file is parsed to None; treat it as "no defaults".
            defaults = yaml.safe_load(f) or []
        for m in defaults:
            if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None:  # noqa
                continue
            tesseract_ocr_model = TesseractOCRModel(
                compatible_service_versions=json.dumps(m['compatible_service_versions']),  # noqa
                description=m['description'],
                publisher=m['publisher'],
                publishing_year=m['publishing_year'],
                title=m['title'],
                user=user,
                version=m['version']
            )
            db.session.add(tesseract_ocr_model)
            # Flush and refresh so that the id is available; filename and
            # path are derived from it.
            db.session.flush(objects=[tesseract_ocr_model])
            db.session.refresh(tesseract_ocr_model)
            tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata'  # noqa
            TesseractOCRModel._download(
                m['url'],
                tesseract_ocr_model.path,
                f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})'  # noqa
            )
        db.session.commit()
|
|
|
|
|
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
class JobInput(FileMixin, HashidMixin, db.Model):
    '''
    An input file that belongs to a job.
    '''
    __tablename__ = 'job_inputs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Backrefs: job: Job

    def __repr__(self):
        return f'<JobInput {self.filename}>'

    @property
    def download_url(self):
        '''
        URL of the 'jobs.download_job_input' endpoint for this input.
        '''
        url_values = {'job_id': self.job.id, 'job_input_id': self.id}
        return url_for('jobs.download_job_input', **url_values)

    @property
    def jsonpatch_path(self):
        '''
        Path of this input below the path of its job.
        '''
        parent_path = self.job.jsonpatch_path
        return f'{parent_path}/inputs/{self.hashid}'

    @property
    def path(self):
        '''
        File path of the input: <job path>/inputs/<id>.
        '''
        return os.path.join(self.job.path, 'inputs', str(self.id))

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Serialize the input into a dictionary.

        backrefs=True adds the serialized job under 'job'. The
        relationships argument is accepted but not used.
        '''
        dict_job_input = {
            'id': self.hashid,
            'job_id': self.job.hashid,
            'download_url': self.download_url,
            'url': self.url
        }
        # The file related values come last, like in the other file models.
        dict_job_input.update(self.file_mixin_to_dict())
        if not backrefs:
            return dict_job_input
        dict_job_input['job'] = self.job.to_dict(
            backrefs=True, relationships=False)
        return dict_job_input

    @property
    def url(self):
        '''
        URL of the job page, anchored at this input.
        '''
        anchor = f'job-{self.job.hashid}-input-{self.hashid}'
        return url_for('jobs.job', job_id=self.job_id, _anchor=anchor)

    @property
    def user_hashid(self):
        '''
        Hashid of the user owning the job of this input.
        '''
        owner = self.job.user
        return owner.hashid

    @property
    def user_id(self):
        '''
        Id of the user owning the job of this input.
        '''
        return self.job.user_id
|
|
|
|
|
2019-10-16 14:52:05 +00:00
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
class JobResult(FileMixin, HashidMixin, db.Model):
    '''
    A result file that belongs to a job.
    '''
    __tablename__ = 'job_results'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Fields
    description = db.Column(db.String(255))
    # Backrefs: job: Job

    def __repr__(self):
        return f'<JobResult {self.filename}>'

    @property
    def download_url(self):
        '''
        URL of the 'jobs.download_job_result' endpoint for this result.
        '''
        url_values = {'job_id': self.job_id, 'job_result_id': self.id}
        return url_for('jobs.download_job_result', **url_values)

    @property
    def jsonpatch_path(self):
        '''
        Path of this result below the path of its job.
        '''
        parent_path = self.job.jsonpatch_path
        return f'{parent_path}/results/{self.hashid}'

    @property
    def path(self):
        '''
        File path of the result: <job path>/results/<id>.
        '''
        return os.path.join(self.job.path, 'results', str(self.id))

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Serialize the result into a dictionary.

        backrefs=True adds the serialized job under 'job'. The
        relationships argument is only passed on to file_mixin_to_dict.
        '''
        dict_job_result = {
            'id': self.hashid,
            'job_id': self.job.hashid,
            'description': self.description,
            'download_url': self.download_url,
            'url': self.url
        }
        # The file related values come last, like in the other file models.
        dict_job_result.update(
            self.file_mixin_to_dict(
                backrefs=backrefs, relationships=relationships)
        )
        if not backrefs:
            return dict_job_result
        dict_job_result['job'] = self.job.to_dict(
            backrefs=True, relationships=False)
        return dict_job_result

    @property
    def url(self):
        '''
        URL of the job page, anchored at this result.
        '''
        anchor = f'job-{self.job.hashid}-result-{self.hashid}'
        return url_for('jobs.job', job_id=self.job_id, _anchor=anchor)

    @property
    def user_hashid(self):
        '''
        Hashid of the user owning the job of this result.
        '''
        owner = self.job.user
        return owner.hashid

    @property
    def user_id(self):
        '''
        Id of the user owning the job of this result.
        '''
        return self.job.user_id
|
|
|
|
|
2019-10-17 11:26:20 +00:00
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
class Job(HashidMixin, db.Model):
    '''
    Class to define Jobs.

    A job belongs to a user and owns input and result files, which are
    deleted with the job (cascade 'all, delete-orphan').
    '''
    __tablename__ = 'jobs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    end_date = db.Column(db.DateTime())
    service = db.Column(db.String(64))
    # service_args: Dictionary as JSON formatted string.
    # Example: {"binarization": True}
    service_args = db.Column(db.String(255))
    service_version = db.Column(db.String(16))
    # Raw integer behind the status property; the database column itself is
    # named 'status'. The default 1 is the value of JobStatus.INITIALIZING.
    status_enum_value = db.Column('status', db.Integer, default=1)
    title = db.Column(db.String(32))
    # Backrefs: user: User
    # Relationships
    inputs = db.relationship(
        'JobInput',
        backref='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    results = db.relationship(
        'JobResult',
        backref='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __repr__(self):
        return f'<Job {self.title}>'

    @property
    def jsonpatch_path(self):
        '''
        Path of this job below the path of its user.
        '''
        return f'{self.user.jsonpatch_path}/jobs/{self.hashid}'

    @property
    def path(self):
        '''
        Directory of the job: <user path>/jobs/<id>.
        '''
        return os.path.join(self.user.path, 'jobs', str(self.id))

    @property
    def status(self):
        '''
        The status of the job as JobStatus member.
        '''
        return JobStatus(self.status_enum_value)

    @status.setter
    def status(self, enum_member):
        '''
        Store the integer value of the given JobStatus member.

        Raises a TypeError if enum_member is not a JobStatus member.
        '''
        if not isinstance(enum_member, JobStatus):
            # The error has to be raised. Returning the exception object
            # (as done before) is ignored by a property setter, so invalid
            # values were silently dropped.
            raise TypeError('enum_member must be a JobStatus member')
        self.status_enum_value = enum_member.value

    @property
    def url(self):
        '''
        URL of the 'jobs.job' endpoint for this job.
        '''
        return url_for('jobs.job', job_id=self.id)

    @property
    def user_hashid(self):
        '''
        Hashid of the user owning this job.
        '''
        return self.user.hashid

    def delete(self):
        '''
        Delete the job and its inputs and results from the database.

        A job that is neither completed nor failed is first set to
        CANCELING (committed), then this method blocks, polling once per
        second, until the status read from the database is CANCELED.
        Afterwards the job directory is removed (errors are ignored) and
        the job is marked for deletion; that deletion is not committed
        here.
        '''
        if self.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:  # noqa
            self.status = JobStatus.CANCELING
            db.session.commit()
            while self.status != JobStatus.CANCELED:
                # In case the daemon handled a job in any way
                if self.status != JobStatus.CANCELING:
                    self.status = JobStatus.CANCELING
                    db.session.commit()
                sleep(1)
                # Reload the row to see status changes made elsewhere.
                db.session.refresh(self)
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def makedirs(self):
        '''
        Create the job's directory and its three sub directories.

        os.mkdir raises an OSError, e.g. if a directory already exists.
        '''
        os.mkdir(self.path)
        os.mkdir(os.path.join(self.path, 'inputs'))
        os.mkdir(os.path.join(self.path, 'pipeline_data'))
        os.mkdir(os.path.join(self.path, 'results'))

    def restart(self):
        '''
        Restart a job - only if the status is complete or failed

        Removes the results (files and database entries), clears the end
        date and sets the status back to SUBMITTED. Raises an Exception for
        any other status. The session is not committed here.
        '''
        if self.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:  # noqa
            raise Exception('Could not restart job: status is not "completed/failed"')  # noqa
        shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
        # NOTE(review): makedirs creates 'pipeline_data', not 'pyflow.data';
        # confirm which directory is meant to be cleaned up here.
        shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True)  # noqa
        for result in self.results:
            db.session.delete(result)
        self.end_date = None
        self.status = JobStatus.SUBMITTED

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Serialize the job into a dictionary.

        service_args is decoded from its JSON string. For the service
        'tesseract-ocr' the value of its 'model' entry is replaced by the
        title of the referenced Tesseract OCR model, if that model exists.
        backrefs=True adds the serialized user under 'user';
        relationships=True adds 'inputs' and 'results', each mapping
        hashids to the serialized objects.
        '''
        service_args = json.loads(self.service_args)
        if self.service == 'tesseract-ocr' and 'model' in service_args:
            tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model'])  # noqa
            # The referenced model may not exist (anymore); keep the stored
            # value in that case instead of failing on None.
            if tesseract_ocr_pipeline_model is not None:
                service_args['model'] = tesseract_ocr_pipeline_model.title
        dict_job = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'creation_date': self.creation_date.isoformat() + 'Z',
            'description': self.description,
            'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z',  # noqa
            'service': self.service,
            'service_args': service_args,
            'service_version': self.service_version,
            'status': self.status.name,
            'title': self.title,
            'url': self.url
        }
        if backrefs:
            dict_job['user'] = self.user.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            dict_job['inputs'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.inputs
            }
            dict_job['results'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.results
            }
        return dict_job
|
2019-08-16 07:49:27 +00:00
|
|
|
|
2019-08-05 14:45:38 +00:00
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
class CorpusFile(FileMixin, HashidMixin, db.Model):
|
2019-10-16 14:52:05 +00:00
|
|
|
__tablename__ = 'corpus_files'
|
|
|
|
# Primary key
|
|
|
|
id = db.Column(db.Integer, primary_key=True)
|
2020-04-29 10:17:16 +00:00
|
|
|
# Foreign keys
|
|
|
|
corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
|
2020-04-27 08:30:38 +00:00
|
|
|
# Fields
|
2020-01-08 15:02:42 +00:00
|
|
|
address = db.Column(db.String(255))
|
|
|
|
author = db.Column(db.String(255))
|
|
|
|
booktitle = db.Column(db.String(255))
|
|
|
|
chapter = db.Column(db.String(255))
|
|
|
|
editor = db.Column(db.String(255))
|
|
|
|
institution = db.Column(db.String(255))
|
|
|
|
journal = db.Column(db.String(255))
|
|
|
|
pages = db.Column(db.String(255))
|
|
|
|
publisher = db.Column(db.String(255))
|
2019-10-28 14:46:25 +00:00
|
|
|
publishing_year = db.Column(db.Integer)
|
2020-01-08 15:02:42 +00:00
|
|
|
school = db.Column(db.String(255))
|
|
|
|
title = db.Column(db.String(255))
|
2021-11-30 15:22:16 +00:00
|
|
|
# Backrefs: corpus: Corpus
|
2019-10-16 14:52:05 +00:00
|
|
|
|
2020-12-03 14:13:24 +00:00
|
|
|
@property
|
|
|
|
def download_url(self):
|
2021-11-30 15:22:16 +00:00
|
|
|
return url_for(
|
|
|
|
'corpora.download_corpus_file',
|
|
|
|
corpus_id=self.corpus_id,
|
|
|
|
corpus_file_id=self.id
|
|
|
|
)
|
2020-12-03 14:13:24 +00:00
|
|
|
|
2021-09-10 14:25:32 +00:00
|
|
|
@property
|
|
|
|
def jsonpatch_path(self):
|
2021-12-01 15:03:55 +00:00
|
|
|
return f'{self.corpus.jsonpatch_path}/files/{self.hashid}'
|
2021-09-10 14:25:32 +00:00
|
|
|
|
2020-11-13 09:01:51 +00:00
|
|
|
@property
|
|
|
|
def path(self):
|
2022-02-03 11:39:16 +00:00
|
|
|
return os.path.join(self.corpus.path, 'files', str(self.id))
|
2020-11-13 09:01:51 +00:00
|
|
|
|
2020-12-03 14:13:24 +00:00
|
|
|
@property
|
|
|
|
def url(self):
|
2021-11-30 15:22:16 +00:00
|
|
|
return url_for(
|
|
|
|
'corpora.corpus_file',
|
|
|
|
corpus_id=self.corpus_id,
|
|
|
|
corpus_file_id=self.id
|
|
|
|
)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def user_hashid(self):
|
|
|
|
return self.corpus.user.hashid
|
2020-12-03 14:13:24 +00:00
|
|
|
|
2021-09-10 14:25:32 +00:00
|
|
|
@property
|
|
|
|
def user_id(self):
|
|
|
|
return self.corpus.user_id
|
|
|
|
|
2019-10-30 07:28:52 +00:00
|
|
|
def delete(self):
|
2020-07-10 09:36:54 +00:00
|
|
|
try:
|
2020-11-13 09:01:51 +00:00
|
|
|
os.remove(self.path)
|
2020-07-10 09:36:54 +00:00
|
|
|
except OSError:
|
2021-09-16 09:15:31 +00:00
|
|
|
current_app.logger.error(
|
2021-12-13 11:20:01 +00:00
|
|
|
f'Removing {self.path} led to an OSError!'
|
2021-09-16 09:15:31 +00:00
|
|
|
)
|
2020-07-10 09:36:54 +00:00
|
|
|
pass
|
2019-10-30 07:28:52 +00:00
|
|
|
db.session.delete(self)
|
2022-02-08 11:26:20 +00:00
|
|
|
self.corpus.status = CorpusStatus.UNPREPARED
|
2019-10-30 07:28:52 +00:00
|
|
|
|
2021-11-30 15:22:16 +00:00
|
|
|
def to_dict(self, backrefs=False, relationships=False):
    '''
    Serialize this corpus file into a dictionary.

    The result contains the hashids of the file and its corpus, both
    URLs, the bibliographic metadata fields and everything returned by
    file_mixin_to_dict (whose keys take precedence on a clash). If
    backrefs is true, the parent corpus is embedded under 'corpus'.
    '''
    metadata_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title'
    )
    serialized = {
        'id': self.hashid,
        'corpus_id': self.corpus.hashid,
        'download_url': self.download_url,
        'url': self.url
    }
    for field_name in metadata_fields:
        serialized[field_name] = getattr(self, field_name)
    mixin_part = self.file_mixin_to_dict(
        backrefs=backrefs,
        relationships=relationships
    )
    serialized.update(mixin_part)
    if backrefs:
        serialized['corpus'] = self.corpus.to_dict(
            backrefs=True,
            relationships=False
        )
    return serialized
|
2021-11-30 15:22:16 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.

    A corpus references one user via user_id and holds a dynamic
    collection of CorpusFile rows in files. Its status is persisted as an
    integer column and exposed as a CorpusStatus member through the
    status property.
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime(), default=datetime.utcnow)
    # Integer value of a CorpusStatus member, stored in the 'status'
    # column. The default 1 is CorpusStatus.UNPREPARED. Use the status
    # property instead of touching this attribute directly.
    status_enum_value = db.Column('status', db.Integer, default=1)
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    archive_file = db.Column(db.String(255))
    # Backrefs: user: User
    # Relationships
    files = db.relationship(
        'CorpusFile',
        backref='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    # "static" attributes
    # 2147483647 == 2**31 - 1
    max_num_tokens = 2147483647

    def __repr__(self):
        '''
        String representation of the Corpus. For human readability.
        '''
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        '''
        URL of the 'corpora.analyse_corpus' endpoint for this corpus.
        '''
        return url_for('corpora.analyse_corpus', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        '''
        JSON Patch path of this corpus: the user's jsonpatch_path followed
        by '/corpora/' and this corpus' hashid.
        '''
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self):
        '''
        Filesystem directory of this corpus: <user path>/corpora/<id>.
        '''
        return os.path.join(self.user.path, 'corpora', str(self.id))

    @property
    def status(self):
        '''
        Current status as a CorpusStatus member.
        '''
        return CorpusStatus(self.status_enum_value)

    @status.setter
    def status(self, enum_member):
        '''
        Set the status from a CorpusStatus member.

        Raises TypeError if enum_member is not a CorpusStatus member.
        '''
        if not isinstance(enum_member, CorpusStatus):
            # The exception has to be raised, not returned: the return
            # value of a property setter is discarded, which would turn
            # an invalid assignment into a silent no-op.
            raise TypeError(
                f'status must be a CorpusStatus member, got {enum_member!r}'
            )
        self.status_enum_value = enum_member.value

    @property
    def url(self):
        '''
        URL of the 'corpora.corpus' endpoint for this corpus.
        '''
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        '''
        Hashid of the user this corpus is attached to.
        '''
        return self.user.hashid

    def build(self):
        '''
        Merge all corpus files into <path>/cwb/corpus.vrt.

        The root element of every corpus file is annotated with the
        file's bibliographic metadata and placed inside one <corpus>
        element, which is then written to disk UTF-8 encoded. Afterwards
        last_edited_date is refreshed and the status is set to SUBMITTED.
        '''
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            element_tree = ET.parse(corpus_file.path)
            text_element = element_tree.getroot()
            # Optional metadata falls back to the literal string 'NULL'.
            text_element.set('address', corpus_file.address or 'NULL')
            text_element.set('author', corpus_file.author)
            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
            text_element.set('chapter', corpus_file.chapter or 'NULL')
            text_element.set('editor', corpus_file.editor or 'NULL')
            text_element.set(
                'institution',
                corpus_file.institution or 'NULL'
            )
            text_element.set('journal', corpus_file.journal or 'NULL')
            text_element.set('pages', corpus_file.pages or 'NULL')
            text_element.set('publisher', corpus_file.publisher or 'NULL')
            text_element.set(
                'publishing_year',
                str(corpus_file.publishing_year)
            )
            text_element.set('school', corpus_file.school or 'NULL')
            text_element.set('title', corpus_file.title)
            # NOTE(review): inserting at index 1 keeps the first file in
            # front and puts every later file directly behind it, so the
            # remaining files end up in reverse iteration order. Kept
            # as-is; confirm whether append() was intended.
            corpus_element.insert(1, text_element)
        ET.ElementTree(corpus_element).write(
            os.path.join(self.path, 'cwb', 'corpus.vrt'),
            encoding='utf-8'
        )
        self.last_edited_date = datetime.utcnow()
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        '''
        Remove the corpus directory tree (errors are ignored) and mark the
        row for deletion in the database session.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def makedirs(self):
        '''
        Create the corpus directory with its 'files', 'cwb', 'cwb/data'
        and 'cwb/registry' subdirectories.

        Uses os.mkdir, so an OSError is raised if a directory already
        exists or the parent of the corpus directory is missing.
        '''
        os.mkdir(self.path)
        os.mkdir(os.path.join(self.path, 'files'))
        os.mkdir(os.path.join(self.path, 'cwb'))
        os.mkdir(os.path.join(self.path, 'cwb', 'data'))
        os.mkdir(os.path.join(self.path, 'cwb', 'registry'))

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Serialize this corpus into a dictionary.

        Dates are rendered in ISO 8601 format with a trailing 'Z' and the
        status is given by its enum member name. If backrefs is true the
        user is embedded under 'user'; if relationships is true the
        corpus files are embedded under 'files', keyed by their hashid.
        '''
        dict_corpus = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'analysis_url': self.analysis_url,
            'url': self.url,
            'creation_date': self.creation_date.isoformat() + 'Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'last_edited_date': self.last_edited_date.isoformat() + 'Z',
            'title': self.title
        }
        if backrefs:
            dict_corpus['user'] = self.user.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            dict_corpus['files'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.files
            }
        return dict_corpus
|
2019-11-04 14:06:54 +00:00
|
|
|
|
2019-08-22 07:35:23 +00:00
|
|
|
|
2021-09-15 10:31:53 +00:00
|
|
|
@login.user_loader
def load_user(user_id):
    '''
    Callback registered through login.user_loader.

    Converts user_id to an integer and returns the result of looking
    that primary key up via User.query.get.
    '''
    primary_key = int(user_id)
    return User.query.get(primary_key)
|