nopaque/app/models.py

930 lines
30 KiB
Python
Raw Normal View History

2022-04-12 14:11:40 +00:00
from app.converters.vrt import normalize_vrt_file
from datetime import datetime, timedelta
from enum import IntEnum
2020-12-03 14:13:24 +00:00
from flask import current_app, url_for
2021-12-13 11:20:01 +00:00
from flask_hashids import HashidMixin
from flask_login import UserMixin
2019-08-22 07:35:23 +00:00
from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer
from time import sleep
from tqdm import tqdm
from werkzeug.security import generate_password_hash, check_password_hash
2021-12-13 11:20:01 +00:00
from . import db, login
import base64
import json
import os
import requests
import shutil
import xml.etree.ElementTree as ET
import yaml
class IntEnumColumn(db.TypeDecorator):
    '''
    Column type that persists the members of an integer enum as integers.

    Values are written to the database as the integer value of the enum
    member and are turned back into members of ``enum_type`` when loaded.
    '''
    impl = db.Integer

    def __init__(self, enum_type, *args, **kwargs):
        '''
        :param enum_type: enum class whose members are stored in the column.
            All remaining arguments are passed on to ``db.TypeDecorator``.
        '''
        super().__init__(*args, **kwargs)
        self.enum_type = enum_type

    def process_bind_param(self, value, dialect):
        '''
        Convert a Python value into the integer that is sent to the database.

        :param value: a member of ``enum_type``, a plain integer that maps to
            a member of ``enum_type`` or None.
        :returns: the integer value of the member, or None for None.
        :raises TypeError: if value is neither an enum member nor an integer.
        :raises ValueError: if an integer does not map to a member.
        '''
        if value is None:
            # NULL is passed through untouched so nullable columns work
            return None
        if isinstance(value, self.enum_type) and isinstance(value.value, int):
            return value.value
        if isinstance(value, int):
            # Round trip through the enum to reject unknown integers
            return self.enum_type(value).value
        raise TypeError(
            f'Expected {self.enum_type.__name__} or int, '
            f'got {type(value).__name__}'
        )

    def process_result_value(self, value, dialect):
        '''
        Convert an integer loaded from the database into an enum member.

        :returns: the matching member of ``enum_type``, or None for NULL.
        '''
        if value is None:
            return None
        return self.enum_type(value)
class ContainerColumn(db.TypeDecorator):
    '''
    Column type that persists a container (e.g. a dict or a list) as a JSON
    encoded string.
    '''
    impl = db.String

    def __init__(self, container_type, *args, **kwargs):
        '''
        :param container_type: the type values of this column must have.
            All remaining arguments are passed on to ``db.TypeDecorator``.
        '''
        super().__init__(*args, **kwargs)
        self.container_type = container_type

    def process_bind_param(self, value, dialect):
        '''
        Convert a Python value into the JSON string sent to the database.

        :param value: an instance of ``container_type``, a JSON string that
            decodes to an instance of ``container_type`` or None.
        :returns: the JSON string, or None for None.
        :raises TypeError: if value has an unsupported type or the JSON
            string decodes to something other than ``container_type``.
        :raises ValueError: if a string value is not valid JSON.
        '''
        if value is None:
            # NULL is passed through untouched so nullable columns work
            return None
        if isinstance(value, self.container_type):
            return json.dumps(value)
        if (
            isinstance(value, str)
            and isinstance(json.loads(value), self.container_type)
        ):
            # Already serialized, store the string as it is
            return value
        raise TypeError(
            f'Expected {self.container_type.__name__} or a JSON string '
            f'encoding one, got {type(value).__name__}'
        )

    def process_result_value(self, value, dialect):
        '''
        Decode the JSON string loaded from the database.

        :returns: the decoded container, or None for NULL.
        '''
        if value is None:
            return None
        return json.loads(value)
2021-11-30 15:22:16 +00:00
class FileMixin:
    '''
    Mixin that adds file related columns (filename, mimetype and two
    timestamps) to a model.
    '''
    creation_date = db.Column(db.DateTime, default=datetime.utcnow)
    filename = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime, default=datetime.utcnow)
    mimetype = db.Column(db.String(255))

    def file_mixin_to_dict(self, backrefs=False, relationships=False):
        '''
        Return the mixin columns as a dict.

        Both timestamps are rendered in ISO 8601 format with a 'Z' appended.
        The backrefs and relationships arguments are accepted but not used.
        '''
        created = self.creation_date.isoformat()
        edited = self.last_edited_date.isoformat()
        return dict(
            creation_date=created + 'Z',
            filename=self.filename,
            last_edited_date=edited + 'Z',
            mimetype=self.mimetype
        )
class Permission(IntEnum):
    '''
    User permissions, each one represented by its own bit (a power of 2).
    Whether a permission is contained in a combined value can be tested with
    the bitwise operator &.
    '''
    ADMINISTRATE = 1 << 0
    CONTRIBUTE = 1 << 1
    USE_API = 1 << 2
2021-11-30 15:22:16 +00:00
class Role(HashidMixin, db.Model):
    '''
    Named set of permissions that is assigned to users.

    The permissions are stored in one integer that is used as a bit mask of
    Permission values.
    '''
    __tablename__ = 'roles'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Fields
    default = db.Column(db.Boolean, default=False, index=True)
    name = db.Column(db.String(64), unique=True)
    permissions = db.Column(db.Integer)
    # Relationships
    users = db.relationship('User', backref='role', lazy='dynamic')

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # A new role starts without any permission
        if self.permissions is None:
            self.permissions = 0

    def __repr__(self):
        return f'<Role {self.name}>'

    def add_permission(self, permission):
        '''
        Grant permission to the role. Permissions the role already holds
        are left untouched.
        '''
        # Bitwise or is idempotent and cannot carry into unrelated bits
        self.permissions |= permission

    def has_permission(self, permission):
        '''
        Return True if every bit of permission is set for the role.
        '''
        return self.permissions & permission == permission

    def remove_permission(self, permission):
        '''
        Revoke permission from the role. Permissions the role does not hold
        are ignored.
        '''
        self.permissions &= ~permission

    def reset_permissions(self):
        '''
        Revoke all permissions from the role.
        '''
        self.permissions = 0

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the role.

        :param backrefs: accepted for a uniform signature, a role has no
            backrefs.
        :param relationships: if True, the users of the role are included
            as a dict that maps the hashid of each user to its dict.
        '''
        dict_role = {
            'id': self.hashid,
            'default': self.default,
            'name': self.name,
            'permissions': self.permissions
        }
        if relationships:
            # Must be a mapping: dicts are unhashable and can't be set items
            dict_role['users'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.users
            }
        return dict_role

    @staticmethod
    def insert_defaults():
        '''
        Create the default roles or reset the permissions of already
        existing ones and commit the result. 'User' is flagged as the
        default role.
        '''
        roles = {
            'User': [],
            'API user': [Permission.USE_API],
            'Contributor': [Permission.CONTRIBUTE],
            'Administrator': [
                Permission.ADMINISTRATE,
                Permission.CONTRIBUTE,
                Permission.USE_API
            ]
        }
        default_role_name = 'User'
        for role_name, permissions in roles.items():
            role = Role.query.filter_by(name=role_name).first()
            if role is None:
                role = Role(name=role_name)
            role.reset_permissions()
            for permission in permissions:
                role.add_permission(permission)
            role.default = role.name == default_role_name
            db.session.add(role)
        db.session.commit()
class UserSettingJobStatusMailNotificationLevel(IntEnum):
    '''
    Levels of the user setting for job status mail notifications.
    '''
    # NOTE(review): presumably NONE = no mails, END = only when a job
    # finished, ALL = on every status change - confirm against the mailer
    NONE = 1
    END = 2
    ALL = 3
2021-11-30 15:22:16 +00:00
class User(HashidMixin, UserMixin, db.Model):
    '''
    Class to define a user, its settings, its API token and the data
    (Tesseract OCR models, corpora and jobs) it owns.
    '''
    __tablename__ = 'users'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    role_id = db.Column(db.Integer, db.ForeignKey('roles.id'))
    # Fields
    confirmed = db.Column(db.Boolean, default=False)
    email = db.Column(db.String(254), unique=True, index=True)
    last_seen = db.Column(db.DateTime(), default=datetime.utcnow)
    member_since = db.Column(db.DateTime(), default=datetime.utcnow)
    password_hash = db.Column(db.String(128))
    token = db.Column(db.String(32), index=True, unique=True)
    token_expiration = db.Column(db.DateTime)
    username = db.Column(db.String(64), unique=True, index=True)
    setting_dark_mode = db.Column(db.Boolean, default=False)
    setting_job_status_mail_notification_level = db.Column(
        IntEnumColumn(UserSettingJobStatusMailNotificationLevel),
        default=UserSettingJobStatusMailNotificationLevel.END
    )
    # Backrefs: role: Role
    # Relationships
    tesseract_ocr_models = db.relationship(
        'TesseractOCRModel',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    corpora = db.relationship(
        'Corpus',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    jobs = db.relationship(
        'Job',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __init__(self, **kwargs):
        '''
        Create a user. If no role is passed, the user whose email equals
        the NOPAQUE_ADMIN config value gets the 'Administrator' role, every
        other user gets the role flagged as default.
        '''
        super().__init__(**kwargs)
        if self.role is not None:
            return
        if self.email == current_app.config['NOPAQUE_ADMIN']:
            self.role = Role.query.filter_by(name='Administrator').first()
        else:
            self.role = Role.query.filter_by(default=True).first()

    def __repr__(self):
        return f'<User {self.username}>'

    @property
    def jsonpatch_path(self):
        return f'/users/{self.hashid}'

    @property
    def password(self):
        # Only the hash is stored, so there is nothing that could be read
        raise AttributeError('password is not a readable attribute')

    @password.setter
    def password(self, password):
        self.password_hash = generate_password_hash(password)

    @property
    def path(self):
        '''
        Directory of the user below the NOPAQUE_DATA_DIR config value.
        '''
        return os.path.join(
            current_app.config.get('NOPAQUE_DATA_DIR'), 'users', str(self.id))

    def can(self, permission):
        '''
        Return True if the role of the user holds permission.
        '''
        return self.role.has_permission(permission)

    def confirm(self, token):
        '''
        Mark the user as confirmed if token is a valid confirmation token
        that was issued for this user.

        :returns: True on success, False for an invalid token. The change is
            added to the session but not committed.
        '''
        s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
        try:
            data = s.loads(token.encode('utf-8'))
        except BadSignature:
            return False
        if data.get('confirm') != self.hashid:
            return False
        self.confirmed = True
        db.session.add(self)
        return True

    def delete(self):
        '''
        Remove the directory of the user and mark the user for deletion in
        the session (not committed).
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def generate_confirmation_token(self, expiration=3600):
        '''
        Return a signed token that carries the hashid of the user.

        :param expiration: passed to the serializer as lifetime in seconds.
        '''
        s = TimedJSONWebSignatureSerializer(
            current_app.config['SECRET_KEY'], expiration)
        return s.dumps({'confirm': self.hashid}).decode('utf-8')

    def generate_reset_token(self, expiration=3600):
        '''
        Return a signed password reset token that carries the hashid of the
        user.

        :param expiration: passed to the serializer as lifetime in seconds.
        '''
        s = TimedJSONWebSignatureSerializer(
            current_app.config['SECRET_KEY'], expiration)
        return s.dumps({'reset': self.hashid}).decode('utf-8')

    def get_token(self, expires_in=3600):
        '''
        Return the API token of the user.

        The current token is reused as long as it is valid for more than
        another 60 seconds, otherwise a new one is generated and added to
        the session (not committed).

        :param expires_in: lifetime of a newly generated token in seconds.
        '''
        now = datetime.utcnow()
        if (
            self.token
            and self.token_expiration is not None
            and self.token_expiration > now + timedelta(seconds=60)
        ):
            return self.token
        # 24 random bytes are 32 characters in base64, the column size
        self.token = base64.b64encode(os.urandom(24)).decode('utf-8')
        self.token_expiration = now + timedelta(seconds=expires_in)
        db.session.add(self)
        return self.token

    def is_administrator(self):
        return self.can(Permission.ADMINISTRATE)

    def makedirs(self):
        '''
        Create the directory of the user and its subdirectories.

        :raises OSError: if a directory can't be created, e.g. because it
            already exists.
        '''
        os.mkdir(self.path)
        os.mkdir(os.path.join(self.path, 'tesseract_ocr_models'))
        os.mkdir(os.path.join(self.path, 'corpora'))
        os.mkdir(os.path.join(self.path, 'jobs'))

    def revoke_token(self):
        '''
        Invalidate the API token by moving its expiration into the past.
        '''
        self.token_expiration = datetime.utcnow() - timedelta(seconds=1)

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the user.

        :param backrefs: if True, the role is included.
        :param relationships: if True, corpora, jobs and Tesseract OCR
            models are included as dicts keyed by hashid.
        '''
        dict_user = {
            'id': self.hashid,
            'role_id': self.role.hashid,
            'confirmed': self.confirmed,
            'email': self.email,
            'last_seen': self.last_seen.isoformat() + 'Z',
            'member_since': self.member_since.isoformat() + 'Z',
            'username': self.username,
            'settings': {
                'dark_mode': self.setting_dark_mode,
                'job_status_mail_notification_level':
                    self.setting_job_status_mail_notification_level.name
            }
        }
        if backrefs:
            dict_user['role'] = self.role.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            dict_user['corpora'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.corpora
            }
            dict_user['jobs'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.jobs
            }
            dict_user['tesseract_ocr_models'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.tesseract_ocr_models
            }
        return dict_user

    def verify_password(self, password):
        return check_password_hash(self.password_hash, password)

    @staticmethod
    def check_token(token):
        '''
        Return the user that owns token or None if the token is unknown or
        expired.
        '''
        user = User.query.filter_by(token=token).first()
        if user is None or user.token_expiration is None:
            return None
        if user.token_expiration < datetime.utcnow():
            return None
        return user

    @staticmethod
    def insert_defaults():
        '''
        Create the 'nopaque' user and its directories if it doesn't exist.
        '''
        if User.query.filter_by(username='nopaque').first() is not None:
            return
        user = User(username='nopaque')
        db.session.add(user)
        # Flush and refresh to get the id that is needed for the path
        db.session.flush(objects=[user])
        db.session.refresh(user)
        try:
            user.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
        db.session.commit()

    @staticmethod
    def reset_password(token, new_password):
        '''
        Set a new password for the user a reset token was issued for.

        :returns: True on success, False if the token is invalid or doesn't
            belong to an existing user. The change is added to the session
            but not committed.
        '''
        s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
        try:
            data = s.loads(token.encode('utf-8'))
        except BadSignature:
            return False
        user_hashid = data.get('reset')
        if user_hashid is None:
            return False
        # generate_reset_token() stores the hashid of the user, query.get()
        # expects the integer primary key, so the hashid must be decoded.
        # NOTE(review): assumes the Flask-Hashids extension is registered as
        # current_app.extensions['hashids'] - confirm for the used version
        user_id = current_app.extensions['hashids'].decode(user_hashid)
        if isinstance(user_id, tuple):
            # An invalid hashid decodes to an empty tuple
            if len(user_id) != 1:
                return False
            user_id = user_id[0]
        user = User.query.get(user_id)
        if user is None:
            return False
        user.password = new_password
        db.session.add(user)
        return True
2019-09-09 14:17:59 +00:00
2021-11-30 15:22:16 +00:00
class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
    '''
    Class to define a Tesseract OCR model file that is owned by a user.
    '''
    __tablename__ = 'tesseract_ocr_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    description = db.Column(db.String(255))
    publisher = db.Column(db.String(128))
    publishing_year = db.Column(db.Integer)
    shared = db.Column(db.Boolean, default=False)
    title = db.Column(db.String(64))
    version = db.Column(db.String(16))
    # Backrefs: user: User

    @property
    def path(self):
        '''
        Path of the model file inside the directory of the owning user.
        '''
        return os.path.join(
            self.user.path,
            'tesseract_ocr_models',
            str(self.id)
        )

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the model.

        :param backrefs: if True, the owning user is included.
        :param relationships: accepted for a uniform signature, the model
            has no relationships.
        '''
        dict_tesseract_ocr_model = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publishing_year': self.publishing_year,
            'title': self.title,
            **self.file_mixin_to_dict()
        }
        if backrefs:
            dict_tesseract_ocr_model['user'] = self.user.to_dict(
                backrefs=True, relationships=False)
        return dict_tesseract_ocr_model

    @staticmethod
    def insert_defaults():
        '''
        Synchronize the models listed in TesseractOCRModel.defaults.yml
        (located next to this module) with the database.

        Entries that already exist (same title and version) get their
        metadata updated. New entries are created as shared models of the
        'nopaque' user and their file is downloaded from the url of the
        entry.

        :raises requests.HTTPError: if a download is answered with an error
            status. Nothing is committed in that case.
        '''
        user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'TesseractOCRModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            # An empty file is loaded as None
            defaults = yaml.safe_load(f) or []
        for m in defaults:
            model = TesseractOCRModel.query.filter_by(
                title=m['title'],
                version=m['version']
            ).first()
            if model is not None:
                model.compatible_service_versions = \
                    m['compatible_service_versions']
                model.description = m['description']
                model.publisher = m['publisher']
                model.publishing_year = m['publishing_year']
                model.title = m['title']
                model.version = m['version']
                continue
            model = TesseractOCRModel(
                compatible_service_versions=m['compatible_service_versions'],
                description=m['description'],
                publisher=m['publisher'],
                publishing_year=m['publishing_year'],
                shared=True,
                title=m['title'],
                user=user,
                version=m['version']
            )
            db.session.add(model)
            # Flush and refresh to get the id that is needed for the path
            db.session.flush(objects=[model])
            db.session.refresh(model)
            model.filename = f'{model.id}.traineddata'
            # The context manager releases the connection in any case
            with requests.get(m['url'], stream=True) as r:
                # Don't store an error page as model file
                r.raise_for_status()
                content_length = r.headers.get('Content-Length')
                pbar = tqdm(
                    desc=f'{model.title} ({model.filename})',
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    # The header is missing for e.g. chunked responses
                    total=None if content_length is None else int(content_length)  # noqa
                )
                pbar.clear()
                try:
                    with open(model.path, 'wb') as model_file:
                        for chunk in r.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                pbar.update(len(chunk))
                                model_file.write(chunk)
                finally:
                    pbar.close()
        db.session.commit()
2021-11-30 15:22:16 +00:00
class JobInput(FileMixin, HashidMixin, db.Model):
    '''
    Class to define an input file of a job.
    '''
    __tablename__ = 'job_inputs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Backrefs: job: Job

    def __repr__(self):
        return f'<JobInput {self.filename}>'

    @property
    def download_url(self):
        '''
        URL of the 'jobs.download_job_input' endpoint for this input.
        '''
        url_values = {'job_id': self.job.id, 'job_input_id': self.id}
        return url_for('jobs.download_job_input', **url_values)

    @property
    def jsonpatch_path(self):
        parent_path = self.job.jsonpatch_path
        return f'{parent_path}/inputs/{self.hashid}'

    @property
    def path(self):
        '''
        Path of the file inside the 'inputs' directory of the job.
        '''
        inputs_dir = os.path.join(self.job.path, 'inputs')
        return os.path.join(inputs_dir, str(self.id))

    @property
    def url(self):
        '''
        URL of the job page with an anchor that points to this input.
        '''
        anchor = f'job-{self.job.hashid}-input-{self.hashid}'
        return url_for('jobs.job', job_id=self.job_id, _anchor=anchor)

    @property
    def user_hashid(self):
        owner = self.job.user
        return owner.hashid

    @property
    def user_id(self):
        return self.job.user_id

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the job input.

        :param backrefs: if True, the job is included.
        :param relationships: accepted for a uniform signature, a job input
            has no relationships.
        '''
        dict_job_input = {
            'id': self.hashid,
            'job_id': self.job.hashid,
            'download_url': self.download_url,
            'url': self.url
        }
        dict_job_input.update(self.file_mixin_to_dict())
        if not backrefs:
            return dict_job_input
        dict_job_input['job'] = self.job.to_dict(
            backrefs=True, relationships=False)
        return dict_job_input
2019-10-16 14:52:05 +00:00
2021-11-30 15:22:16 +00:00
class JobResult(FileMixin, HashidMixin, db.Model):
    '''
    Class to define a result file of a job.
    '''
    __tablename__ = 'job_results'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Fields
    description = db.Column(db.String(255))
    # Backrefs: job: Job

    def __repr__(self):
        return f'<JobResult {self.filename}>'

    @property
    def download_url(self):
        '''
        URL of the 'jobs.download_job_result' endpoint for this result.
        '''
        url_values = {'job_id': self.job_id, 'job_result_id': self.id}
        return url_for('jobs.download_job_result', **url_values)

    @property
    def jsonpatch_path(self):
        parent_path = self.job.jsonpatch_path
        return f'{parent_path}/results/{self.hashid}'

    @property
    def path(self):
        '''
        Path of the file inside the 'results' directory of the job.
        '''
        results_dir = os.path.join(self.job.path, 'results')
        return os.path.join(results_dir, str(self.id))

    @property
    def url(self):
        '''
        URL of the job page with an anchor that points to this result.
        '''
        anchor = f'job-{self.job.hashid}-result-{self.hashid}'
        return url_for('jobs.job', job_id=self.job_id, _anchor=anchor)

    @property
    def user_hashid(self):
        owner = self.job.user
        return owner.hashid

    @property
    def user_id(self):
        return self.job.user_id

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the job result.

        :param backrefs: if True, the job is included.
        :param relationships: only passed on to file_mixin_to_dict, a job
            result has no relationships.
        '''
        dict_job_result = {
            'id': self.hashid,
            'job_id': self.job.hashid,
            'description': self.description,
            'download_url': self.download_url,
            'url': self.url
        }
        dict_job_result.update(
            self.file_mixin_to_dict(
                backrefs=backrefs, relationships=relationships)
        )
        if not backrefs:
            return dict_job_result
        dict_job_result['job'] = self.job.to_dict(
            backrefs=True, relationships=False)
        return dict_job_result
2019-10-17 11:26:20 +00:00
class JobStatus(IntEnum):
    '''
    States of a job.
    '''
    # Not yet picked up
    INITIALIZING = 1
    SUBMITTED = 2
    QUEUED = 3
    # In progress
    RUNNING = 4
    CANCELING = 5
    # Final states
    CANCELED = 6
    COMPLETED = 7
    FAILED = 8
2021-11-30 15:22:16 +00:00
class Job(HashidMixin, db.Model):
    '''
    Class to define Jobs.
    '''
    __tablename__ = 'jobs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    end_date = db.Column(db.DateTime())
    service = db.Column(db.String(64))
    service_args = db.Column(ContainerColumn(dict, 255))
    service_version = db.Column(db.String(16))
    status = db.Column(
        IntEnumColumn(JobStatus),
        default=JobStatus.INITIALIZING
    )
    title = db.Column(db.String(32))
    # Backrefs: user: User
    # Relationships
    inputs = db.relationship(
        'JobInput',
        backref='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    results = db.relationship(
        'JobResult',
        backref='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __repr__(self):
        return f'<Job {self.title}>'

    @property
    def jsonpatch_path(self):
        owner_path = self.user.jsonpatch_path
        return f'{owner_path}/jobs/{self.hashid}'

    @property
    def path(self):
        '''
        Directory of the job inside the 'jobs' directory of the user.
        '''
        jobs_dir = os.path.join(self.user.path, 'jobs')
        return os.path.join(jobs_dir, str(self.id))

    @property
    def url(self):
        return url_for('jobs.job', job_id=self.id)

    @property
    def user_hashid(self):
        return self.user.hashid

    def delete(self):
        '''
        Delete the job and its inputs and results from the database.

        A job that is neither completed nor failed is canceled first. This
        blocks and polls the database once per second until the status
        has become CANCELED.
        '''
        # NOTE(review): the loop has no timeout, it relies on another
        # process (the daemon) to set the status to CANCELED
        finished = (JobStatus.COMPLETED, JobStatus.FAILED)
        if self.status not in finished:
            self.status = JobStatus.CANCELING
            db.session.commit()
            while self.status != JobStatus.CANCELED:
                # In case the daemon handled a job in any way
                if self.status != JobStatus.CANCELING:
                    self.status = JobStatus.CANCELING
                    db.session.commit()
                sleep(1)
                db.session.refresh(self)
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def makedirs(self):
        '''
        Create the directory of the job and its subdirectories.
        '''
        job_dir = self.path
        os.mkdir(job_dir)
        os.mkdir(os.path.join(self.path, 'inputs'))
        os.mkdir(os.path.join(self.path, 'pipeline_data'))
        os.mkdir(os.path.join(self.path, 'results'))

    def restart(self):
        '''
        Restart a job - only if the status is complete or failed
        '''
        restartable = (JobStatus.COMPLETED, JobStatus.FAILED)
        if self.status not in restartable:
            raise Exception('Could not restart job: status is not "completed/failed"')  # noqa
        # NOTE(review): makedirs() creates 'pipeline_data' but 'pyflow.data'
        # is removed here - confirm which directory name is the current one
        for dirname in ('results', 'pyflow.data'):
            shutil.rmtree(os.path.join(self.path, dirname), ignore_errors=True)  # noqa
        for result in self.results:
            db.session.delete(result)
        self.end_date = None
        self.status = JobStatus.SUBMITTED

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the job.

        :param backrefs: if True, the owning user is included.
        :param relationships: if True, inputs and results are included as
            dicts keyed by hashid.
        '''
        if self.end_date is None:
            end_date = None
        else:
            end_date = f'{self.end_date.isoformat()}Z'
        dict_job = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'creation_date': self.creation_date.isoformat() + 'Z',
            'description': self.description,
            'end_date': end_date,
            'service': self.service,
            'service_args': self.service_args,
            'service_version': self.service_version,
            'status': self.status.name,
            'title': self.title,
            'url': self.url
        }
        if backrefs:
            dict_job['user'] = self.user.to_dict(
                backrefs=True, relationships=False)
        if relationships:
            for key, query in (('inputs', self.inputs), ('results', self.results)):  # noqa
                dict_job[key] = {
                    x.hashid: x.to_dict(backrefs=False, relationships=True)
                    for x in query
                }
        return dict_job
2019-08-05 14:45:38 +00:00
2021-11-30 15:22:16 +00:00
class CorpusFile(FileMixin, HashidMixin, db.Model):
    '''
    Class to define a file of a corpus together with its bibliographic
    metadata.
    '''
    __tablename__ = 'corpus_files'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
    # Fields
    address = db.Column(db.String(255))
    author = db.Column(db.String(255))
    booktitle = db.Column(db.String(255))
    chapter = db.Column(db.String(255))
    editor = db.Column(db.String(255))
    institution = db.Column(db.String(255))
    journal = db.Column(db.String(255))
    pages = db.Column(db.String(255))
    publisher = db.Column(db.String(255))
    publishing_year = db.Column(db.Integer)
    school = db.Column(db.String(255))
    title = db.Column(db.String(255))
    # Backrefs: corpus: Corpus

    @property
    def download_url(self):
        '''
        URL of the 'corpora.download_corpus_file' endpoint for this file.
        '''
        url_values = {
            'corpus_id': self.corpus_id,
            'corpus_file_id': self.id
        }
        return url_for('corpora.download_corpus_file', **url_values)

    @property
    def jsonpatch_path(self):
        parent_path = self.corpus.jsonpatch_path
        return f'{parent_path}/files/{self.hashid}'

    @property
    def path(self):
        '''
        Path of the file inside the 'files' directory of the corpus.
        '''
        files_dir = os.path.join(self.corpus.path, 'files')
        return os.path.join(files_dir, str(self.id))

    @property
    def url(self):
        '''
        URL of the 'corpora.corpus_file' endpoint for this file.
        '''
        url_values = {
            'corpus_id': self.corpus_id,
            'corpus_file_id': self.id
        }
        return url_for('corpora.corpus_file', **url_values)

    @property
    def user_hashid(self):
        owner = self.corpus.user
        return owner.hashid

    @property
    def user_id(self):
        return self.corpus.user_id

    def delete(self):
        '''
        Remove the file from disk, mark the corpus file for deletion in the
        session (not committed) and set the corpus back to UNPREPARED.

        A failing file removal is only logged.
        '''
        try:
            os.remove(self.path)
        except OSError:
            current_app.logger.error(
                f'Removing {self.path} led to an OSError!'
            )
        db.session.delete(self)
        self.corpus.status = CorpusStatus.UNPREPARED

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the corpus file.

        :param backrefs: if True, the corpus is included.
        :param relationships: only passed on to file_mixin_to_dict, a
            corpus file has no relationships.
        '''
        dict_corpus_file = {
            'id': self.hashid,
            'corpus_id': self.corpus.hashid,
            'download_url': self.download_url,
            'url': self.url
        }
        metadata_columns = (
            'address',
            'author',
            'booktitle',
            'chapter',
            'editor',
            'institution',
            'journal',
            'pages',
            'publisher',
            'publishing_year',
            'school',
            'title'
        )
        for column in metadata_columns:
            dict_corpus_file[column] = getattr(self, column)
        dict_corpus_file.update(
            self.file_mixin_to_dict(
                backrefs=backrefs, relationships=relationships)
        )
        if backrefs:
            dict_corpus_file['corpus'] = self.corpus.to_dict(
                backrefs=True, relationships=False)
        return dict_corpus_file
2021-11-30 15:22:16 +00:00
class CorpusStatus(IntEnum):
    '''
    States of a corpus.
    '''
    # Build related states
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    # Analysis session related states
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9
2021-11-30 15:22:16 +00:00
class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime(), default=datetime.utcnow)
    status = db.Column(
        IntEnumColumn(CorpusStatus),
        default=CorpusStatus.UNPREPARED
    )
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    # Backrefs: user: User
    # Relationships
    files = db.relationship(
        'CorpusFile',
        backref='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    # "static" attributes
    max_num_tokens = 2147483647  # 2 ** 31 - 1

    def __repr__(self):
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        return url_for('corpora.analyse_corpus', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self):
        '''
        Directory of the corpus inside the 'corpora' directory of the user.
        '''
        return os.path.join(self.user.path, 'corpora', str(self.id))

    @property
    def url(self):
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        return self.user.hashid

    def build(self):
        '''
        Merge all files of the corpus into cwb/corpus.vrt.

        Every file is normalized into cwb/<file id>.norm.vrt, its root
        element gets the metadata of the corpus file as attributes (missing
        text values are written as 'NULL') and is appended to a <corpus>
        element. On success the status is set to SUBMITTED. If a file can't
        be normalized, the status is set to FAILED and nothing is written.
        '''
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            normalized_vrt_path = os.path.join(
                self.path, 'cwb', f'{corpus_file.id}.norm.vrt')
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except Exception:
                # Don't swallow SystemExit/KeyboardInterrupt and keep the
                # reason of the failure in the log
                current_app.logger.exception(
                    f'Normalizing {corpus_file.path} failed')
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            text_element.set('address', corpus_file.address or 'NULL')
            # A None attribute value can't be serialized by ElementTree, so
            # author and title get the same fallback as the other values
            text_element.set('author', corpus_file.author or 'NULL')
            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
            text_element.set('chapter', corpus_file.chapter or 'NULL')
            text_element.set('editor', corpus_file.editor or 'NULL')
            text_element.set('institution', corpus_file.institution or 'NULL')
            text_element.set('journal', corpus_file.journal or 'NULL')
            text_element.set('pages', corpus_file.pages or 'NULL')
            text_element.set('publisher', corpus_file.publisher or 'NULL')
            text_element.set('publishing_year', str(corpus_file.publishing_year))  # noqa
            text_element.set('school', corpus_file.school or 'NULL')
            text_element.set('title', corpus_file.title or 'NULL')
            text_element.tail = '\n'
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            os.path.join(self.path, 'cwb', 'corpus.vrt'),
            encoding='utf-8'
        )
        self.last_edited_date = datetime.utcnow()
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        '''
        Remove the directory of the corpus and mark the corpus for deletion
        in the session (not committed).
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def makedirs(self):
        '''
        Create the directory of the corpus and its subdirectories.

        :raises OSError: if a directory can't be created, e.g. because it
            already exists.
        '''
        os.mkdir(self.path)
        os.mkdir(os.path.join(self.path, 'files'))
        os.mkdir(os.path.join(self.path, 'cwb'))
        os.mkdir(os.path.join(self.path, 'cwb', 'data'))
        os.mkdir(os.path.join(self.path, 'cwb', 'registry'))

    def to_dict(self, backrefs=False, relationships=False):
        '''
        Return a dict representation of the corpus.

        :param backrefs: if True, the owning user is included.
        :param relationships: if True, the files are included as a dict
            keyed by hashid.
        '''
        dict_corpus = {
            'id': self.hashid,
            'user_id': self.user.hashid,
            'analysis_url': self.analysis_url,
            'url': self.url,
            'creation_date': self.creation_date.isoformat() + 'Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'last_edited_date': self.last_edited_date.isoformat() + 'Z',
            'title': self.title
        }
        if backrefs:
            dict_corpus['user'] = self.user.to_dict(
                backrefs=True,
                relationships=False
            )
        if relationships:
            dict_corpus['files'] = {
                x.hashid: x.to_dict(backrefs=False, relationships=True)
                for x in self.files
            }
        return dict_corpus
2019-11-04 14:06:54 +00:00
2019-08-22 07:35:23 +00:00
@login.user_loader
def load_user(user_id):
    '''
    Load the user for the id that Flask-Login stored in the session.

    :returns: the user or None if the id is malformed or unknown. A user
        loader has to return None instead of raising for invalid ids.
    '''
    try:
        primary_key = int(user_id)
    except (TypeError, ValueError):
        return None
    return User.query.get(primary_key)