2021-09-14 12:52:23 +02:00
from datetime import datetime , timedelta
2022-06-28 12:30:02 +02:00
from enum import Enum , IntEnum
2022-10-12 15:10:55 +02:00
import re
2020-12-03 15:13:24 +01:00
from flask import current_app , url_for
2021-12-13 12:20:01 +01:00
from flask_hashids import HashidMixin
2021-09-15 12:31:53 +02:00
from flask_login import UserMixin
2020-07-09 15:07:43 +02:00
from time import sleep
2022-02-03 12:39:16 +01:00
from tqdm import tqdm
2019-07-05 14:47:35 +02:00
from werkzeug . security import generate_password_hash , check_password_hash
2022-09-02 13:07:30 +02:00
from werkzeug . utils import secure_filename
2022-02-03 12:39:16 +01:00
import json
2022-07-18 17:10:09 +02:00
import jwt
2020-07-09 15:07:43 +02:00
import os
2022-02-03 12:39:16 +01:00
import requests
2022-09-02 13:24:14 +02:00
import secrets
2020-07-09 15:07:43 +02:00
import shutil
2022-02-03 12:39:16 +01:00
import xml . etree . ElementTree as ET
import yaml
2022-09-02 13:07:30 +02:00
from app import db , hashids , login , mail , socketio
from app . converters . vrt import normalize_vrt_file
from app . email import create_message
2019-07-05 14:47:35 +02:00
2022-06-28 12:30:02 +02:00
##############################################################################
# enums #
##############################################################################
# region enums
class CorpusStatus(IntEnum):
    '''
    Integer-valued states a corpus can be in.
    '''
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    # States related to analysis sessions
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9
class JobStatus(IntEnum):
    '''
    Integer-valued states a job can be in.
    '''
    INITIALIZING = 1
    SUBMITTED = 2
    QUEUED = 3
    RUNNING = 4
    CANCELING = 5
    CANCELED = 6
    COMPLETED = 7
    FAILED = 8
class Permission(IntEnum):
    '''
    User permissions, each represented by a distinct power of two.

    Because every member occupies its own bit, several permissions can be
    combined into one integer and a single permission can be tested with
    the bitwise operator &.
    '''
    ADMINISTRATE = 1
    CONTRIBUTE = 2
    USE_API = 4
class UserSettingJobStatusMailNotificationLevel(IntEnum):
    '''
    Levels a user can choose for job status mail notifications.
    '''
    NONE = 1
    END = 2
    ALL = 3
# endregion enums
##############################################################################
# mixins #
##############################################################################
# region mixins
2021-11-30 16:22:16 +01:00
class FileMixin:
    '''
    Mixin for db.Model classes. All file related models should use this.

    It contributes the shared file columns, a JSON helper and a ``create``
    factory. ``create`` reads ``obj.path``, so classes using this mixin
    have to provide a ``path`` attribute.
    '''
    creation_date = db.Column(db.DateTime, default=datetime.utcnow)
    filename = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime)
    mimetype = db.Column(db.String(255))

    def file_mixin_to_json(self, backrefs=False, relationships=False):
        '''
        Return the file related columns as a JSON serializable dict.

        Dates are rendered in ISO 8601 format with a trailing 'Z';
        ``last_edited_date`` is None when it has not been set. The
        ``backrefs`` and ``relationships`` arguments are accepted but not
        used here.
        '''
        last_edited = self.last_edited_date
        if last_edited is not None:
            last_edited = f'{last_edited.isoformat()}Z'
        return {
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'filename': self.filename,
            'last_edited_date': last_edited,
            'mimetype': self.mimetype
        }

    @classmethod
    def create(cls, file_storage, **kwargs):
        '''
        Create a database object for ``file_storage`` and save the file.

        ``filename`` and ``mimetype`` are taken from ``kwargs`` if given,
        otherwise from ``file_storage``. The filename is sanitized with
        ``secure_filename``. All remaining ``kwargs`` are passed to the
        model constructor. If saving the file fails with an AttributeError
        or OSError, the error is logged, the session is rolled back and
        the error is raised again.
        '''
        raw_filename = kwargs.pop('filename', file_storage.filename)
        file_mimetype = kwargs.pop('mimetype', file_storage.mimetype)
        instance = cls(
            filename=secure_filename(raw_filename),
            mimetype=file_mimetype,
            **kwargs
        )
        db.session.add(instance)
        # Flush and refresh before the path is read for saving the file
        db.session.flush(objects=[instance])
        db.session.refresh(instance)
        try:
            file_storage.save(instance.path)
        except (AttributeError, OSError) as error:
            current_app.logger.error(error)
            db.session.rollback()
            raise
        return instance
2022-06-28 12:30:02 +02:00
# endregion mixins
2021-11-30 16:22:16 +01:00
2022-06-28 12:30:02 +02:00
##############################################################################
# type_decorators #
##############################################################################
# region type_decorators
class IntEnumColumn(db.TypeDecorator):
    '''
    Column type that persists members of an enum type as integers.

    Values are written as integers (db.Integer) and converted back into
    members of ``enum_type`` when they are read.
    '''
    impl = db.Integer

    def __init__(self, enum_type, *args, **kwargs):
        '''
        :param enum_type: enum class whose members are stored; remaining
            arguments are forwarded to db.TypeDecorator.
        '''
        super().__init__(*args, **kwargs)
        self.enum_type = enum_type

    def process_bind_param(self, value, dialect):
        '''
        Convert ``value`` into the integer that is sent to the database.

        Accepts a member of ``enum_type`` or a plain integer that maps to
        one of its members. None is passed through unchanged.

        :raises TypeError: if ``value`` is neither None, a member of
            ``enum_type`` nor an integer.
        :raises ValueError: if an integer does not map to a member of
            ``enum_type``.
        '''
        if value is None:
            return None
        if isinstance(value, self.enum_type) and isinstance(value.value, int):
            return value.value
        if isinstance(value, int):
            # Round trip through the enum to reject unknown integers
            return self.enum_type(value).value
        # The exception has to be raised, returning it would bind the
        # exception object itself as the parameter value.
        raise TypeError(
            f'Expected {self.enum_type.__name__} or int, '
            f'got {type(value).__name__}'
        )

    def process_result_value(self, value, dialect):
        '''
        Convert an integer read from the database into a member of
        ``enum_type``. None is passed through unchanged.
        '''
        if value is None:
            return None
        return self.enum_type(value)
class ContainerColumn(db.TypeDecorator):
    '''
    Column type that persists a container as a JSON encoded string.

    Values are written as strings (db.String) using json.dumps and
    decoded with json.loads when they are read.
    '''
    impl = db.String

    def __init__(self, container_type, *args, **kwargs):
        '''
        :param container_type: type the stored values must be instances
            of (the models in this file use list and dict); remaining
            arguments are forwarded to db.TypeDecorator.
        '''
        super().__init__(*args, **kwargs)
        self.container_type = container_type

    def process_bind_param(self, value, dialect):
        '''
        Convert ``value`` into the JSON string that is sent to the
        database.

        Accepts an instance of ``container_type`` or a string holding the
        JSON encoding of one. None is passed through unchanged.

        :raises TypeError: if ``value`` is none of the accepted kinds.
        :raises ValueError: if a string value is not valid JSON (raised by
            json.loads).
        '''
        if value is None:
            return None
        if isinstance(value, self.container_type):
            return json.dumps(value)
        if (isinstance(value, str)
                and isinstance(json.loads(value), self.container_type)):
            return value
        # The exception has to be raised, returning it would bind the
        # exception object itself as the parameter value.
        raise TypeError(
            f'Expected {self.container_type.__name__} or its JSON '
            f'encoding, got {type(value).__name__}'
        )

    def process_result_value(self, value, dialect):
        '''
        Decode the JSON string read from the database. None is passed
        through unchanged.
        '''
        if value is None:
            return None
        return json.loads(value)
# endregion type_decorators
##############################################################################
# Models #
##############################################################################
# region models
2021-11-30 16:22:16 +01:00
class Role(HashidMixin, db.Model):
    '''
    Role of a user. The permissions of a role are stored as a bitmask in
    the ``permissions`` column.
    '''
    __tablename__ = 'roles'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Fields
    name = db.Column(db.String(64), unique=True)
    default = db.Column(db.Boolean, default=False, index=True)
    permissions = db.Column(db.Integer, default=0)
    # Relationships
    users = db.relationship('User', backref='role', lazy='dynamic')

    def __repr__(self):
        return f'<Role {self.name}>'

    def add_permission(self, permission):
        '''
        Set the bit(s) of ``permission`` in the permissions bitmask.
        '''
        # Bitwise OR instead of addition: adding a mask that is partially
        # set already would carry into unrelated permission bits.
        if not self.has_permission(permission):
            self.permissions |= permission

    def has_permission(self, permission):
        '''
        Return True if all bits of ``permission`` are set.
        '''
        return self.permissions & permission == permission

    def remove_permission(self, permission):
        '''
        Clear the bit(s) of ``permission`` if all of them are set.
        '''
        if self.has_permission(permission):
            self.permissions &= ~permission

    def reset_permissions(self):
        '''
        Remove all permissions.
        '''
        self.permissions = 0

    def to_json(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this role.

        :param backrefs: accepted but not used by this model.
        :param relationships: if True, include the users of this role
            keyed by their hashid.
        '''
        _json = {
            'id': self.hashid,
            'default': self.default,
            'name': self.name,
            'permissions': self.permissions
        }
        if relationships:
            _json['users'] = {
                x.hashid: x.to_json(relationships=True)
                for x in self.users
            }
        return _json

    @staticmethod
    def insert_defaults():
        '''
        Create or update the default roles and commit them.

        The permissions of every listed role are reset and assigned
        again, so roles that exist already are brought in line with the
        definitions below. The role named 'User' is marked as default.
        '''
        roles = {
            'User': [],
            'API user': [Permission.USE_API],
            'Contributor': [Permission.CONTRIBUTE],
            'Administrator': [
                Permission.ADMINISTRATE,
                Permission.CONTRIBUTE,
                Permission.USE_API
            ],
            'System user': []
        }
        default_role_name = 'User'
        for role_name, permissions in roles.items():
            role = Role.query.filter_by(name=role_name).first()
            if role is None:
                role = Role(name=role_name)
            role.reset_permissions()
            for permission in permissions:
                role.add_permission(permission)
            role.default = role.name == default_role_name
            db.session.add(role)
        db.session.commit()
2019-07-05 14:47:35 +02:00
2022-09-02 13:24:14 +02:00
class Token(db.Model):
    '''
    Pair of access and refresh token of a user, each with its own
    expiration date.
    '''
    __tablename__ = 'tokens'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    access_token = db.Column(db.String(64), index=True)
    access_expiration = db.Column(db.DateTime)
    refresh_token = db.Column(db.String(64), index=True)
    refresh_expiration = db.Column(db.DateTime)
    # Backrefs: user: User

    def expire(self):
        '''
        Expire both tokens by setting their expiration dates to the
        current UTC time.
        '''
        self.access_expiration = datetime.utcnow()
        self.refresh_expiration = datetime.utcnow()

    @staticmethod
    def clean():
        '''
        Delete all tokens whose refresh token expired more than one day
        ago. The session is not committed here.
        '''
        cutoff = datetime.utcnow() - timedelta(days=1)
        expired_tokens = Token.query.filter(Token.refresh_expiration < cutoff)
        expired_tokens.delete()
2021-11-30 16:22:16 +01:00
class User(HashidMixin, UserMixin, db.Model):
    '''
    User account. Every user owns a data directory (see ``path``) that
    holds the files of its corpora, jobs and pipeline models.
    '''
    __tablename__ = 'users'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    role_id = db.Column(db.Integer, db.ForeignKey('roles.id'))
    # Fields
    email = db.Column(db.String(254), index=True, unique=True)
    username = db.Column(db.String(64), index=True, unique=True)
    password_hash = db.Column(db.String(128))
    confirmed = db.Column(db.Boolean, default=False)
    member_since = db.Column(db.DateTime(), default=datetime.utcnow)
    setting_dark_mode = db.Column(db.Boolean, default=False)
    setting_job_status_mail_notification_level = db.Column(
        IntEnumColumn(UserSettingJobStatusMailNotificationLevel),
        default=UserSettingJobStatusMailNotificationLevel.END
    )
    last_seen = db.Column(db.DateTime())
    # Backrefs: role: Role
    # Relationships
    tesseract_ocr_pipeline_models = db.relationship(
        'TesseractOCRPipelineModel',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    spacy_nlp_pipeline_models = db.relationship(
        'SpaCyNLPPipelineModel',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    corpora = db.relationship(
        'Corpus',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    jobs = db.relationship(
        'Job',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    tokens = db.relationship(
        'Token',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __init__(self, **kwargs):
        '''
        Create a user and assign a role if none was given.

        The user whose email equals the NOPAQUE_ADMIN config value gets
        the 'Administrator' role, every other user the default role.
        '''
        super().__init__(**kwargs)
        if self.role is not None:
            return
        if self.email == current_app.config['NOPAQUE_ADMIN']:
            self.role = Role.query.filter_by(name='Administrator').first()
        else:
            self.role = Role.query.filter_by(default=True).first()

    def __repr__(self):
        return f'<User {self.username}>'

    @property
    def jsonpatch_path(self):
        return f'/users/{self.hashid}'

    @property
    def password(self):
        raise AttributeError('password is not a readable attribute')

    @password.setter
    def password(self, password):
        # Only the hash of the password is stored
        self.password_hash = generate_password_hash(password)

    @property
    def path(self):
        '''
        Data directory of this user below NOPAQUE_DATA_DIR.
        '''
        return os.path.join(
            current_app.config.get('NOPAQUE_DATA_DIR'), 'users', str(self.id))

    @staticmethod
    def create(**kwargs):
        '''
        Create a user together with its directory structure.

        The user is added to the session and flushed so that its id (and
        thereby its path) is available. If creating a directory fails,
        the error is logged, the session is rolled back and the OSError
        is raised again. The session is not committed here.
        '''
        user = User(**kwargs)
        db.session.add(user)
        db.session.flush(objects=[user])
        db.session.refresh(user)
        try:
            os.mkdir(user.path)
            os.mkdir(os.path.join(user.path, 'spacy_nlp_pipeline_models'))
            os.mkdir(os.path.join(user.path, 'tesseract_ocr_pipeline_models'))
            os.mkdir(os.path.join(user.path, 'corpora'))
            os.mkdir(os.path.join(user.path, 'jobs'))
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        return user

    @staticmethod
    def insert_defaults():
        '''
        Make sure the 'nopaque' user exists and has the 'System user'
        role, then commit.
        '''
        nopaque_user = User.query.filter_by(username='nopaque').first()
        system_user_role = Role.query.filter_by(name='System user').first()
        if nopaque_user is None:
            nopaque_user = User.create(
                username='nopaque',
                role=system_user_role
            )
            db.session.add(nopaque_user)
        elif nopaque_user.role != system_user_role:
            nopaque_user.role = system_user_role
        db.session.commit()

    @staticmethod
    def reset_password(token, new_password):
        '''
        Set ``new_password`` for the user a reset password token was
        issued for.

        Returns False if the token cannot be decoded, was issued for
        another purpose or does not belong to an existing user, otherwise
        True. The session is not committed here.
        '''
        try:
            payload = jwt.decode(
                token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'User.reset_password':
            return False
        user_hashid = payload.get('sub')
        user_id = hashids.decode(user_hashid)
        user = User.query.get(user_id)
        if user is None:
            return False
        user.password = new_password
        db.session.add(user)
        return True

    @staticmethod
    def verify_access_token(access_token, refresh_token=None):
        '''
        Return the user an unexpired access token belongs to.

        The user's ``last_seen`` date is updated and committed. Returns
        None if the token is unknown or expired or if the user has the
        'System user' role. ``refresh_token`` is accepted but not used.
        '''
        token = Token.query.filter(Token.access_token == access_token).first()
        if token is not None:
            if token.access_expiration > datetime.utcnow():
                token.user.ping()
                db.session.commit()
                if token.user.role.name != 'System user':
                    return token.user
        return None

    @staticmethod
    def verify_refresh_token(refresh_token, access_token):
        '''
        Return the token matching both ``refresh_token`` and
        ``access_token`` if its refresh token is not expired.

        Returns None if no such token exists or if it is expired; in the
        latter case all tokens of the user are revoked.
        '''
        token = Token.query.filter(
            (Token.refresh_token == refresh_token)
            & (Token.access_token == access_token)
        ).first()
        if token is not None:
            if token.refresh_expiration > datetime.utcnow():
                return token
            # someone tried to refresh with an expired token
            # revoke all tokens from this user as a precaution
            token.user.revoke_auth_tokens()
            db.session.commit()
        return None

    def can(self, permission):
        return self.role.has_permission(permission)

    def confirm(self, confirmation_token):
        '''
        Confirm this user with a token from ``generate_confirm_token``.

        Returns False if the token cannot be decoded, was issued for
        another purpose or for another user, otherwise True. The session
        is not committed here.
        '''
        try:
            payload = jwt.decode(
                confirmation_token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'sub']}
            )
            # NOTE(review): logging the payload at warning level looks
            # like a debugging leftover - confirm whether it is needed.
            current_app.logger.warning(payload)
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'user.confirm':
            return False
        if payload.get('sub') != self.hashid:
            return False
        self.confirmed = True
        db.session.add(self)
        return True

    def delete(self):
        '''
        Remove the user's directory (errors are ignored) and mark the
        user for deletion in the session.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def generate_auth_token(self):
        '''
        Return a new Token for this user. The access token expires after
        15 minutes, the refresh token after 7 days.
        '''
        return Token(
            access_token=secrets.token_urlsafe(),
            access_expiration=datetime.utcnow() + timedelta(minutes=15),
            refresh_token=secrets.token_urlsafe(),
            refresh_expiration=datetime.utcnow() + timedelta(days=7),
            user=self
        )

    def generate_confirm_token(self, expiration=3600):
        '''
        Return a signed JWT to confirm this user.

        :param expiration: lifetime of the token in seconds.
        '''
        now = datetime.utcnow()
        payload = {
            'exp': now + timedelta(seconds=expiration),
            'iat': now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'user.confirm',
            'sub': self.hashid
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def generate_reset_password_token(self, expiration=3600):
        '''
        Return a signed JWT to reset the password of this user.

        :param expiration: lifetime of the token in seconds.
        '''
        now = datetime.utcnow()
        payload = {
            'exp': now + timedelta(seconds=expiration),
            'iat': now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'User.reset_password',
            'sub': self.hashid
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def is_administrator(self):
        return self.can(Permission.ADMINISTRATE)

    def ping(self):
        '''
        Set ``last_seen`` to the current UTC time.
        '''
        self.last_seen = datetime.utcnow()

    def revoke_auth_tokens(self):
        '''
        Mark all tokens of this user for deletion in the session.
        '''
        for token in self.tokens:
            db.session.delete(token)

    def verify_password(self, password):
        '''
        Check ``password`` against the stored hash. Always False for
        users with the 'System user' role.
        '''
        if self.role.name == 'System user':
            return False
        return check_password_hash(self.password_hash, password)

    def to_json(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this user.

        :param backrefs: if True, include the role of the user.
        :param relationships: if True, include corpora, jobs and pipeline
            models of the user, each keyed by hashid.
        '''
        _json = {
            'id': self.hashid,
            'confirmed': self.confirmed,
            'email': self.email,
            'last_seen': (
                None if self.last_seen is None
                else f'{self.last_seen.isoformat()}Z'
            ),
            'member_since': f'{self.member_since.isoformat()}Z',
            'username': self.username,
            'settings': {
                'dark_mode': self.setting_dark_mode,
                'job_status_mail_notification_level':
                    self.setting_job_status_mail_notification_level.name
            }
        }
        if backrefs:
            _json['role'] = self.role.to_json(backrefs=True)
        if relationships:
            _json['corpora'] = {
                x.hashid: x.to_json(relationships=True)
                for x in self.corpora
            }
            _json['jobs'] = {
                x.hashid: x.to_json(relationships=True)
                for x in self.jobs
            }
            _json['tesseract_ocr_pipeline_models'] = {
                x.hashid: x.to_json(relationships=True)
                for x in self.tesseract_ocr_pipeline_models
            }
            # Serialized like the Tesseract models above, so that every
            # pipeline model relationship of the user is represented.
            _json['spacy_nlp_pipeline_models'] = {
                x.hashid: x.to_json(relationships=True)
                for x in self.spacy_nlp_pipeline_models
            }
        return _json
2021-11-30 16:22:16 +01:00
2022-10-12 10:23:05 +02:00
class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
    '''
    Model file for the Tesseract OCR pipeline, owned by a user.
    '''
    __tablename__ = 'tesseract_ocr_pipeline_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    title = db.Column(db.String(64))
    description = db.Column(db.String(255))
    version = db.Column(db.String(16))
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    shared = db.Column(db.Boolean, default=False)
    # Backrefs: user: User

    @property
    def path(self):
        '''
        Location of the model file inside the owning user's directory.
        '''
        return os.path.join(
            self.user.path,
            'tesseract_ocr_pipeline_models',
            str(self.id)
        )

    @staticmethod
    def insert_defaults():
        '''
        Insert or update the default models and commit.

        The defaults are read from TesseractOCRPipelineModel.defaults.yml
        next to this file. Models that exist already (same title and
        version) only get their metadata updated. New models are assigned
        to the 'nopaque' user and their file is downloaded from the url
        given in the defaults file.

        :raises requests.HTTPError: if a download is answered with an
            error status.
        '''
        nopaque_user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'TesseractOCRPipelineModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            defaults = yaml.safe_load(f)
        for m in defaults:
            model = TesseractOCRPipelineModel.query.filter_by(title=m['title'], version=m['version']).first()  # noqa
            if model is not None:
                model.compatible_service_versions = m['compatible_service_versions']
                model.description = m['description']
                model.publisher = m['publisher']
                model.publisher_url = m['publisher_url']
                model.publishing_url = m['publishing_url']
                model.publishing_year = m['publishing_year']
                model.shared = True
                model.title = m['title']
                model.version = m['version']
                continue
            model = TesseractOCRPipelineModel(
                compatible_service_versions=m['compatible_service_versions'],
                description=m['description'],
                publisher=m['publisher'],
                publisher_url=m['publisher_url'],
                publishing_url=m['publishing_url'],
                publishing_year=m['publishing_year'],
                shared=True,
                title=m['title'],
                user=nopaque_user,
                version=m['version']
            )
            db.session.add(model)
            # Flush and refresh to obtain the id used for filename and path
            db.session.flush(objects=[model])
            db.session.refresh(model)
            model.filename = f'{model.id}.traineddata'
            r = requests.get(m['url'], stream=True)
            # Do not store the body of an error response as model file
            r.raise_for_status()
            # The Content-Length header may be missing, the progress bar
            # then runs without a known total.
            content_length = r.headers.get('Content-Length')
            pbar = tqdm(
                desc=f'{model.title} ({model.filename})',
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                total=None if content_length is None else int(content_length)
            )
            pbar.clear()
            with open(model.path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        pbar.update(len(chunk))
                        f.write(chunk)
            pbar.close()
        db.session.commit()

    def to_json(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this model.

        :param backrefs: if True, include the owning user.
        :param relationships: accepted but not used by this model.
        '''
        _json = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'shared': self.shared,
            'title': self.title,
            'version': self.version,
            **self.file_mixin_to_json()
        }
        if backrefs:
            _json['user'] = self.user.to_json(backrefs=True)
        return _json

    @staticmethod
    def create(model_file, **kwargs):
        '''
        Create a model for an uploaded file and save the file.

        ``model_file`` has to provide ``filename``, ``mimetype`` and
        ``save()``. ``filename`` and ``mimetype`` are taken from
        ``kwargs`` if given, otherwise from ``model_file``. All remaining
        ``kwargs`` are passed to the model constructor. If saving fails
        with an OSError, the error is logged, the session is rolled back
        and the error is raised again. The session is not committed here.

        TODO(review): confirm what callers pass as ``model_file`` - the
        uploaded file itself or a whole form.
        '''
        # pop() instead of get(): the values are passed explicitly below
        # and must not be contained in **kwargs a second time.
        filename = kwargs.pop('filename', model_file.filename)
        mimetype = kwargs.pop('mimetype', model_file.mimetype)
        tesseract_ocr_model = TesseractOCRPipelineModel(
            filename=secure_filename(filename),
            mimetype=mimetype,
            **kwargs
        )
        db.session.add(tesseract_ocr_model)
        db.session.flush(objects=[tesseract_ocr_model])
        db.session.refresh(tesseract_ocr_model)
        try:
            model_file.save(tesseract_ocr_model.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        return tesseract_ocr_model
2022-10-13 15:05:54 +02:00
class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
    '''
    Model file for the spaCy NLP pipeline, owned by a user.
    '''
    __tablename__ = 'spacy_nlp_pipeline_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    title = db.Column(db.String(64))
    description = db.Column(db.String(255))
    version = db.Column(db.String(16))
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    shared = db.Column(db.Boolean, default=False)
    # Backrefs: user: User

    @property
    def path(self):
        '''
        Location of the model file inside the owning user's directory.
        '''
        return os.path.join(
            self.user.path,
            'spacy_nlp_pipeline_models',
            str(self.id)
        )

    @staticmethod
    def insert_defaults():
        '''
        Insert or update the default models and commit.

        The defaults are read from SpaCyNLPPipelineModel.defaults.yml
        next to this file. Models that exist already (same title and
        version) only get their metadata updated. New models are assigned
        to the 'nopaque' user and their file is downloaded from the url
        given in the defaults file.

        :raises requests.HTTPError: if a download is answered with an
            error status.
        '''
        nopaque_user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'SpaCyNLPPipelineModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            defaults = yaml.safe_load(f)
        for m in defaults:
            model = SpaCyNLPPipelineModel.query.filter_by(title=m['title'], version=m['version']).first()  # noqa
            if model is not None:
                model.compatible_service_versions = m['compatible_service_versions']
                model.description = m['description']
                model.publisher = m['publisher']
                model.publisher_url = m['publisher_url']
                model.publishing_url = m['publishing_url']
                model.publishing_year = m['publishing_year']
                model.shared = True
                model.title = m['title']
                model.version = m['version']
                continue
            model = SpaCyNLPPipelineModel(
                compatible_service_versions=m['compatible_service_versions'],
                description=m['description'],
                publisher=m['publisher'],
                publisher_url=m['publisher_url'],
                publishing_url=m['publishing_url'],
                publishing_year=m['publishing_year'],
                shared=True,
                title=m['title'],
                user=nopaque_user,
                version=m['version']
            )
            db.session.add(model)
            # Flush and refresh to obtain the id used for filename and path
            db.session.flush(objects=[model])
            db.session.refresh(model)
            # NOTE(review): the '.traineddata' suffix mirrors the
            # Tesseract model code - confirm it is intended for spaCy
            # model files.
            model.filename = f'{model.id}.traineddata'
            r = requests.get(m['url'], stream=True)
            # Do not store the body of an error response as model file
            r.raise_for_status()
            # The Content-Length header may be missing, the progress bar
            # then runs without a known total.
            content_length = r.headers.get('Content-Length')
            pbar = tqdm(
                desc=f'{model.title} ({model.filename})',
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                total=None if content_length is None else int(content_length)
            )
            pbar.clear()
            with open(model.path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        pbar.update(len(chunk))
                        f.write(chunk)
            pbar.close()
        db.session.commit()

    def to_json(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this model.

        :param backrefs: if True, include the owning user.
        :param relationships: accepted but not used by this model.
        '''
        _json = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'shared': self.shared,
            'title': self.title,
            'version': self.version,
            **self.file_mixin_to_json()
        }
        if backrefs:
            _json['user'] = self.user.to_json(backrefs=True)
        return _json
2022-02-03 12:39:16 +01:00
2021-11-30 16:22:16 +01:00
class JobInput(FileMixin, HashidMixin, db.Model):
    '''
    Input file of a job.
    '''
    __tablename__ = 'job_inputs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Backrefs: job: Job

    def __repr__(self):
        return f'<JobInput {self.filename}>'

    @property
    def content_url(self):
        '''
        URL of the 'jobs.download_job_input' endpoint for this input.
        '''
        endpoint = 'jobs.download_job_input'
        return url_for(endpoint, job_id=self.job.id, job_input_id=self.id)

    @property
    def jsonpatch_path(self):
        '''
        JSON patch path of this input below the path of its job.
        '''
        return f'{self.job.jsonpatch_path}/inputs/{self.hashid}'

    @property
    def path(self):
        '''
        Location of the file inside the 'inputs' directory of its job.
        '''
        return os.path.join(self.job.path, 'inputs', str(self.id))

    @property
    def url(self):
        '''
        URL of the job page with an anchor pointing at this input.
        '''
        anchor = f'job-{self.job.hashid}-input-{self.hashid}'
        return url_for('jobs.job', job_id=self.job_id, _anchor=anchor)

    @property
    def user_hashid(self):
        return self.job.user.hashid

    @property
    def user_id(self):
        return self.job.user_id

    def to_json(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this input.

        :param backrefs: if True, include the job this input belongs to.
        :param relationships: accepted but not used by this model.
        '''
        _json = {'id': self.hashid}
        _json.update(self.file_mixin_to_json())
        if backrefs:
            _json['job'] = self.job.to_json(backrefs=True)
        return _json
2019-10-16 16:52:05 +02:00
2021-11-30 16:22:16 +01:00
class JobResult(FileMixin, HashidMixin, db.Model):
    '''
    Result file of a job.
    '''
    __tablename__ = 'job_results'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Fields
    description = db.Column(db.String(255))
    # Backrefs: job: Job

    def __repr__(self):
        return f'<JobResult {self.filename}>'

    @property
    def download_url(self):
        '''
        URL of the 'jobs.download_job_result' endpoint for this result.
        '''
        endpoint = 'jobs.download_job_result'
        return url_for(endpoint, job_id=self.job_id, job_result_id=self.id)

    @property
    def jsonpatch_path(self):
        '''
        JSON patch path of this result below the path of its job.
        '''
        return f'{self.job.jsonpatch_path}/results/{self.hashid}'

    @property
    def path(self):
        '''
        Location of the file inside the 'results' directory of its job.
        '''
        return os.path.join(self.job.path, 'results', str(self.id))

    @property
    def url(self):
        '''
        URL of the job page with an anchor pointing at this result.
        '''
        anchor = f'job-{self.job.hashid}-result-{self.hashid}'
        return url_for('jobs.job', job_id=self.job_id, _anchor=anchor)

    @property
    def user_hashid(self):
        return self.job.user.hashid

    @property
    def user_id(self):
        return self.job.user_id

    def to_json(self, backrefs=False, relationships=False):
        '''
        Return a JSON serializable dict of this result.

        :param backrefs: if True, include the job this result belongs to.
        :param relationships: only forwarded to ``file_mixin_to_json``.
        '''
        file_json = self.file_mixin_to_json(
            backrefs=backrefs,
            relationships=relationships
        )
        _json = {'id': self.hashid, 'description': self.description}
        _json.update(file_json)
        if backrefs:
            _json['job'] = self.job.to_json(backrefs=True)
        return _json
2019-10-17 13:26:20 +02:00
2021-11-30 16:22:16 +01:00
class Job(HashidMixin, db.Model):
    '''
    Class to define Jobs.
    '''
    __tablename__ = 'jobs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    end_date = db.Column(db.DateTime())
    service = db.Column(db.String(64))
    service_args = db.Column(ContainerColumn(dict, 255))
    service_version = db.Column(db.String(16))
    status = db.Column(
        IntEnumColumn(JobStatus),
        default=JobStatus.INITIALIZING
    )
    title = db.Column(db.String(32))
    # Backrefs: user: User
    # Relationships
    inputs = db.relationship(
        'JobInput',
        backref='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    results = db.relationship(
        'JobResult',
        backref='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __repr__(self):
        return f'<Job {self.title}>'

    @property
    def jsonpatch_path(self):
        ''' Owning user's JSON Patch path plus "/jobs/<hashid>". '''
        return f'{self.user.jsonpatch_path}/jobs/{self.hashid}'

    @property
    def path(self):
        ''' Filesystem path of the job: "<user path>/jobs/<id>". '''
        return os.path.join(self.user.path, 'jobs', str(self.id))

    @property
    def url(self):
        ''' URL of the job page. '''
        return url_for('jobs.job', job_id=self.id)

    @property
    def user_hashid(self):
        ''' Hashid of the user that owns this job. '''
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        '''
        Create a job, flush it to obtain its id and create its directory
        together with the "inputs", "pipeline_data" and "results"
        subdirectories.

        :param kwargs: passed unchanged to the Job constructor
        :return: the new job (added to the session and flushed; this
            method does not commit)
        :raises OSError: if a directory can not be created; the error is
            logged and the session is rolled back before re-raising
        '''
        job = Job(**kwargs)
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            os.mkdir(job.path)
            os.mkdir(os.path.join(job.path, 'inputs'))
            os.mkdir(os.path.join(job.path, 'pipeline_data'))
            os.mkdir(os.path.join(job.path, 'results'))
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            # Bare raise keeps the original traceback untouched
            raise
        return job

    def delete(self):
        '''
        Delete the job and its inputs and results from the database.

        Unless the job is already canceled, completed or failed, its
        status is set to CANCELING and this method blocks (polling once
        per second) until the status read back from the database is
        CANCELED. Afterwards the job directory is removed and the job is
        marked for deletion in the session (no commit is issued for the
        deletion here).

        :raises OSError: if the job directory can not be removed; the
            error is logged and the session is rolled back first
        '''
        # A job that is already CANCELED needs no further cancellation.
        # Requesting one again would only block until something else
        # resets the status.
        settled = (
            JobStatus.CANCELED,
            JobStatus.COMPLETED,
            JobStatus.FAILED
        )
        if self.status not in settled:
            self.status = JobStatus.CANCELING
            db.session.commit()
            while self.status != JobStatus.CANCELED:
                # In case the daemon handled a job in any way
                if self.status != JobStatus.CANCELING:
                    self.status = JobStatus.CANCELING
                    db.session.commit()
                sleep(1)
                db.session.refresh(self)
        try:
            shutil.rmtree(self.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise
        db.session.delete(self)

    def restart(self):
        '''
        Restart a job - only if the status is failed.

        Removes the "results" and "pyflow.data" directories (errors are
        ignored), deletes all result rows, clears end_date and sets the
        status to SUBMITTED.

        :raises Exception: if the job status is not FAILED
        '''
        if self.status != JobStatus.FAILED:
            raise Exception('Job status is not "failed"')
        shutil.rmtree(
            os.path.join(self.path, 'results'),
            ignore_errors=True
        )
        shutil.rmtree(
            os.path.join(self.path, 'pyflow.data'),
            ignore_errors=True
        )
        for result in self.results:
            db.session.delete(result)
        self.end_date = None
        self.status = JobStatus.SUBMITTED

    def to_json(self, backrefs=False, relationships=False):
        '''
        Serialize the job into a JSON compatible dict.

        :param backrefs: if truthy, add the serialized user as "user"
        :param relationships: if truthy, add "inputs" and "results" as
            dicts mapping hashids to serialized objects
        :return: dict representation of the job
        '''
        _json = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'end_date': (
                None if self.end_date is None
                else f'{self.end_date.isoformat()}Z'
            ),
            'service': self.service,
            'service_args': self.service_args,
            'service_version': self.service_version,
            'status': self.status.name,
            'title': self.title,
            'url': self.url
        }
        if backrefs:
            _json['user'] = self.user.to_json(backrefs=True)
        if relationships:
            _json['inputs'] = {
                x.hashid: x.to_json(relationships=True)
                for x in self.inputs
            }
            _json['results'] = {
                x.hashid: x.to_json(relationships=True)
                for x in self.results
            }
        return _json
2019-08-16 09:49:27 +02:00
2019-08-05 16:45:38 +02:00
2021-11-30 16:22:16 +01:00
class CorpusFile(FileMixin, HashidMixin, db.Model):
    '''
    Model stored in the "corpus_files" table. Every row references a
    corpus through corpus_id and carries bibliographic metadata.
    '''
    __tablename__ = 'corpus_files'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
    # Fields
    author = db.Column(db.String(255))
    publishing_year = db.Column(db.Integer)
    title = db.Column(db.String(255))
    address = db.Column(db.String(255))
    booktitle = db.Column(db.String(255))
    chapter = db.Column(db.String(255))
    editor = db.Column(db.String(255))
    institution = db.Column(db.String(255))
    journal = db.Column(db.String(255))
    pages = db.Column(db.String(255))
    publisher = db.Column(db.String(255))
    school = db.Column(db.String(255))
    # Backrefs: corpus: Corpus

    @property
    def download_url(self):
        ''' URL of the download endpoint for this corpus file. '''
        route_args = {
            'corpus_id': self.corpus_id,
            'corpus_file_id': self.id
        }
        return url_for('corpora.download_corpus_file', **route_args)

    @property
    def jsonpatch_path(self):
        ''' Parent corpus' JSON Patch path plus "/files/<hashid>". '''
        parent_path = self.corpus.jsonpatch_path
        return f'{parent_path}/files/{self.hashid}'

    @property
    def path(self):
        ''' Filesystem path of the file: "<corpus path>/files/<id>". '''
        files_dir = os.path.join(self.corpus.path, 'files')
        return os.path.join(files_dir, str(self.id))

    @property
    def url(self):
        ''' URL of the page of this corpus file. '''
        route_args = {
            'corpus_id': self.corpus_id,
            'corpus_file_id': self.id
        }
        return url_for('corpora.corpus_file', **route_args)

    @property
    def user_hashid(self):
        ''' Hashid of the user that owns the parent corpus. '''
        owner = self.corpus.user
        return owner.hashid

    @property
    def user_id(self):
        ''' Database id of the user that owns the parent corpus. '''
        parent_corpus = self.corpus
        return parent_corpus.user_id

    def delete(self):
        '''
        Remove the file from disk (a failure is only logged), mark the
        row for deletion and reset the corpus status to UNPREPARED.
        '''
        try:
            os.remove(self.path)
        except OSError:
            message = f'Removing {self.path} led to an OSError!'
            current_app.logger.error(message)
        db.session.delete(self)
        self.corpus.status = CorpusStatus.UNPREPARED

    def to_json(self, backrefs=False, relationships=False):
        '''
        Serialize the corpus file into a JSON compatible dict. If
        backrefs is truthy the serialized corpus is added as "corpus".
        '''
        metadata_fields = (
            'address',
            'author',
            'booktitle',
            'chapter',
            'editor',
            'institution',
            'journal',
            'pages',
            'publisher',
            'publishing_year',
            'school',
            'title'
        )
        payload = {'id': self.hashid, 'url': self.url}
        for field in metadata_fields:
            payload[field] = getattr(self, field)
        payload.update(
            self.file_mixin_to_json(
                backrefs=backrefs,
                relationships=relationships
            )
        )
        if backrefs:
            payload['corpus'] = self.corpus.to_json(backrefs=True)
        return payload
2021-11-30 16:22:16 +01:00
class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime())
    status = db.Column(
        IntEnumColumn(CorpusStatus),
        default=CorpusStatus.UNPREPARED
    )
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    is_public = db.Column(db.Boolean, default=False)
    # Backrefs: user: User
    # Relationships
    files = db.relationship(
        'CorpusFile',
        backref='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    # "static" attributes
    max_num_tokens = 2_147_483_647

    def __repr__(self):
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        ''' URL of the analysis page of this corpus. '''
        return url_for('corpora.analyse_corpus', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        ''' Owning user's JSON Patch path plus "/corpora/<hashid>". '''
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self):
        ''' Filesystem path of the corpus: "<user path>/corpora/<id>". '''
        return os.path.join(self.user.path, 'corpora', str(self.id))

    @property
    def url(self):
        ''' URL of the corpus page. '''
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        ''' Hashid of the user that owns this corpus. '''
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        '''
        Create a corpus, flush it to obtain its id and create its
        directory together with the "files", "cwb", "cwb/data" and
        "cwb/registry" subdirectories.

        :param kwargs: passed unchanged to the Corpus constructor
        :return: the new corpus (added to the session and flushed; this
            method does not commit)
        :raises OSError: if a directory can not be created; the error is
            logged and the session is rolled back before re-raising
        '''
        corpus = Corpus(**kwargs)
        db.session.add(corpus)
        db.session.flush(objects=[corpus])
        db.session.refresh(corpus)
        try:
            os.mkdir(corpus.path)
            os.mkdir(os.path.join(corpus.path, 'files'))
            os.mkdir(os.path.join(corpus.path, 'cwb'))
            os.mkdir(os.path.join(corpus.path, 'cwb', 'data'))
            os.mkdir(os.path.join(corpus.path, 'cwb', 'registry'))
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            # Bare raise keeps the original traceback untouched
            raise
        return corpus

    def build(self):
        '''
        Merge all corpus files into "<corpus path>/cwb/corpus.vrt".

        Every file is normalized with normalize_vrt_file(), parsed and
        its root element is annotated with the file's metadata before
        it is appended to a common <corpus> element. On success
        last_edited_date is updated and the status is set to SUBMITTED.
        If normalizing a file fails, the error is logged, the status is
        set to FAILED and the method returns without writing corpus.vrt.
        '''
        # Metadata that may be missing, written as 'NULL' when empty.
        # The order is kept because it determines the attribute order in
        # the written file.
        optional_attributes = (
            'address',
            'booktitle',
            'chapter',
            'editor',
            'institution',
            'journal',
            'pages',
            'publisher',
            'school'
        )
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            normalized_vrt_path = os.path.join(
                self.path,
                'cwb',
                f'{corpus_file.id}.norm.vrt'
            )
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except Exception as e:
                # Exception instead of a bare except, so SystemExit and
                # KeyboardInterrupt are not swallowed
                current_app.logger.error(e)
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            text_element.set('author', corpus_file.author)
            text_element.set('title', corpus_file.title)
            text_element.set(
                'publishing_year',
                f'{corpus_file.publishing_year}'
            )
            for attribute in optional_attributes:
                # Apply "or" to the raw value: formatting None first
                # would yield the truthy string 'None' and the 'NULL'
                # fallback would never be used
                text_element.set(
                    attribute,
                    getattr(corpus_file, attribute) or 'NULL'
                )
            text_element.tail = '\n'
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            os.path.join(self.path, 'cwb', 'corpus.vrt'),
            encoding='utf-8'
        )
        self.last_edited_date = datetime.utcnow()
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        '''
        Remove the corpus directory (errors are ignored) and mark the
        corpus for deletion in the session.
        '''
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def to_json(self, backrefs=False, relationships=False):
        '''
        Serialize the corpus into a JSON compatible dict.

        :param backrefs: if truthy, add the serialized user as "user"
        :param relationships: if truthy, add "files" as a dict mapping
            hashids to serialized corpus files
        :return: dict representation of the corpus
        '''
        _json = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'last_edited_date': (
                None if self.last_edited_date is None
                else f'{self.last_edited_date.isoformat()}Z'
            ),
            'title': self.title,
            'is_public': self.is_public
        }
        if backrefs:
            _json['user'] = self.user.to_json(backrefs=True)
        if relationships:
            _json['files'] = {
                x.hashid: x.to_json(relationships=True)
                for x in self.files
            }
        return _json
2022-06-28 12:30:02 +02:00
# endregion models
##############################################################################
# event_handlers #
##############################################################################
# region event_handlers
2022-09-02 13:07:30 +02:00
2022-06-28 12:30:02 +02:00
@db.event.listens_for(Corpus, 'after_delete')
@db.event.listens_for(CorpusFile, 'after_delete')
@db.event.listens_for(Job, 'after_delete')
@db.event.listens_for(JobInput, 'after_delete')
@db.event.listens_for(JobResult, 'after_delete')
def ressource_after_delete(mapper, connection, ressource):
    '''
    Emit a JSON Patch "remove" operation for a deleted ressource.

    The same patch is sent twice: as "users.patch" to the room
    "users.<hashid>" and as "PATCH" to the room "/users/<hashid>".
    '''
    removal = [{'op': 'remove', 'path': ressource.jsonpatch_path}]
    targets = (
        ('users.patch', 'users.{}'),
        ('PATCH', '/users/{}')
    )
    for event_name, room_template in targets:
        room = room_template.format(ressource.user_hashid)
        socketio.emit(event_name, removal, room=room)
2022-06-28 12:30:02 +02:00
@db.event.listens_for(Corpus, 'after_insert')
@db.event.listens_for(CorpusFile, 'after_insert')
@db.event.listens_for(Job, 'after_insert')
@db.event.listens_for(JobInput, 'after_insert')
@db.event.listens_for(JobResult, 'after_insert')
def ressource_after_insert_handler(mapper, connection, ressource):
    '''
    Emit a JSON Patch "add" operation for a newly inserted ressource.

    The value is the ressource's to_json() output, extended by an empty
    dict for every relationship of the mapper. It is sent as "PATCH" to
    the room "/users/<hashid>".
    '''
    serialized = ressource.to_json()
    serialized.update(
        {relationship.key: {} for relationship in mapper.relationships}
    )
    addition = {
        'op': 'add',
        'path': ressource.jsonpatch_path,
        'value': serialized
    }
    socketio.emit(
        'PATCH',
        [addition],
        room=f'/users/{ressource.user_hashid}'
    )
2022-06-28 12:30:02 +02:00
@db.event.listens_for(Corpus, 'after_update')
@db.event.listens_for(CorpusFile, 'after_update')
@db.event.listens_for(Job, 'after_update')
@db.event.listens_for(JobInput, 'after_update')
@db.event.listens_for(JobResult, 'after_update')
def ressource_after_update_handler(mapper, connection, ressource):
    '''
    Emit JSON Patch "replace" operations for all changed attributes.

    Relationship attributes and attributes without changes are skipped.
    Datetime values are sent as ISO strings with a trailing "Z", enum
    values by their name. Nothing is emitted if no operation was
    collected.
    '''
    operations = []
    for state in db.inspect(ressource).attrs:
        is_relationship = state.key in mapper.relationships
        if is_relationship or not state.load_history().has_changes():
            continue
        current = state.value
        if isinstance(current, datetime):
            current = f'{current.isoformat()}Z'
        elif isinstance(current, Enum):
            current = current.name
        operations.append(
            {
                'op': 'replace',
                'path': f'{ressource.jsonpatch_path}/{state.key}',
                'value': current
            }
        )
    if not operations:
        return
    socketio.emit(
        'PATCH',
        operations,
        room=f'/users/{ressource.user_hashid}'
    )
2022-06-28 12:30:02 +02:00
@db.event.listens_for(Job, 'after_update')
def job_after_update_handler(mapper, connection, job):
    '''
    Send a status notification mail after a job's status has changed.

    No mail is sent if the status is unchanged, if the user's
    notification level is NONE, or if the level is END and the new
    status is neither COMPLETED nor FAILED.
    '''
    status_state = next(
        (state for state in db.inspect(job).attrs if state.key == 'status'),
        None
    )
    if status_state is None:
        return
    if not status_state.load_history().has_changes():
        return
    level = job.user.setting_job_status_mail_notification_level
    if level == UserSettingJobStatusMailNotificationLevel.NONE:
        return
    job_ended = job.status in [JobStatus.COMPLETED, JobStatus.FAILED]
    if (
        level == UserSettingJobStatusMailNotificationLevel.END
        and not job_ended
    ):
        return
    notification = create_message(
        job.user.email,
        f'Status update for your Job "{job.title}"',
        'tasks/email/notification',
        job=job
    )
    mail.send(notification)
# endregion event_handlers
2019-11-04 15:06:54 +01:00
2019-08-22 09:35:23 +02:00
2022-06-28 12:30:02 +02:00
##############################################################################
# misc #
##############################################################################
# region misc
2021-09-15 12:31:53 +02:00
@login.user_loader
def load_user(user_id):
    '''
    Load the user that belongs to the id stored in the session.

    :param user_id: user id as handed over by Flask-Login
    :return: the matching User, or None if user_id is not a valid
        integer or no such user exists
    '''
    try:
        primary_key = int(user_id)
    except (TypeError, ValueError):
        # The loader has to return None for an invalid id instead of
        # raising, otherwise a malformed session value breaks the request
        return None
    return User.query.get(primary_key)
2022-06-28 12:30:02 +02:00
# endregion misc