mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-11-16 01:35:41 +00:00
133 lines
5.1 KiB
Python
133 lines
5.1 KiB
Python
from flask import current_app, url_for
|
|
from flask_hashids import HashidMixin
|
|
from tqdm import tqdm
|
|
from pathlib import Path
|
|
import requests
|
|
import yaml
|
|
from app import db
|
|
from app.extensions.sqlalchemy import ContainerColumn
|
|
from .file_mixin import FileMixin
|
|
from .user import User
|
|
|
|
|
|
class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
    """Database record for a Tesseract OCR pipeline model file.

    Each record belongs to a user (``user_id`` / ``user``) and carries
    descriptive metadata about the model. The associated file is located
    at :attr:`path`, below the owning user's directory.
    """

    __tablename__ = 'tesseract_ocr_pipeline_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    title = db.Column(db.String(64))
    description = db.Column(db.String(255))
    version = db.Column(db.String(16))
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    user = db.relationship(
        'User',
        back_populates='tesseract_ocr_pipeline_models'
    )

    @property
    def path(self) -> Path:
        """Return the location of the model file inside the user's directory."""
        return self.user.path / 'tesseract_ocr_pipeline_models' / f'{self.id}'

    @property
    def jsonpatch_path(self):
        """Return the JSON Patch path of this record below its owning user."""
        return (
            f'{self.user.jsonpatch_path}'
            f'/tesseract_ocr_pipeline_models/{self.hashid}'
        )

    @property
    def url(self):
        """Return the URL of the contributions view for this model."""
        return url_for(
            'contributions.tesseract_ocr_pipeline_model',
            tesseract_ocr_pipeline_model_id=self.id
        )

    @property
    def user_hashid(self):
        """Return the hashid of the owning user."""
        return self.user.hashid

    @staticmethod
    def _download_file(url, destination, description, timeout=30):
        """Stream ``url`` to ``destination`` while showing a progress bar.

        The payload is written to a sibling ``<name>.part`` file first and
        only renamed to ``destination`` after the transfer finished, so an
        aborted download never leaves a truncated file at ``destination``.

        :param url: Address to download from.
        :param destination: ``Path`` the finished file is moved to.
        :param description: Label shown in front of the progress bar.
        :param timeout: Timeout in seconds passed on to ``requests.get``.
        :raises requests.RequestException: On connection problems, timeouts
            or a non-success HTTP status.
        :raises OSError: If the file cannot be written or moved.
        """
        partial = destination.with_name(f'{destination.name}.part')
        try:
            with requests.get(url, stream=True, timeout=timeout) as response:
                # Fail loudly instead of storing an HTTP error page as model.
                response.raise_for_status()
                # The header is absent for e.g. chunked transfers; tqdm
                # copes with an unknown total when given None.
                content_length = response.headers.get('Content-Length')
                if content_length is not None and content_length.isdigit():
                    total = int(content_length)
                else:
                    total = None
                with tqdm(
                    desc=description,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    total=total
                ) as pbar:
                    pbar.clear()
                    with partial.open('wb') as f:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                pbar.update(len(chunk))
                                f.write(chunk)
            partial.replace(destination)
        finally:
            # After a successful replace() this is a no-op; after a failure
            # it removes the incomplete download.
            partial.unlink(missing_ok=True)

    @staticmethod
    def insert_defaults(force_download=False):
        """Create or update the default models and fetch their files.

        The records are read from ``default_records/
        tesseract_ocr_pipeline_model.yml`` next to this module. A record is
        matched against existing rows by ``title`` and ``version``; matching
        rows are updated, others are created for the user named
        ``nopaque``. All default models are marked public. The session is
        committed once after all records were processed.

        :param force_download: Download the model file even if a file
            already exists at the model's path.
        """
        metadata_keys = (
            'compatible_service_versions',
            'description',
            'publisher',
            'publisher_url',
            'publishing_url',
            'publishing_year',
            'title',
            'version'
        )
        nopaque_user = User.query.filter_by(username='nopaque').first()
        default_records_file = (
            Path(__file__).parent
            / 'default_records'
            / 'tesseract_ocr_pipeline_model.yml'
        )
        with default_records_file.open('r') as f:
            # safe_load() yields None for an empty document.
            default_records = yaml.safe_load(f) or []
        for m in default_records:
            model = TesseractOCRPipelineModel.query.filter_by(
                title=m['title'],
                version=m['version']
            ).first()
            if model is not None:
                for key in metadata_keys:
                    setattr(model, key, m[key])
                model.is_public = True
            else:
                model = TesseractOCRPipelineModel(
                    is_public=True,
                    user=nopaque_user,
                    **{key: m[key] for key in metadata_keys}
                )
                db.session.add(model)
                # Flush and refresh so the generated primary key is
                # available for the filename and the path below.
                db.session.flush(objects=[model])
                db.session.refresh(model)
            model.filename = f'{model.id}.traineddata'
            if not model.path.exists() or force_download:
                TesseractOCRPipelineModel._download_file(
                    m['url'],
                    model.path,
                    f'{model.title} ({model.filename})'
                )
        db.session.commit()

    def delete(self):
        """Remove the model file and mark this record for deletion.

        A missing file is not an error. Any other ``OSError`` is logged and
        re-raised, in which case the record is left untouched. The session
        is not committed here.
        """
        try:
            self.path.unlink(missing_ok=True)
        except OSError as e:
            current_app.logger.error(e)
            raise
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        """Return a JSON serializable dict describing this record.

        :param backrefs: Also include the owning user under ``'user'``.
        :param relationships: Accepted for a uniform signature; this model
            has nothing to add for it.
        """
        json_serializeable = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'is_public': self.is_public,
            'title': self.title,
            'version': self.version,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable
|