from pathlib import Path

from flask import current_app, url_for
from flask_hashids import HashidMixin
import requests
from tqdm import tqdm
import yaml

from app import db
from app.extensions.sqlalchemy_extras import ContainerColumn
from .file_mixin import FileMixin
from .user import User


class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
    """Database record describing a Tesseract OCR pipeline model file.

    Each record belongs to a user and points at a file stored below that
    user's directory (see ``path``).
    """

    __tablename__ = 'tesseract_ocr_pipeline_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    title = db.Column(db.String(64))
    description = db.Column(db.String(255))
    version = db.Column(db.String(16))
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    user = db.relationship(
        'User',
        back_populates='tesseract_ocr_pipeline_models'
    )

    @property
    def path(self) -> Path:
        """Location of the model file below the owning user's directory."""
        return self.user.path / 'tesseract_ocr_pipeline_models' / f'{self.id}'

    @property
    def jsonpatch_path(self) -> str:
        """JSON Patch path of this record, nested below the owning user."""
        return (
            f'{self.user.jsonpatch_path}'
            f'/tesseract_ocr_pipeline_models/{self.hashid}'
        )

    @property
    def url(self) -> str:
        """URL of the ``contributions.tesseract_ocr_pipeline_model`` view."""
        return url_for(
            'contributions.tesseract_ocr_pipeline_model',
            tesseract_ocr_pipeline_model_id=self.id
        )

    @property
    def user_hashid(self):
        """Hashid of the owning user."""
        return self.user.hashid

    @staticmethod
    def _download_file(url, destination: Path, description: str) -> None:
        """Stream ``url`` into ``destination`` while showing a progress bar.

        The payload is written to a sibling ``<name>.part`` file first and
        only moved to ``destination`` once the download finished, so an
        interrupted download never leaves a truncated file behind that a
        later ``destination.exists()`` check would mistake for a complete
        one.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            OSError: If the file cannot be written or moved.
        """
        partial_path = destination.with_name(f'{destination.name}.part')
        try:
            with requests.get(url, stream=True) as response:
                # Without this check an error page would be stored as model.
                response.raise_for_status()
                # Content-Length is optional (e.g. chunked transfer); tqdm
                # accepts total=None and then shows no percentage.
                content_length = response.headers.get('Content-Length')
                total = None if content_length is None else int(content_length)
                with partial_path.open('wb') as model_file, tqdm(
                    desc=description,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    total=total
                ) as pbar:
                    pbar.clear()
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            pbar.update(len(chunk))
                            model_file.write(chunk)
            partial_path.replace(destination)
        finally:
            # No-op after a successful replace; removes leftovers on failure.
            partial_path.unlink(missing_ok=True)

    @staticmethod
    def insert_defaults(force_download=False):
        """Create or update the default models and fetch their files.

        The defaults are read from
        ``default_records/tesseract_ocr_pipeline_model.yml`` next to this
        module. Records are matched by title and version; new records are
        assigned to the user named ``nopaque``. All changes are committed
        once every record has been processed.

        Args:
            force_download: Download the model file even if it already
                exists on disk.
        """
        nopaque_user = User.query.filter_by(username='nopaque').first()
        default_records_file = (
            Path(__file__).parent
            / 'default_records'
            / 'tesseract_ocr_pipeline_model.yml'
        )
        with default_records_file.open('r') as f:
            default_records = yaml.safe_load(f)
        for m in default_records:
            model = TesseractOCRPipelineModel.query.filter_by(
                title=m['title'],
                version=m['version']
            ).first()
            if model is not None:
                model.compatible_service_versions = \
                    m['compatible_service_versions']
                model.description = m['description']
                model.filename = f'{model.id}.traineddata'
                model.publisher = m['publisher']
                model.publisher_url = m['publisher_url']
                model.publishing_url = m['publishing_url']
                model.publishing_year = m['publishing_year']
                model.is_public = True
                model.title = m['title']
                model.version = m['version']
            else:
                model = TesseractOCRPipelineModel(
                    compatible_service_versions=m['compatible_service_versions'],
                    description=m['description'],
                    publisher=m['publisher'],
                    publisher_url=m['publisher_url'],
                    publishing_url=m['publishing_url'],
                    publishing_year=m['publishing_year'],
                    is_public=True,
                    title=m['title'],
                    user=nopaque_user,
                    version=m['version']
                )
                db.session.add(model)
                # Flush and refresh so that model.id is available for the
                # filename below.
                db.session.flush(objects=[model])
                db.session.refresh(model)
                model.filename = f'{model.id}.traineddata'
            if not model.path.exists() or force_download:
                TesseractOCRPipelineModel._download_file(
                    m['url'],
                    model.path,
                    f'{model.title} ({model.filename})'
                )
        db.session.commit()

    def delete(self):
        """Remove the model file from disk and mark the record for deletion.

        The session is not committed here.

        Raises:
            OSError: If the file cannot be removed; the error is logged and
                re-raised, and the record is left untouched.
        """
        try:
            self.path.unlink(missing_ok=True)
        except OSError as e:
            current_app.logger.error(e)
            raise
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        """Return a JSON serializable dict representation of this record.

        Args:
            backrefs: If true, include the owning user under ``'user'``.
            relationships: Accepted for a uniform signature; it currently
                adds nothing to the result.
        """
        json_serializeable = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'is_public': self.is_public,
            'title': self.title,
            'version': self.version,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable