from flask import current_app
from flask_migrate import upgrade
from . import db
from .models import Corpus, Job, Role, User, TesseractOCRModel
import json
import os
import re


# Legacy NLP language argument, e.g. '-l en' (two-letter code).
_NLP_LANGUAGE_ARG = re.compile(r'-l ([a-z]{2})')

# Legacy OCR language argument, e.g. '-l eng' or '-l chi_tra'. The optional
# '_suffix' part is needed so that codes such as 'chi_tra' are captured
# completely instead of being truncated to their first three letters (which
# would not be found in the lookup table below).
_OCR_LANGUAGE_ARG = re.compile(r'-l ([a-z]{3}(?:_[a-z]+)?)')

# Language code to TesseractOCRModel.title lookup
_OCR_LANGUAGE_CODE_TO_MODEL_TITLE = {
    'ara': 'Arabic',
    'chi_tra': 'Chinese - Traditional',
    'dan': 'Danish',
    'eng': 'English',
    'enm': 'English, Middle (1100-1500)',
    'fra': 'French',
    'frm': 'French, Middle (ca. 1400-1600)',
    'deu': 'German',
    'frk': 'German Fraktur',
    'ell': 'Greek, Modern (1453-)',
    'ita': 'Italian',
    'por': 'Portuguese',
    'rus': 'Russian',
    'spa': 'Spanish; Castilian',
}


def _make_default_dirs():
    '''
    Create the 'tmp' and 'users' directories below NOPAQUE_DATA_DIR.

    Existing directories are left untouched. The parent directory itself is
    not created; os.mkdir raises if it is missing.

    Raises:
        NotADirectoryError: If one of the paths exists but is not a directory.
    '''
    base_dir = current_app.config['NOPAQUE_DATA_DIR']
    for name in ('tmp', 'users'):
        directory = os.path.join(base_dir, name)
        if not os.path.exists(directory):
            os.mkdir(directory)
        elif not os.path.isdir(directory):
            raise NotADirectoryError(f'{directory} is not a directory')


def _convert_nlp_service_args(service_args):
    '''
    Translate legacy 'nlp' service arguments into the 'spacy-nlp' format.

    Args:
        service_args: Iterable of legacy argument strings, e.g.
            ['--check-encoding', '-l en'].

    Returns:
        A dict that may contain the keys 'encoding_detection' and 'language'.
        Arguments that are not recognized are dropped.
    '''
    new_service_args = {}
    for service_arg in service_args:
        if service_arg == '--check-encoding':
            new_service_args['encoding_detection'] = True
            continue
        language_match = _NLP_LANGUAGE_ARG.match(service_arg)
        if language_match is not None:
            new_service_args['language'] = language_match.group(1)
    return new_service_args


def _parse_ocr_language_code(service_arg):
    '''
    Return the language code of a legacy OCR '-l <code>' argument.

    Returns None if the argument does not start with a language option.
    '''
    language_match = _OCR_LANGUAGE_ARG.match(service_arg)
    return None if language_match is None else language_match.group(1)


def _find_tesseract_ocr_model_id(language_code):
    '''
    Look up the id of the TesseractOCRModel belonging to a language code.

    Raises:
        KeyError: If the language code has no entry in the lookup table.
        LookupError: If no TesseractOCRModel with the mapped title exists.
    '''
    try:
        title = _OCR_LANGUAGE_CODE_TO_MODEL_TITLE[language_code]
    except KeyError:
        raise KeyError(
            f'No TesseractOCRModel title known for language code '
            f'"{language_code}"'
        ) from None
    tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=title).first()  # noqa
    if tesseract_ocr_model is None:
        # Without this check the failure would surface as an opaque
        # AttributeError on `None.id`.
        raise LookupError(
            f'No TesseractOCRModel with title "{title}" found'
        )
    return tesseract_ocr_model.id


def register(app):
    '''
    Register the deploy, daemon, test and convert CLI commands on the app.

    Args:
        app: The application object whose `cli` group receives the commands.
    '''
    @app.cli.command()
    def deploy():
        ''' Run deployment tasks. '''
        # Make default directories
        _make_default_dirs()

        # migrate database to latest revision
        upgrade()

        # Insert/Update default database values
        current_app.logger.info('Insert/Update default roles')
        Role.insert_defaults()
        current_app.logger.info('Insert/Update default users')
        User.insert_defaults()
        current_app.logger.info('Insert/Update default tesseract ocr models')
        TesseractOCRModel.insert_defaults()

    @app.cli.group()
    def daemon():
        ''' Daemon commands. '''
        pass

    @daemon.command('run')
    def run_daemon():
        ''' Run daemon '''
        # Reset the analysis session counter of every corpus that still
        # reports sessions before the daemon starts.
        corpus: Corpus
        for corpus in Corpus.query.filter(Corpus.num_analysis_sessions > 0):
            corpus.num_analysis_sessions = 0
        db.session.commit()
        # Imported here rather than at module level, as in the original code
        # (presumably to avoid a circular import - TODO confirm).
        from app.daemon import Daemon
        daemon: Daemon = Daemon()
        daemon.run()

    @app.cli.group()
    def test():
        ''' Test commands. '''
        pass

    @test.command('run')
    def run_test():
        ''' Run unit tests. '''
        from unittest import TestLoader, TextTestRunner
        from unittest.suite import TestSuite
        tests: TestSuite = TestLoader().discover('tests')
        TextTestRunner(verbosity=2).run(tests)

    @app.cli.group()
    def convert():
        ''' Database convert commands. '''

    @convert.command()
    def nlp_jobs():
        ''' Convert legacy 'nlp' jobs to 'spacy-nlp' jobs. '''
        for job in Job.query.filter_by(service='nlp').all():
            job.service = 'spacy-nlp'
            legacy_service_args = json.loads(job.service_args)
            job.service_args = json.dumps(
                _convert_nlp_service_args(legacy_service_args)
            )
        db.session.commit()

    @convert.command()
    def ocr_jobs():
        ''' Convert legacy 'ocr' jobs to 'tesseract-ocr' jobs. '''
        for job in Job.query.filter_by(service='ocr').all():
            job.service = 'tesseract-ocr'
            new_service_args = {}
            for service_arg in json.loads(job.service_args):
                if service_arg == '--binarize':
                    new_service_args['binarization'] = True
                    continue
                language_code = _parse_ocr_language_code(service_arg)
                if language_code is None:
                    # Not a recognized argument; dropped like before.
                    continue
                new_service_args['model'] = \
                    _find_tesseract_ocr_model_id(language_code)
            job.service_args = json.dumps(new_service_args)
        db.session.commit()