nopaque/app/cli.py

123 lines
4.1 KiB
Python

from flask import current_app
from flask_migrate import upgrade
from . import db
from .models import Corpus, Job, Role, User, TesseractOCRModel
import json
import os
import re
def _make_default_dirs():
    '''
    Ensure the default sub-directories of the data directory exist.

    The data directory is read from the ``NOPAQUE_DATA_DIR`` entry of the
    current application's config. Its ``tmp`` and ``users`` sub-directories
    are created when they are missing.

    :raises NotADirectoryError: if one of the paths already exists but is
        not a directory
    '''
    data_root = current_app.config['NOPAQUE_DATA_DIR']
    for sub_directory in ('tmp', 'users'):
        directory = os.path.join(data_root, sub_directory)
        if not os.path.exists(directory):
            os.mkdir(directory)
            continue
        # The path is already taken; it is only acceptable as a directory.
        if not os.path.isdir(directory):
            raise NotADirectoryError(f'{directory} is not a directory')
def register(app):
    '''
    Register the custom command line interface commands on ``app``.

    Adds the ``deploy`` command and the ``daemon``, ``test`` and ``convert``
    command groups (with their sub-commands) to ``app.cli``.

    :param app: application object whose ``cli`` attribute provides the
        ``command()`` and ``group()`` decorators used below
    '''

    @app.cli.command()
    def deploy():
        ''' Run deployment tasks. '''
        # Make default directories
        _make_default_dirs()

        # migrate database to latest revision
        upgrade()

        # Insert/Update default database values
        current_app.logger.info('Insert/Update default roles')
        Role.insert_defaults()
        current_app.logger.info('Insert/Update default users')
        User.insert_defaults()
        current_app.logger.info('Insert/Update default tesseract ocr models')
        TesseractOCRModel.insert_defaults()

    @app.cli.group()
    def daemon():
        ''' Daemon commands. '''
        pass

    @daemon.command('run')
    def run_daemon():
        ''' Run daemon '''
        # Reset the analysis session counter of every corpus that still has
        # a non-zero value before the daemon starts.
        corpus: Corpus
        for corpus in Corpus.query.filter(Corpus.num_analysis_sessions > 0):
            corpus.num_analysis_sessions = 0
        db.session.commit()
        from app.daemon import Daemon
        # NOTE: this local name shadows the ``daemon`` group only inside this
        # function; the decorator above was already evaluated.
        daemon: Daemon = Daemon()
        daemon.run()

    @app.cli.group()
    def test():
        ''' Test commands. '''
        pass

    @test.command('run')
    def run_test():
        ''' Run unit tests. '''
        import sys
        from unittest import TestLoader, TextTestRunner
        from unittest.suite import TestSuite
        tests: TestSuite = TestLoader().discover('tests')
        result = TextTestRunner(verbosity=2).run(tests)
        # Report failures through the exit status so that shells and CI
        # pipelines can detect a failing test run.
        if not result.wasSuccessful():
            sys.exit(1)

    @app.cli.group()
    def convert():
        ''' Database convert commands. '''

    @convert.command()
    def nlp_jobs():
        ''' Convert "nlp" jobs into "spacy-nlp" jobs. '''
        language_arg_pattern = re.compile(r'-l ([a-z]{2})')
        for job in Job.query.filter_by(service='nlp').all():
            job.service = 'spacy-nlp'
            new_service_args = {}
            # The stored arguments are a JSON list of command line style
            # strings; they are rewritten into a JSON object. Arguments that
            # are not recognised are dropped.
            for service_arg in json.loads(job.service_args):
                if service_arg == '--check-encoding':
                    new_service_args['encoding_detection'] = True
                    continue
                language_match = language_arg_pattern.match(service_arg)
                if language_match:
                    new_service_args['language'] = language_match.group(1)
            job.service_args = json.dumps(new_service_args)
        db.session.commit()

    @convert.command()
    def ocr_jobs():
        ''' Convert "ocr" jobs into "tesseract-ocr" jobs. '''
        # Language code to TesseractOCRModel.title lookup
        language_code_lookup = {
            'ara': 'Arabic',
            'chi_tra': 'Chinese - Traditional',
            'dan': 'Danish',
            'eng': 'English',
            'enm': 'English, Middle (1100-1500)',
            'fra': 'French',
            'frm': 'French, Middle (ca. 1400-1600)',
            'deu': 'German',
            'frk': 'German Fraktur',
            'ell': 'Greek, Modern (1453-)',
            'ita': 'Italian',
            'por': 'Portuguese',
            'rus': 'Russian',
            'spa': 'Spanish; Castilian'
        }
        # Three letters plus an optional "_suffix": capturing only three
        # letters would reduce "chi_tra" to "chi", which is not a key of the
        # lookup table above.
        language_arg_pattern = re.compile(r'-l ([a-z]{3}(?:_[a-z]+)?)')
        for job in Job.query.filter_by(service='ocr').all():
            job.service = 'tesseract-ocr'
            new_service_args = {}
            # The stored arguments are a JSON list of command line style
            # strings; they are rewritten into a JSON object. Arguments that
            # are not recognised are dropped.
            for service_arg in json.loads(job.service_args):
                if service_arg == '--binarize':
                    new_service_args['binarization'] = True
                    continue
                language_match = language_arg_pattern.match(service_arg)
                if not language_match:
                    continue
                language_code = language_match.group(1)
                if language_code not in language_code_lookup:
                    # Fall back to the bare three letter code so that
                    # suffixed codes without their own entry still resolve.
                    language_code = language_code[:3]
                # Unknown codes raise KeyError.
                model_title = language_code_lookup[language_code]
                tesseract_ocr_model = TesseractOCRModel.query.filter_by(
                    title=model_title
                ).first()
                if tesseract_ocr_model is None:
                    raise LookupError(
                        f'No TesseractOCRModel with title "{model_title}" '
                        f'found (language code "{language_code}")'
                    )
                new_service_args['model'] = tesseract_ocr_model.id
            job.service_args = json.dumps(new_service_args)
        db.session.commit()