mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-12-25 19:04:18 +00:00
123 lines
4.1 KiB
Python
123 lines
4.1 KiB
Python
from flask import current_app
|
|
from flask_migrate import upgrade
|
|
from . import db
|
|
from .models import Corpus, Job, Role, User, TesseractOCRModel
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
|
|
def _make_default_dirs():
    ''' Make sure the default data directories exist.

    The directories ``tmp`` and ``users`` are expected directly below the
    path configured as ``NOPAQUE_DATA_DIR``. A missing directory is created;
    an existing directory is left untouched.

    Raises:
        NotADirectoryError: one of the paths exists but is not a directory.
    '''
    data_root = current_app.config['NOPAQUE_DATA_DIR']
    for subdir_name in ('tmp', 'users'):
        target = os.path.join(data_root, subdir_name)
        if not os.path.exists(target):
            # Only the leaf is created; the data root itself must exist.
            os.mkdir(target)
            continue
        if os.path.isdir(target):
            continue
        raise NotADirectoryError(f'{target} is not a directory')
|
|
|
|
|
|
def register(app):
    ''' Attach the command line interface to the given application.

    The commands are registered on ``app.cli``:

    * ``deploy``: create the default directories, migrate the database and
      insert/update the default database values.
    * ``daemon`` group, ``run``: reset the analysis session counters and run
      the daemon.
    * ``test`` group, ``run``: discover and run the unit tests in ``tests``.
    * ``convert`` group, ``nlp_jobs`` and ``ocr_jobs``: rewrite jobs of the
      legacy ``nlp``/``ocr`` services to the ``spacy-nlp``/``tesseract-ocr``
      services.

    Args:
        app: application object providing the ``cli`` command group
            (``app.cli.command()`` and ``app.cli.group()`` are used).
    '''

    @app.cli.command()
    def deploy():
        ''' Run deployment tasks. '''
        # Make default directories
        _make_default_dirs()

        # migrate database to latest revision
        upgrade()

        # Insert/Update default database values
        current_app.logger.info('Insert/Update default roles')
        Role.insert_defaults()
        current_app.logger.info('Insert/Update default users')
        User.insert_defaults()
        current_app.logger.info('Insert/Update default tesseract ocr models')
        TesseractOCRModel.insert_defaults()

    @app.cli.group()
    def daemon():
        ''' Daemon commands. '''
        pass

    @daemon.command('run')
    def run_daemon():
        ''' Run daemon '''
        # Reset the analysis session counter of every corpus that still
        # reports open sessions before the daemon is started.
        for corpus in Corpus.query.filter(Corpus.num_analysis_sessions > 0):
            corpus.num_analysis_sessions = 0
        db.session.commit()
        # Imported here rather than at module level
        # (presumably to avoid a circular import - TODO confirm).
        from app.daemon import Daemon
        # Not named ``daemon`` so the enclosing command group is not shadowed.
        daemon_instance: Daemon = Daemon()
        daemon_instance.run()

    @app.cli.group()
    def test():
        ''' Test commands. '''
        pass

    @test.command('run')
    def run_test():
        ''' Run unit tests. '''
        from unittest import TestLoader, TextTestRunner
        from unittest.suite import TestSuite
        tests: TestSuite = TestLoader().discover('tests')
        result = TextTestRunner(verbosity=2).run(tests)
        # Report failing tests through the exit status so that scripts and
        # CI pipelines invoking this command can detect them.
        if not result.wasSuccessful():
            raise SystemExit(1)

    @app.cli.group()
    def convert():
        ''' Database convert commands. '''

    @convert.command()
    def nlp_jobs():
        ''' Convert jobs of the legacy "nlp" service to "spacy-nlp". '''
        for job in Job.query.filter_by(service='nlp').all():
            job.service = 'spacy-nlp'
            service_args = json.loads(job.service_args)
            new_service_args = {}
            for service_arg in service_args:
                if service_arg == '--check-encoding':
                    new_service_args['encoding_detection'] = True
                    continue
                # '-l <code>' carries a two letter language code
                language_match = re.match(r'-l ([a-z]{2})', service_arg)
                if language_match:
                    new_service_args['language'] = language_match.group(1)
            job.service_args = json.dumps(new_service_args)
        # A single commit: an error above leaves the database unchanged
        db.session.commit()

    @convert.command()
    def ocr_jobs():
        ''' Convert jobs of the legacy "ocr" service to "tesseract-ocr". '''
        # Language code to TesseractOCRModel.title lookup
        language_code_lookup = {
            'ara': 'Arabic',
            'chi_tra': 'Chinese - Traditional',
            'dan': 'Danish',
            'eng': 'English',
            'enm': 'English, Middle (1100-1500)',
            'fra': 'French',
            'frm': 'French, Middle (ca. 1400-1600)',
            'deu': 'German',
            'frk': 'German Fraktur',
            'ell': 'Greek, Modern (1453-)',
            'ita': 'Italian',
            'por': 'Portuguese',
            'rus': 'Russian',
            'spa': 'Spanish; Castilian'
        }
        for job in Job.query.filter_by(service='ocr').all():
            job.service = 'tesseract-ocr'
            service_args = json.loads(job.service_args)
            new_service_args = {}
            for service_arg in service_args:
                if service_arg == '--binarize':
                    new_service_args['binarization'] = True
                    continue
                # Three letter code with an optional '_suffix'. The suffix is
                # required to recognise 'chi_tra'; matching only three
                # letters would yield 'chi', which is not in the lookup.
                language_match = re.match(
                    r'-l ([a-z]{3}(?:_[a-z]+)?)',
                    service_arg
                )
                if not language_match:
                    continue
                language_code = language_match.group(1)
                # Raises KeyError for a language code without a known title
                model_title = language_code_lookup[language_code]
                tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=model_title).first()  # noqa
                if tesseract_ocr_model is None:
                    raise LookupError(
                        f'No TesseractOCRModel with title "{model_title}" '
                        f'found (language code "{language_code}")'
                    )
                new_service_args['model'] = tesseract_ocr_model.id
            job.service_args = json.dumps(new_service_args)
        # A single commit: an error above leaves the database unchanged
        db.session.commit()
|