nopaque/app/daemon/corpus_utils.py

253 lines
10 KiB
Python

from app.models import Corpus, CorpusStatus
from flask import current_app
import docker
import os
import shutil
class CheckCorporaMixin:
def check_corpora(self):
corpora = Corpus.query.all()
for corpus in (x for x in corpora if x.status == CorpusStatus.SUBMITTED): # noqa
self.create_build_corpus_service(corpus)
for corpus in (x for x in corpora if x.status == CorpusStatus.QUEUED or x.status == CorpusStatus.BUILDING): # noqa
self.checkout_build_corpus_service(corpus)
for corpus in (x for x in corpora if x.status == CorpusStatus.BUILT and x.num_analysis_sessions > 0): # noqa
corpus.status = CorpusStatus.STARTING_ANALYSIS_SESSION
for corpus in (x for x in corpora if x.status == CorpusStatus.RUNNING_ANALYSIS_SESSION and x.num_analysis_sessions == 0): # noqa
corpus.status = CorpusStatus.CANCELING_ANALYSIS_SESSION
for corpus in (x for x in corpora if x.status == CorpusStatus.RUNNING_ANALYSIS_SESSION): # noqa
self.checkout_analysing_corpus_container(corpus)
for corpus in (x for x in corpora if x.status == CorpusStatus.STARTING_ANALYSIS_SESSION): # noqa
self.create_cqpserver_container(corpus)
for corpus in (x for x in corpora if x.status == CorpusStatus.CANCELING_ANALYSIS_SESSION): # noqa
self.remove_cqpserver_container(corpus)
def create_build_corpus_service(self, corpus):
''' # Docker service settings # '''
''' ## Command ## '''
command = ['bash', '-c']
command.append(
f'mkdir /corpora/data/nopaque_{corpus.id}'
' && '
'cwb-encode'
' -c utf8'
f' -d /corpora/data/nopaque_{corpus.id}'
' -f /root/files/corpus.vrt'
f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}'
' -P pos -P lemma -P simple_pos'
' -S ent:0+type -S s:0'
' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title' # noqa
' -xsB -9'
' && '
f'cwb-make -V NOPAQUE_{corpus.id}'
)
''' ## Constraints ## '''
constraints = ['node.role==worker']
''' ## Image ## '''
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
''' ## Labels ## '''
labels = {
'origin': current_app.config['SERVER_NAME'],
'type': 'corpus.build',
'corpus_id': str(corpus.id)
}
''' ## Mounts ## '''
mounts = []
''' ### Data mount ### '''
data_mount_source = os.path.join(corpus.path, 'cwb', 'data')
data_mount_target = '/corpora/data'
data_mount = f'{data_mount_source}:{data_mount_target}:rw'
# Make sure that their is no data in the data directory
shutil.rmtree(data_mount_source, ignore_errors=True)
os.makedirs(data_mount_source)
mounts.append(data_mount)
''' ### File mount ### '''
file_mount_source = os.path.join(corpus.path, 'cwb', 'corpus.vrt')
file_mount_target = '/root/files/corpus.vrt'
file_mount = f'{file_mount_source}:{file_mount_target}:ro'
mounts.append(file_mount)
''' ### Registry mount ### '''
registry_mount_source = os.path.join(corpus.path, 'cwb', 'registry')
registry_mount_target = '/usr/local/share/cwb/registry'
registry_mount = f'{registry_mount_source}:{registry_mount_target}:rw'
# Make sure that their is no data in the registry directory
shutil.rmtree(registry_mount_source, ignore_errors=True)
os.makedirs(registry_mount_source)
mounts.append(registry_mount)
''' ## Name ## '''
name = f'build-corpus_{corpus.id}'
''' ## Restart policy ## '''
restart_policy = docker.types.RestartPolicy()
try:
self.docker.services.create(
image,
command=command,
constraints=constraints,
labels=labels,
mounts=mounts,
name=name,
restart_policy=restart_policy
)
except docker.errors.APIError as e:
current_app.logger.error(
f'Create service "{name}" failed '
f'due to "docker.errors.APIError": {e}'
)
return
corpus.status = CorpusStatus.QUEUED
def checkout_build_corpus_service(self, corpus):
service_name = f'build-corpus_{corpus.id}'
try:
service = self.docker.services.get(service_name)
except docker.errors.NotFound as e:
current_app.logger.error(
f'Get service "{service_name}" failed '
f'due to "docker.errors.NotFound": {e}'
)
corpus.status = CorpusStatus.FAILED
return
except docker.errors.APIError as e:
current_app.logger.error(
f'Get service "{service_name}" failed '
f'due to "docker.errors.APIError": {e}'
)
service_tasks = service.tasks()
if not service_tasks:
return
task_state = service_tasks[0].get('Status').get('State')
if corpus.status == CorpusStatus.QUEUED and task_state != 'pending': # noqa
corpus.status = CorpusStatus.BUILDING
return
elif corpus.status == CorpusStatus.BUILDING and task_state == 'complete': # noqa
corpus.status = CorpusStatus.BUILT
elif corpus.status == CorpusStatus.BUILDING and task_state == 'failed': # noqa
corpus.status = CorpusStatus.FAILED
else:
return
try:
service.remove()
except docker.errors.APIError as e:
current_app.logger.error(
f'Remove service "{service_name}" failed '
f'due to "docker.errors.APIError": {e}'
)
def create_cqpserver_container(self, corpus):
''' # Docker container settings # '''
''' ## Command ## '''
command = []
command.append(
'echo "host *;" > cqpserver.init'
' && '
'echo "user anonymous \\"\\";" >> cqpserver.init'
' && '
'cqpserver -I cqpserver.init'
)
''' ## Detach ## '''
detach = True
''' ## Entrypoint ## '''
entrypoint = ['bash', '-c']
''' ## Image ## '''
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
''' ## Name ## '''
name = f'cqpserver_{corpus.id}'
''' ## Network ## '''
network = 'nopaque_default'
''' ## Volumes ## '''
volumes = []
''' ### Corpus data volume ### '''
data_volume_source = os.path.join(corpus.path, 'cwb', 'data')
data_volume_target = '/corpora/data'
data_volume = f'{data_volume_source}:{data_volume_target}:rw'
volumes.append(data_volume)
''' ### Corpus registry volume ### '''
registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry')
registry_volume_target = '/usr/local/share/cwb/registry'
registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw' # noqa
volumes.append(registry_volume)
# Check if a cqpserver container already exists. If this is the case,
# remove it and create a new one
try:
container = self.docker.containers.get(name)
except docker.errors.NotFound:
pass
except docker.errors.APIError as e:
current_app.logger.error(
f'Get container "{name}" failed '
f'due to "docker.errors.APIError": {e}'
)
return
else:
try:
container.remove(force=True)
except docker.errors.APIError as e:
current_app.logger.error(
f'Remove container "{name}" failed '
f'due to "docker.errors.APIError": {e}'
)
return
try:
self.docker.containers.run(
image,
command=command,
detach=detach,
entrypoint=entrypoint,
volumes=volumes,
name=name,
network=network
)
except docker.errors.ImageNotFound as e:
current_app.logger.error(
f'Run container "{name}" failed '
f'due to "docker.errors.ImageNotFound" error: {e}'
)
corpus.status = CorpusStatus.FAILED
return
except docker.errors.APIError as e:
current_app.logger.error(
f'Run container "{name}" failed '
f'due to "docker.errors.APIError" error: {e}'
)
return
corpus.status = CorpusStatus.RUNNING_ANALYSIS_SESSION
def checkout_analysing_corpus_container(self, corpus):
container_name = f'cqpserver_{corpus.id}'
try:
self.docker.containers.get(container_name)
except docker.errors.NotFound as e:
current_app.logger.error(
f'Get container "{container_name}" failed '
f'due to "docker.errors.NotFound": {e}'
)
corpus.num_analysis_sessions = 0
corpus.status = CorpusStatus.BUILT
except docker.errors.APIError as e:
current_app.logger.error(
f'Get container "{container_name}" failed '
f'due to "docker.errors.APIError": {e}'
)
def remove_cqpserver_container(self, corpus):
container_name = f'cqpserver_{corpus.id}'
try:
container = self.docker.containers.get(container_name)
except docker.errors.NotFound:
corpus.status = CorpusStatus.BUILT
return
except docker.errors.APIError as e:
current_app.logger.error(
f'Get container "{container_name}" failed '
f'due to "docker.errors.APIError": {e}'
)
return
try:
container.remove(force=True)
except docker.errors.APIError as e:
current_app.logger.error(
f'Remove container "{container_name}" failed '
f'due to "docker.errors.APIError": {e}'
)