"""Create, monitor and remove the Docker services that execute jobs."""

from datetime import datetime
import json
import logging
import os

from werkzeug.utils import secure_filename
import docker

from .. import db, mail
from ..email import create_message
from ..models import Job, JobResult


# Per-service settings, keyed by ``Job.service``:
#   'default_args' (optional): appended verbatim to the service command.
#   'ressources': resource reservations handed to the Docker service
#                 (see the docker.types.Resources documentation for units).
# NOTE: the spelling 'ressources' is kept on purpose; this dict is
# module-level and may be read from outside this file.
service_settings = {
    'file-setup': {
        'ressources': docker.types.Resources(
            mem_reservation=1024 * (10 ** 6),
            cpu_reservation=1 * (10 ** 9)
        )
    },
    'nlp': {
        'default_args': ' --n-cores 2 --mem-mb 2048',
        'ressources': docker.types.Resources(
            mem_reservation=2048 * (10 ** 6),
            cpu_reservation=2 * (10 ** 9)
        )
    },
    'ocr': {
        'default_args': ' --n-cores 4 --mem-mb 4096',
        'ressources': docker.types.Resources(
            mem_reservation=4096 * (10 ** 6),
            cpu_reservation=4 * (10 ** 9)
        )
    }
}


def _log_api_error(action, service_name, error):
    """Log a ``docker.errors.APIError`` raised by *action* on a service."""
    logging.error(
        '%s "%s" service raised "docker.errors.APIError" '
        'The server returned an error. Details: %s',
        action, service_name, error
    )


def _log_invalid_version(service_name):
    """Log a ``docker.errors.InvalidVersion`` raised while getting a service."""
    logging.error(
        'Get "%s" service raised "docker.errors.InvalidVersion" '
        'One of the arguments is not supported with the current API version.',
        service_name
    )


class CheckJobsMixin:
    """Mixin that keeps ``Job`` rows in sync with their Docker services.

    The class this is mixed into must provide ``self.docker`` (a Docker
    client; only ``self.docker.services`` is used here) and
    ``self.buffer_user_patch_operation(job, patch_operation)``. Neither is
    defined in this module.
    """

    def check_jobs(self):
        """Run one polling cycle over all jobs.

        Jobs are grouped by the status they have at the start of the cycle:
        'submitted' jobs get a service created, 'queued' and 'running' jobs
        are checked against their service, 'canceling' jobs have their
        service removed. A job whose status changes during the cycle is not
        processed a second time in the same cycle.
        """
        jobs_by_status = {}
        for job in Job.query.all():
            jobs_by_status.setdefault(job.status, []).append(job)
        for job in jobs_by_status.get('submitted', []):
            self.create_job_service(job)
        for job in (jobs_by_status.get('queued', [])
                    + jobs_by_status.get('running', [])):
            self.checkout_job_service(job)
        for job in jobs_by_status.get('canceling', []):
            self.remove_job_service(job)

    def create_job_service(self, job):
        """Create the Docker service for *job* and mark the job 'queued'.

        On a Docker API error the error is logged and the job is left
        untouched. A mail notification is sent only if the job's status
        actually changed.
        """
        settings = service_settings[job.service]
        safe_title = secure_filename(job.title)
        cmd = '{} -i /files -o /files/output'.format(job.service)
        cmd += settings.get('default_args', '')
        if job.service == 'file-setup':
            cmd += ' -f {}'.format(safe_title)
        cmd += ' --log-dir /files'
        cmd += ' --zip [{}]_{}'.format(job.service, safe_title)
        cmd += ' ' + ' '.join(json.loads(job.service_args))
        service_kwargs = {
            'command': cmd,
            'constraints': ['node.role==worker'],
            'labels': {'origin': 'nopaque',
                       'type': 'job',
                       'job_id': str(job.id)},
            'mounts': [job.path + ':/files:rw'],
            'name': 'job_{}'.format(job.id),
            'resources': settings['ressources'],
            'restart_policy': docker.types.RestartPolicy()
        }
        service_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/{}:latest'.format(job.service)  # noqa
        status_before = job.status
        try:
            self.docker.services.create(service_image, **service_kwargs)
        except docker.errors.APIError as e:
            _log_api_error('Create', service_kwargs['name'], e)
            return
        else:
            self._set_job_status(job, 'queued')
        finally:
            # Notify on real status transitions only; notifying
            # unconditionally would mail the user on every polling cycle.
            if job.status != status_before:
                self.send_job_notification(job)

    def checkout_job_service(self, job):
        """Update *job* from the state of its Docker service.

        Transitions handled: a missing service marks the job 'failed';
        'queued' becomes 'running' once the task is no longer 'pending';
        'running' becomes 'complete' or 'failed' once the task reached that
        state and the service was removed. A mail notification is sent only
        if the job's status actually changed.
        """
        status_before = job.status
        try:
            self._sync_job_with_service(job)
        finally:
            # See create_job_service: only real transitions are notified.
            if job.status != status_before:
                self.send_job_notification(job)

    def _sync_job_with_service(self, job):
        """Apply the status transitions described in checkout_job_service."""
        service_name = 'job_{}'.format(job.id)
        try:
            service = self.docker.services.get(service_name)
        except docker.errors.NotFound:
            # NotFound is listed before APIError so it gets its own handling.
            logging.error(
                'Get "%s" service raised "docker.errors.NotFound" '
                'The service does not exist. (job.status: %s -> failed)',
                service_name, job.status
            )
            self._set_job_status(job, 'failed')
            return
        except docker.errors.APIError as e:
            _log_api_error('Get', service_name, e)
            return
        except docker.errors.InvalidVersion:
            _log_invalid_version(service_name)
            return

        service_tasks = service.tasks()
        if not service_tasks:
            return
        task_state = service_tasks[0].get('Status').get('State')

        if job.status == 'queued' and task_state != 'pending':
            self._set_job_status(job, 'running')
            return
        if job.status != 'running' or task_state not in ('complete', 'failed'):
            return

        # The task is finished: remove the service before recording the
        # outcome, so a failed removal is retried on the next cycle.
        try:
            service.remove()
        except docker.errors.APIError as e:
            _log_api_error('Remove', service_name, e)
            return
        if task_state == 'complete':
            self._register_job_results(job)
        # NOTE(review): utcnow() is naive, and timestamp() on a naive
        # datetime assumes local time - confirm this matches how other
        # timestamps are serialized for the client.
        job.end_date = datetime.utcnow()
        patch_operation = {'op': 'replace',
                           'path': '/jobs/{}/end_date'.format(job.id),
                           'value': job.end_date.timestamp()}
        self.buffer_user_patch_operation(job, patch_operation)
        self._set_job_status(job, task_state)

    def _register_job_results(self, job):
        """Create a ``JobResult`` for every .zip file in the job's output."""
        results_dir = os.path.join(job.path, 'output')
        for result_file in os.listdir(results_dir):
            if not result_file.endswith('.zip'):
                continue
            job_result = JobResult(filename=result_file, job=job)
            db.session.add(job_result)
            # Flush and refresh so job_result.id is available for the path.
            db.session.flush()
            db.session.refresh(job_result)
            patch_operation = {
                'op': 'add',
                'path': '/jobs/{}/results/{}'.format(job.id, job_result.id),
                'value': job_result.to_dict()
            }
            self.buffer_user_patch_operation(job, patch_operation)

    def remove_job_service(self, job):
        """Remove the Docker service of a job that is being canceled.

        The job is marked 'canceled' only once its service can no longer be
        found, i.e. normally on a cycle after the removal itself. No mail
        notification is sent from here.
        """
        service_name = 'job_{}'.format(job.id)
        try:
            service = self.docker.services.get(service_name)
        except docker.errors.NotFound:
            self._set_job_status(job, 'canceled')
            return
        except docker.errors.APIError as e:
            _log_api_error('Get', service_name, e)
            return
        except docker.errors.InvalidVersion:
            _log_invalid_version(service_name)
            return

        # NOTE(review): presumably detaches the job directory before the
        # service is removed - confirm against the Docker SDK behavior.
        try:
            service.update(mounts=None)
        except docker.errors.APIError as e:
            _log_api_error('Update', service_name, e)
            return
        try:
            service.remove()
        except docker.errors.APIError as e:
            _log_api_error('Remove', service_name, e)

    def send_job_notification(self, job):
        """Mail the job's creator about the job's current status.

        Nothing is sent if the creator's notification setting is 'none', or
        if it is 'end' and the job is neither 'complete' nor 'failed'.
        """
        setting = job.creator.setting_job_status_mail_notifications
        if setting == 'none':
            return
        if setting == 'end' and job.status not in ('complete', 'failed'):
            return
        msg = create_message(
            job.creator.email,
            'Status update for your Job "{}"'.format(job.title),
            'tasks/email/notification',
            job=job
        )
        mail.send(msg)

    def _set_job_status(self, job, status):
        """Set ``job.status`` and buffer the matching 'replace' patch op."""
        job.status = status
        patch_operation = {'op': 'replace',
                           'path': '/jobs/{}/status'.format(job.id),
                           'value': job.status}
        self.buffer_user_patch_operation(job, patch_operation)