nopaque/app/swarm.py

122 lines
4.6 KiB
Python
Raw Normal View History

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
2019-07-17 13:34:20 +02:00
import docker
import time
import json
2019-08-06 14:54:00 +02:00
import os
2019-07-17 13:34:20 +02:00
class Swarm:
    """Run jobs as one-shot services on a Docker Swarm.

    Swarm mode is intended to run containers which serve a non-terminating
    service like a web server. For processing an occurring job it is
    necessary to use a one-shot container, which stops after the wrapped job
    process has completely executed. In order to run these one-shot
    containers in Swarm mode, the ``run`` method is implemented analogously
    to the implementation presented in Alex Ellis' blog post "One-shot
    containers on Docker Swarm" [1].

    [1] https://blog.alexellis.io/containers-on-swarm/
    """

    # Task states after which the Docker engine does not advance a task any
    # further. Polling must stop on every one of them, not only on
    # 'complete' and 'failed', otherwise e.g. a rejected task would be
    # polled forever.
    _TERMINAL_TASK_STATES = frozenset(
        ('complete', 'failed', 'shutdown', 'rejected', 'orphaned', 'remove')
    )

    def __init__(self, app=None):
        """Create the Docker client and, if given, bind to ``app``.

        Args:
            app: Optional application object exposing a ``config`` mapping.
                When omitted, ``init_app`` has to be called before ``run``.
        """
        self.app = app
        if app is not None:
            self.init_app(app)
        self.docker = docker.from_env()

    def init_app(self, app):
        """Set up the database session factory from the app configuration.

        Args:
            app: Application object whose ``config`` contains the key
                ``'SQLALCHEMY_DATABASE_URI'``.
        """
        engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI'])
        self.Session = sessionmaker(bind=engine)

    @staticmethod
    def _build_command(service, service_args):
        """Return the command line executed inside the job container."""
        # NOTE(review): the extra arguments are joined in unescaped; this
        # presumably relies on service_args being validated by the caller.
        return (service
                + ' -i /files'
                + ' -l {}'.format(service_args['lang'])
                + ' -o /files/output'
                + ' ' + ' '.join(service_args['args']))

    @classmethod
    def _final_status(cls, task_state):
        """Map a Docker task state to the job status to store.

        Returns:
            ``None`` while the task is still in progress, ``'complete'`` for
            a completed task and ``'failed'`` for every other terminal
            state.
        """
        if task_state not in cls._TERMINAL_TASK_STATES:
            return None
        return 'complete' if task_state == 'complete' else 'failed'

    def run(self, job):
        """Execute ``job`` as a one-shot Swarm service and track its status.

        The job's status is set to ``'running'`` once the service has a
        task, and to ``'complete'`` or ``'failed'`` once that task reached a
        terminal state. Both changes are committed to the database. The
        service is removed from the swarm afterwards, also when an error
        occurs while waiting for it.

        Args:
            job: Job object providing ``id``, ``user_id``, ``service``,
                ``status`` and the JSON encoded strings ``service_args``
                (keys ``lang``, ``args``, ``version``) and ``ressources``
                (keys ``n_cores``, ``mem_mb``).
        """
        # Prepare argument values needed for the service creation.
        service_args = json.loads(job.service_args)
        requested = json.loads(job.ressources)
        _command = self._build_command(job.service, service_args)
        _constraints = ['node.role==worker']
        _image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/{}:{}'.format(
            job.service,
            service_args['version']
        )
        _labels = {'service': job.service}
        _mounts = [os.path.join('/home/compute/mnt/opaque',
                                str(job.user_id),
                                'jobs',
                                str(job.id))
                   + ':/files:rw']
        _name = str(job.user_id) + '_' + str(job.id)
        # The Docker SDK for Python expects the cpu_reservation value to be
        # scaled to nanos (10^9). Because the job object contains unscaled
        # (10^0) values, it must be converted.
        #
        # While the cpu_reservation value has to be in nanos, the
        # mem_reservation value must be given unscaled, i.e. in bytes.
        # Because the job object provides the memory value in megabytes, it
        # is also necessary to convert that value.
        _resources = docker.types.Resources(
            cpu_reservation=requested['n_cores'] * (10 ** 9),
            mem_reservation=requested['mem_mb'] * (10 ** 6)
        )
        # A one-shot container must not be restarted after it has exited.
        _restart_policy = docker.types.RestartPolicy(condition='none')
        # Create the service with the prepared values.
        #
        # Note: A service reserves hardware resources. In case no worker
        #       node has the required resources available (not reserved),
        #       the service gets queued by the Docker engine until a node is
        #       able to meet the requirements.
        service = self.docker.services.create(
            _image,
            command=_command,
            constraints=_constraints,
            labels=_labels,
            mounts=_mounts,
            name=_name,
            resources=_resources,
            restart_policy=_restart_policy
        )
        try:
            # Because it takes some time until all data in the service
            # object is initialized (especially the task list returned by
            # the service.tasks method), poll until the list is not empty.
            while not service.tasks():
                time.sleep(1)
                service.reload()
            session = self.Session()
            try:
                session.add(job)
                job.status = 'running'
                session.commit()
                # Poll the service until the job is completely executed.
                while True:
                    task_state = (service.tasks()[0]
                                  .get('Status')
                                  .get('State'))
                    final_status = self._final_status(task_state)
                    if final_status is not None:
                        break
                    time.sleep(1)
                    service.reload()
                job.status = final_status
                session.commit()
            finally:
                # Release the database connection on every code path.
                session.close()
        finally:
            # Remove the service from the swarm, so that a failure above
            # does not leave a service (and its reserved name) behind.
            service.remove()
        return