Use pyFlow for file setup service

This commit is contained in:
Patrick Jentsch 2021-04-12 14:55:14 +02:00
parent 0c8b3421c6
commit 3c9a800886
5 changed files with 254 additions and 85 deletions

View File

@ -1,8 +1,5 @@
image: docker:19.03.13 image: docker:19.03.13
variables:
DOCKER_TLS_CERTDIR: "/certs"
services: services:
- docker:19.03.13-dind - docker:19.03.13-dind
@ -10,6 +7,10 @@ stages:
- build - build
- push - push
variables:
DOCKER_TLS_CERTDIR: "/certs"
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
.reg_setup: .reg_setup:
before_script: before_script:
- apk add --no-cache curl - apk add --no-cache curl
@ -28,8 +29,6 @@ build_image:
stage: build stage: build
tags: tags:
- docker - docker
variables:
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
push_master: push_master:
extends: extends:
@ -47,7 +46,6 @@ push_master:
- docker - docker
variables: variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
push_other: push_other:
extends: extends:
@ -68,4 +66,3 @@ push_other:
- docker - docker
variables: variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

View File

@ -1,7 +1,7 @@
FROM debian:buster-slim FROM debian:buster-slim
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>" LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
ENV LANG=C.UTF-8 ENV LANG=C.UTF-8
@ -9,17 +9,38 @@ ENV LANG=C.UTF-8
RUN apt-get update \ RUN apt-get update \
&& apt-get install --no-install-recommends --yes \ && apt-get install --no-install-recommends --yes \
wget
# Install the NLP pipeline and it's dependencies #
## Install pyFlow ##
ENV PYFLOW_VERSION=1.1.20
RUN wget --no-check-certificate --quiet \
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
&& python2.7 setup.py build install \
&& cd .. \
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
## Further dependencies ##
RUN apt-get install --no-install-recommends --yes \
imagemagick \ imagemagick \
procps \
python3.7 \ python3.7 \
zip \ zip \
&& rm -r /var/lib/apt/lists/* && mv /etc/ImageMagick-6/policy.xml /etc/ImageMagick-6/policy.xml.bak
RUN mv /etc/ImageMagick-6/policy.xml /etc/ImageMagick-6/policy.xml.bak
## Install Pipeline ##
COPY file-setup /usr/local/bin COPY file-setup /usr/local/bin
RUN rm -r /var/lib/apt/lists/*
ENTRYPOINT ["file-setup"] ENTRYPOINT ["file-setup"]
CMD ["--help"] CMD ["--help"]

View File

@ -0,0 +1,37 @@
# File setup
This software implements a parallelized pipeline to set up image files. It is used for nopaque's File setup service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided.
## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
## Use this image
1. Create input and output directories for the pipeline.
``` bash
mkdir -p /<my_data_location>/input /<my_data_location>/output
```
2. Place your image files inside a directory in `/<my_data_location>/input`.
3. Start the pipeline process. Check the pipeline help (`file-setup --help`) for more details.
```
# Option one: Use the wrapper script
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/raw/1.0.0/wrapper/file-setup, make it executable, and add it to your ${PATH}
cd /<my_data_location>
file-setup -i input -o output
# Option two: Classic Docker style
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v /<my_data_location>/input:/input \
-v /<my_data_location>/output:/output \
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:1.0.0 \
-i /input \
-o /output
```
4. Check your results in the `/<my_data_location>/output` directory.

View File

@ -1,79 +1,188 @@
#!/usr/bin/env python3.7 #!/usr/bin/env python2.7
# coding=utf-8 # coding=utf-8
"""A file setup pipeline for image file merging."""
""" __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
file-setup 'Stephan Porada <porada@posteo.de>'
__version__ = '1.0.0'
Usage: For usage instructions run with option --help
Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de
Stephan Porada <sporada@uni-bielefeld.de>
"""
from argparse import ArgumentParser from argparse import ArgumentParser
from pyflow import WorkflowRunner
import os import os
import re import sys
import subprocess
def parse_arguments(): class FileSetupPipelineJob:
parser = ArgumentParser(description='Merge images (JPEG, PNG or TIFF) into one PDF file.') """An file setup pipeline job class
parser.add_argument('-i', '--input-directory',
Each image containing input directory of the pipeline is represented as an
file setup pipeline job, which holds all necessary information for the
pipeline to process it.
Arguments:
dir -- Path to the directory
output_dir -- Path to a directory, where job results a stored
"""
def __init__(self, dir, output_dir):
self.dir = dir
self.name = os.path.basename(dir)
self.output_dir = output_dir
class FileSetupPipeline(WorkflowRunner):
def __init__(self, input_dir, output_dir, zip):
self.input_dir = input_dir
self.output_dir = output_dir
self.zip = zip
self.jobs = collect_jobs(self.input_dir, self.output_dir)
def workflow(self):
if not self.jobs:
return
'''
' ##################################################
' # setup output directory #
' ##################################################
'''
setup_output_directory_tasks = []
for i, job in enumerate(self.jobs):
cmd = 'mkdir -p "{}"'.format(job.output_dir)
lbl = 'setup_output_directory_-_{}'.format(i)
task = self.addTask(command=cmd, label=lbl)
setup_output_directory_tasks.append(task)
'''
' ##################################################
' # pre file setup #
' ##################################################
'''
pre_file_setup_tasks = []
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa
cmd = 'ls -dQv "{}/"* >> "{}"'.format(job.dir, input_file)
deps = 'setup_output_directory_-_{}'.format(i)
lbl = 'pre_file_setup_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
pre_file_setup_tasks.append(task)
'''
' ##################################################
' # file setup #
' ##################################################
'''
file_setup_tasks = []
n_cores = max(1, int(self.getNCores() / len(self.jobs)))
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa
output_file = os.path.join(job.output_dir, '{}.pdf'.format(job.name)) # noqa
cmd = 'convert "@{}" "{}"'.format(input_file, output_file)
deps = 'pre_file_setup_-_{}'.format(i)
lbl = 'file_setup_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl,
nCores=n_cores)
file_setup_tasks.append(task)
'''
' ##################################################
' # post file setup #
' ##################################################
'''
post_file_setup_tasks = []
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa
cmd = 'rm "{}"'.format(input_file)
deps = 'file_setup_-_{}'.format(i)
lbl = 'post_file_setup_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
post_file_setup_tasks.append(task)
'''
' ##################################################
' # zip creation #
' ##################################################
'''
zip_creation_tasks = []
if self.zip is not None:
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' -r'
cmd += ' "{}.zip" .'.format(self.zip)
cmd += ' -x "pyflow.data*"'
cmd += ' -i "*.pdf"'
cmd += ' && '
cmd += 'cd -'
deps = file_setup_tasks
lbl = 'zip_creation'
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
zip_creation_tasks.append(task)
def collect_jobs(input_dir, output_dir):
jobs = []
for dir in os.listdir(input_dir):
if not os.path.isdir(os.path.join(input_dir, dir)):
continue
# TODO: Filter for file types
if not os.listdir(os.path.join(input_dir, dir)):
continue
job = FileSetupPipelineJob(os.path.join(input_dir, dir),
os.path.join(output_dir, dir))
jobs.append(job)
return jobs
def parse_args():
parser = ArgumentParser(description='A file setup pipeline for image file merging', # noqa
prog='File setup pipeline')
parser.add_argument('-i', '--input-dir',
help='Input directory', help='Input directory',
required=True) required=True)
parser.add_argument('-o', '--output-directory', parser.add_argument('-o', '--output-dir',
help='Output directory', help='Output directory',
required=True) required=True)
parser.add_argument('-f', '--output-file-base', parser.add_argument('--log-dir',
help='output file base', help='Logging directory')
required=True) parser.add_argument('--mem-mb',
parser.add_argument('--log-dir') help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))', # noqa
parser.add_argument('--zip') type=int)
return parser.parse_args() parser.add_argument('--n-cores',
default=1,
help='Number of CPU threads to be used (Default: 1)',
type=int)
parser.add_argument('--zip',
help='Create one zip file per filetype')
parser.add_argument('-v', '--version',
action='version',
help='Returns the current version of the file setup pipeline', # noqa
version='%(prog)s {}'.format(__version__))
args = parser.parse_args()
# Set some tricky default values and check for insufficient input
def natural_sorted(iterable): if args.log_dir is None:
""" Sort the given list in the way that humans expect. args.log_dir = args.output_dir
""" if args.n_cores < 1:
convert = lambda text: int(text) if text.isdigit() else text raise Exception('--n-cores must be greater or equal 1')
alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] if args.mem_mb is None:
return sorted(iterable, key=alphanum_key) max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
if args.mem_mb < 2048:
def merge_images(input_dir, output_dir, output_file_base, zip): raise Exception('--mem-mb must be greater or equal 2048')
try: if args.zip is not None and args.zip.lower().endswith('.zip'):
os.mkdir(output_dir)
except FileExistsError:
pass
files = filter(lambda x: x.lower().endswith(('.jpg', '.jpeg', '.png', '.tif', '.tiff')),
os.listdir(input_dir))
files = natural_sorted(files)
files = map(lambda x: os.path.join(input_dir, x), files)
output_file = os.path.join(output_dir, '{}.pdf'.format(output_file_base))
# Convert input files to a single PDF
cmd = 'convert "{}" "{}"'.format('" "'.join(files), output_file)
subprocess.run(cmd, shell=True)
# zip stuff
if zip is not None:
# Remove .zip file extension if provided # Remove .zip file extension if provided
if zip.lower().endswith('.zip'): args.zip = args.zip[:-4]
zip = zip[:-4] args.zip = args.zip if args.zip else 'output'
zip = zip if zip else 'output' return args
cmd = 'cd "{}"'.format(output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' "{}.zip" "{}.pdf"'.format(zip, output_file_base)
cmd += ' && '
cmd += 'cd -'
subprocess.run(cmd, shell=True)
def main(): def main():
args = parse_arguments() args = parse_args()
merge_images(args.input_directory, file_setup_pipeline = FileSetupPipeline(args.input_dir, args.output_dir, args.zip) # noqa
args.output_directory, retval = file_setup_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa
args.output_file_base, sys.exit(retval)
args.zip)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -4,30 +4,35 @@
from argparse import ArgumentParser from argparse import ArgumentParser
import os import os
import subprocess import subprocess
import sys
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:latest' CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:1.0.0'
CONTAINER_INPUT_DIR = '/input' CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output' CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_LOG_DIR = '/logs'
UID = str(os.getuid()) UID = str(os.getuid())
GID = str(os.getgid()) GID = str(os.getgid())
parser = ArgumentParser(add_help=False) parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-directory') parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-directory') parser.add_argument('-o', '--output-dir')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args() args, remaining_args = parser.parse_known_args()
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
if args.output_directory is not None: if args.input_dir is not None:
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory), mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
CONTAINER_OUTPUT_DIR)] cmd += ['-v', mapping]
remaining_args.insert(0, CONTAINER_OUTPUT_DIR) remaining_args += ['-i', CONTAINER_INPUT_DIR]
remaining_args.insert(0, '-o') if args.output_dir is not None:
if args.input_directory is not None: mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory), cmd += ['-v', mapping]
CONTAINER_INPUT_DIR)] remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
remaining_args.insert(0, CONTAINER_INPUT_DIR) if args.log_dir is not None:
remaining_args.insert(0, '-i') mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
cmd += ['-v', mapping]
remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE) cmd.append(CONTAINER_IMAGE)
cmd += remaining_args cmd += remaining_args
subprocess.run(cmd) sys.exit(subprocess.run(cmd).returncode)