mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup.git
				synced 2025-10-26 08:21:14 +00:00 
			
		
		
		
	Use pyFlow for file setup service
This commit is contained in:
		| @@ -1,8 +1,5 @@ | ||||
| image: docker:19.03.13 | ||||
|  | ||||
| variables: | ||||
|   DOCKER_TLS_CERTDIR: "/certs" | ||||
|  | ||||
| services: | ||||
|   - docker:19.03.13-dind | ||||
|  | ||||
| @@ -10,6 +7,10 @@ stages: | ||||
|   - build | ||||
|   - push | ||||
|  | ||||
| variables: | ||||
|   DOCKER_TLS_CERTDIR: "/certs" | ||||
|   INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA | ||||
|  | ||||
| .reg_setup: | ||||
|   before_script: | ||||
|     - apk add --no-cache curl | ||||
| @@ -28,8 +29,6 @@ build_image: | ||||
|   stage: build | ||||
|   tags: | ||||
|     - docker | ||||
|   variables: | ||||
|     INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|  | ||||
| push_master: | ||||
|   extends: | ||||
| @@ -47,7 +46,6 @@ push_master: | ||||
|     - docker | ||||
|   variables: | ||||
|     IMAGE_TAG: $CI_REGISTRY_IMAGE:latest | ||||
|     INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|  | ||||
| push_other: | ||||
|   extends: | ||||
| @@ -68,4 +66,3 @@ push_other: | ||||
|     - docker | ||||
|   variables: | ||||
|     IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME | ||||
|     INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA | ||||
|   | ||||
							
								
								
									
										31
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										31
									
								
								Dockerfile
									
									
									
									
									
								
							| @@ -1,7 +1,7 @@ | ||||
| FROM debian:buster-slim | ||||
|  | ||||
|  | ||||
| LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>" | ||||
| LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>" | ||||
|  | ||||
|  | ||||
| ENV LANG=C.UTF-8 | ||||
| @@ -9,17 +9,38 @@ ENV LANG=C.UTF-8 | ||||
|  | ||||
| RUN apt-get update \ | ||||
|  && apt-get install --no-install-recommends --yes \ | ||||
|      wget | ||||
|  | ||||
|  | ||||
| # Install the NLP pipeline and it's dependencies # | ||||
| ## Install pyFlow ## | ||||
| ENV PYFLOW_VERSION=1.1.20 | ||||
| RUN wget --no-check-certificate --quiet \ | ||||
|       "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \ | ||||
| && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ | ||||
| && cd "pyflow-${PYFLOW_VERSION}" \ | ||||
| && apt-get install --no-install-recommends --yes \ | ||||
|      python2.7 \ | ||||
| && python2.7 setup.py build install \ | ||||
| && cd .. \ | ||||
| && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" | ||||
|  | ||||
|  | ||||
| ## Further dependencies ## | ||||
| RUN apt-get install --no-install-recommends --yes \ | ||||
|       imagemagick \ | ||||
|       procps \ | ||||
|       python3.7 \ | ||||
|       zip \ | ||||
|  && rm -r /var/lib/apt/lists/* | ||||
|  | ||||
|  | ||||
| RUN mv /etc/ImageMagick-6/policy.xml /etc/ImageMagick-6/policy.xml.bak | ||||
|  && mv /etc/ImageMagick-6/policy.xml /etc/ImageMagick-6/policy.xml.bak | ||||
|  | ||||
|  | ||||
| ## Install Pipeline ## | ||||
| COPY file-setup /usr/local/bin | ||||
|  | ||||
|  | ||||
| RUN rm -r /var/lib/apt/lists/* | ||||
|  | ||||
|  | ||||
| ENTRYPOINT ["file-setup"] | ||||
| CMD ["--help"] | ||||
|   | ||||
							
								
								
									
										37
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										37
									
								
								README.md
									
									
									
									
									
								
							| @@ -0,0 +1,37 @@ | ||||
| # File setup | ||||
|  | ||||
| This software implements a parallelized pipeline to setup image files. It is used for nopaque's File setup service but you can also use it standalone, for that purpose a convenient wrapper script is provided. | ||||
|  | ||||
| ## Software used in this pipeline implementation | ||||
| - Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian | ||||
|  | ||||
|  | ||||
| ## Use this image | ||||
|  | ||||
| 1. Create input and output directories for the pipeline. | ||||
| ``` bash | ||||
| mkdir -p /<my_data_location>/input /<my_data_location>/output | ||||
| ``` | ||||
|  | ||||
| 2. Place your images files inside a directory in `/<my_data_location>/input`. | ||||
|  | ||||
| 3. Start the pipeline process. Check the pipeline help (`file-setup --help`) for more details. | ||||
| ``` | ||||
| # Option one: Use the wrapper script | ||||
| ## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/raw/1.0.0/wrapper/file-setup, make it executeable and add it to your ${PATH} | ||||
| cd /<my_data_location> | ||||
| file-setup -i input -o output | ||||
|  | ||||
| # Option two: Classic Docker style | ||||
| docker run \ | ||||
|     --rm \ | ||||
|     -it \ | ||||
|     -u $(id -u $USER):$(id -g $USER) \ | ||||
|     -v /<my_data_location>/input:/input \ | ||||
|     -v /<my_data_location>/output:/output \ | ||||
|     gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:1.0.0 \ | ||||
|         -i /input \ | ||||
|         -o /output | ||||
| ``` | ||||
|  | ||||
| 4. Check your results in the `/<my_data_location>/output` directory. | ||||
|   | ||||
							
								
								
									
										221
									
								
								file-setup
									
									
									
									
									
								
							
							
						
						
									
										221
									
								
								file-setup
									
									
									
									
									
								
							| @@ -1,79 +1,188 @@ | ||||
| #!/usr/bin/env python3.7 | ||||
| #!/usr/bin/env python2.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| """A file setup pipeline for image file merging.""" | ||||
|  | ||||
| """ | ||||
| file-setup | ||||
|  | ||||
| Usage: For usage instructions run with option --help | ||||
| Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de | ||||
|          Stephan Porada <sporada@uni-bielefeld.de> | ||||
| """ | ||||
| __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \ | ||||
|              'Stephan Porada <porada@posteo.de>' | ||||
| __version__ = '1.0.0' | ||||
|  | ||||
| from argparse import ArgumentParser | ||||
| from pyflow import WorkflowRunner | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
| import sys | ||||
|  | ||||
|  | ||||
| def parse_arguments(): | ||||
|     parser = ArgumentParser(description='Merge images (JPEG, PNG or TIFF) into one PDF file.') | ||||
|     parser.add_argument('-i', '--input-directory', | ||||
|                         help='Input directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('-o', '--output-directory', | ||||
|                         help='Output directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('-f', '--output-file-base', | ||||
|                         help='output file base', | ||||
|                         required=True) | ||||
|     parser.add_argument('--log-dir') | ||||
|     parser.add_argument('--zip') | ||||
|     return parser.parse_args() | ||||
| class FileSetupPipelineJob: | ||||
|     """An file setup pipeline job class | ||||
|  | ||||
|     Each image containing input directory of the pipeline is represented as an | ||||
|     file setup pipeline job, which holds all necessary information for the | ||||
|     pipeline to process it. | ||||
|  | ||||
| def natural_sorted(iterable): | ||||
|     """ Sort the given list in the way that humans expect. | ||||
|     Arguments: | ||||
|     dir -- Path to the directory | ||||
|     output_dir -- Path to a directory, where job results a stored | ||||
|     """ | ||||
|     convert = lambda text: int(text) if text.isdigit() else text | ||||
|     alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] | ||||
|     return sorted(iterable, key=alphanum_key) | ||||
|  | ||||
|     def __init__(self, dir, output_dir): | ||||
|         self.dir = dir | ||||
|         self.name = os.path.basename(dir) | ||||
|         self.output_dir = output_dir | ||||
|  | ||||
|  | ||||
| def merge_images(input_dir, output_dir, output_file_base, zip): | ||||
|     try: | ||||
|         os.mkdir(output_dir) | ||||
|     except FileExistsError: | ||||
|         pass | ||||
|     files = filter(lambda x: x.lower().endswith(('.jpg', '.jpeg', '.png', '.tif', '.tiff')), | ||||
|                    os.listdir(input_dir)) | ||||
|     files = natural_sorted(files) | ||||
|     files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|     output_file = os.path.join(output_dir, '{}.pdf'.format(output_file_base)) | ||||
|     # Convert input files to a single PDF | ||||
|     cmd = 'convert "{}" "{}"'.format('" "'.join(files), output_file) | ||||
|     subprocess.run(cmd, shell=True) | ||||
|     # zip stuff | ||||
|     if zip is not None: | ||||
|         # Remove .zip file extension if provided | ||||
|         if zip.lower().endswith('.zip'): | ||||
|             zip = zip[:-4] | ||||
|             zip = zip if zip else 'output' | ||||
|         cmd = 'cd "{}"'.format(output_dir) | ||||
| class FileSetupPipeline(WorkflowRunner): | ||||
|     def __init__(self, input_dir, output_dir, zip): | ||||
|         self.input_dir = input_dir | ||||
|         self.output_dir = output_dir | ||||
|         self.zip = zip | ||||
|         self.jobs = collect_jobs(self.input_dir, self.output_dir) | ||||
|  | ||||
|     def workflow(self): | ||||
|         if not self.jobs: | ||||
|             return | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # setup output directory                         # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         setup_output_directory_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             cmd = 'mkdir -p "{}"'.format(job.output_dir) | ||||
|             lbl = 'setup_output_directory_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, label=lbl) | ||||
|             setup_output_directory_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # pre file setup                                 # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         pre_file_setup_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt')  # noqa | ||||
|             cmd = 'ls -dQv "{}/"* >> "{}"'.format(job.dir, input_file) | ||||
|             deps = 'setup_output_directory_-_{}'.format(i) | ||||
|             lbl = 'pre_file_setup_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) | ||||
|             pre_file_setup_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # file setup                                     # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         file_setup_tasks = [] | ||||
|         n_cores = max(1, int(self.getNCores() / len(self.jobs))) | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt')  # noqa | ||||
|             output_file = os.path.join(job.output_dir, '{}.pdf'.format(job.name))  # noqa | ||||
|             cmd = 'convert "@{}" "{}"'.format(input_file, output_file) | ||||
|             deps = 'pre_file_setup_-_{}'.format(i) | ||||
|             lbl = 'file_setup_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl, | ||||
|                                 nCores=n_cores) | ||||
|             file_setup_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # post file setup                                # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         post_file_setup_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt')  # noqa | ||||
|             cmd = 'rm "{}"'.format(input_file) | ||||
|             deps = 'file_setup_-_{}'.format(i) | ||||
|             lbl = 'post_file_setup_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) | ||||
|             post_file_setup_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # zip creation                                   # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         zip_creation_tasks = [] | ||||
|         if self.zip is not None: | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
|         cmd += ' "{}.zip" "{}.pdf"'.format(zip, output_file_base) | ||||
|             cmd += ' -r' | ||||
|             cmd += ' "{}.zip" .'.format(self.zip) | ||||
|             cmd += ' -x "pyflow.data*"' | ||||
|             cmd += ' -i "*.pdf"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|         subprocess.run(cmd, shell=True) | ||||
|             deps = file_setup_tasks | ||||
|             lbl = 'zip_creation' | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) | ||||
|             zip_creation_tasks.append(task) | ||||
|  | ||||
|  | ||||
| def collect_jobs(input_dir, output_dir): | ||||
|     jobs = [] | ||||
|     for dir in os.listdir(input_dir): | ||||
|         if not os.path.isdir(os.path.join(input_dir, dir)): | ||||
|             continue | ||||
|         # TODO: Filter for file types | ||||
|         if not os.listdir(os.path.join(input_dir, dir)): | ||||
|             continue | ||||
|         job = FileSetupPipelineJob(os.path.join(input_dir, dir), | ||||
|                                    os.path.join(output_dir, dir)) | ||||
|         jobs.append(job) | ||||
|     return jobs | ||||
|  | ||||
|  | ||||
| def parse_args(): | ||||
|     parser = ArgumentParser(description='A file setup pipeline for image file merging',  # noqa | ||||
|                             prog='File setup pipeline') | ||||
|     parser.add_argument('-i', '--input-dir', | ||||
|                         help='Input directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('-o', '--output-dir', | ||||
|                         help='Output directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('--log-dir', | ||||
|                         help='Logging directory') | ||||
|     parser.add_argument('--mem-mb', | ||||
|                         help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa | ||||
|                         type=int) | ||||
|     parser.add_argument('--n-cores', | ||||
|                         default=1, | ||||
|                         help='Number of CPU threads to be used (Default: 1)', | ||||
|                         type=int) | ||||
|     parser.add_argument('--zip', | ||||
|                         help='Create one zip file per filetype') | ||||
|     parser.add_argument('-v', '--version', | ||||
|                         action='version', | ||||
|                         help='Returns the current version of the file setup pipeline',  # noqa | ||||
|                         version='%(prog)s {}'.format(__version__)) | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     # Set some tricky default values and check for insufficient input | ||||
|     if args.log_dir is None: | ||||
|         args.log_dir = args.output_dir | ||||
|     if args.n_cores < 1: | ||||
|         raise Exception('--n-cores must be greater or equal 1') | ||||
|     if args.mem_mb is None: | ||||
|         max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0]) | ||||
|         args.mem_mb = min(args.n_cores * 2048, max_mem_mb) | ||||
|     if args.mem_mb < 2048: | ||||
|         raise Exception('--mem-mb must be greater or equal 2048') | ||||
|     if args.zip is not None and args.zip.lower().endswith('.zip'): | ||||
|         # Remove .zip file extension if provided | ||||
|         args.zip = args.zip[:-4] | ||||
|         args.zip = args.zip if args.zip else 'output' | ||||
|     return args | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|     args = parse_arguments() | ||||
|     merge_images(args.input_directory, | ||||
|                  args.output_directory, | ||||
|                  args.output_file_base, | ||||
|                  args.zip) | ||||
|     args = parse_args() | ||||
|     file_setup_pipeline = FileSetupPipeline(args.input_dir, args.output_dir, args.zip)  # noqa | ||||
|     retval = file_setup_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa | ||||
|     sys.exit(retval) | ||||
|  | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|   | ||||
| @@ -4,30 +4,35 @@ | ||||
| from argparse import ArgumentParser | ||||
| import os | ||||
| import subprocess | ||||
| import sys | ||||
|  | ||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:latest' | ||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:1.0.0' | ||||
| CONTAINER_INPUT_DIR = '/input' | ||||
| CONTAINER_OUTPUT_DIR = '/output' | ||||
| CONTAINER_LOG_DIR = '/logs' | ||||
| UID = str(os.getuid()) | ||||
| GID = str(os.getgid()) | ||||
|  | ||||
| parser = ArgumentParser(add_help=False) | ||||
| parser.add_argument('-i', '--input-directory') | ||||
| parser.add_argument('-o', '--output-directory') | ||||
| parser.add_argument('-i', '--input-dir') | ||||
| parser.add_argument('-o', '--output-dir') | ||||
| parser.add_argument('--log-dir') | ||||
| args, remaining_args = parser.parse_known_args() | ||||
|  | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] | ||||
| if args.output_directory is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory), | ||||
|                                  CONTAINER_OUTPUT_DIR)] | ||||
|     remaining_args.insert(0, CONTAINER_OUTPUT_DIR) | ||||
|     remaining_args.insert(0, '-o') | ||||
| if args.input_directory is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory), | ||||
|                                  CONTAINER_INPUT_DIR)] | ||||
|     remaining_args.insert(0, CONTAINER_INPUT_DIR) | ||||
|     remaining_args.insert(0, '-i') | ||||
| if args.input_dir is not None: | ||||
|     mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['-i', CONTAINER_INPUT_DIR] | ||||
| if args.output_dir is not None: | ||||
|     mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['-o', CONTAINER_OUTPUT_DIR] | ||||
| if args.log_dir is not None: | ||||
|     mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['--log-dir', CONTAINER_LOG_DIR] | ||||
| cmd.append(CONTAINER_IMAGE) | ||||
| cmd += remaining_args | ||||
|  | ||||
| subprocess.run(cmd) | ||||
| sys.exit(subprocess.run(cmd).returncode) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user