From cdb4f1889c4c69e35bb791212e180f677431310a Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 18 Jan 2022 12:58:42 +0100 Subject: [PATCH] Update Pipeline structure --- Dockerfile | 32 +++--- LICENSE | 21 ++++ README.md | 46 ++++----- file-setup | 246 +++++++++++++++++++++------------------------ wrapper/file-setup | 10 +- 5 files changed, 175 insertions(+), 180 deletions(-) create mode 100644 LICENSE diff --git a/Dockerfile b/Dockerfile index 9cccf1f..7efe836 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,38 +9,32 @@ ENV LANG=C.UTF-8 RUN apt-get update \ && apt-get install --no-install-recommends --yes \ - wget + imagemagick \ + procps \ + wget \ + && mv /etc/ImageMagick-6/policy.xml /etc/ImageMagick-6/policy.xml.bak -# Install the NLP pipeline and it's dependencies # +# Install the File setup pipeline and it's dependencies # ## Install pyFlow ## ENV PYFLOW_VERSION=1.1.20 RUN wget --no-check-certificate --quiet \ "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \ -&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ -&& cd "pyflow-${PYFLOW_VERSION}" \ -&& apt-get install --no-install-recommends --yes \ - python2.7 \ -&& python2.7 setup.py build install \ -&& cd .. 
\ -&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" + && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ + && cd "pyflow-${PYFLOW_VERSION}" \ + && apt-get install --no-install-recommends --yes \ + python2.7 \ + && python2.7 setup.py build install \ + && cd - > /dev/null \ + && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" -## Further dependencies ## -RUN apt-get install --no-install-recommends --yes \ - imagemagick \ - procps \ - python3.7 \ - zip \ - && mv /etc/ImageMagick-6/policy.xml /etc/ImageMagick-6/policy.xml.bak +RUN rm -r /var/lib/apt/lists/* ## Install Pipeline ## COPY file-setup /usr/local/bin -RUN rm -r /var/lib/apt/lists/* - - ENTRYPOINT ["file-setup"] CMD ["--help"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a374dbc --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Bielefeld University - CRC 1288 - INF + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index ec53ab0..862e536 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,37 @@ # File setup -This software implements a parallelized pipeline to setup image files. It is used for nopaque's File setup service but you can also use it standalone, for that purpose a convenient wrapper script is provided. +This software implements a parallelized pipeline to setup image files. It is used for nopaque's File setup service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed. ## Software used in this pipeline implementation -- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian +- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian + - Software from Debian Buster's free repositories -## Use this image +## Installation -1. Create input and output directories for the pipeline. -``` bash -mkdir -p //input //output +1. Install Docker and Python 3. +2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup.git` +2. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:v0.1.0 file-setup` +3. Add the wrapper script (`wrapper/filesetup` relative to this README file) to your `${PATH}`. +4. Create working directories for the pipeline: `mkdir -p //{input,output}`. + +## Use the Pipeline + +1. Place your images files inside a subdirectory in `//input`. It should look similar to this: +``` +. +|-- input +| |-- alice_in_wonderland +| |-- page-1.png +| |-- page-2.png +| |-- ... +| `-- page-x.png +`-- output ``` - -2. Place your images files inside a directory in `//input`. 3. Start the pipeline process. Check the pipeline help (`file-setup --help`) for more details. 
-``` -# Option one: Use the wrapper script -## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/raw/1.0.0/wrapper/file-setup, make it executeable and add it to your ${PATH} +```bash cd / file-setup -i input -o output - -# Option two: Classic Docker style -docker run \ - --rm \ - -it \ - -u $(id -u $USER):$(id -g $USER) \ - -v //input:/input \ - -v //output:/output \ - gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:1.0.0 \ - -i /input \ - -o /output ``` - 4. Check your results in the `//output` directory. diff --git a/file-setup b/file-setup index 8ccb27f..83ee94d 100755 --- a/file-setup +++ b/file-setup @@ -1,29 +1,28 @@ #!/usr/bin/env python2.7 # coding=utf-8 -"""A file setup pipeline for image file merging.""" - -__author__ = 'Patrick Jentsch ,' \ - 'Stephan Porada ' -__version__ = '1.0.0' +''' File setup pipeline for image file merging. ''' +__version__ = '0.1.0' from argparse import ArgumentParser from pyflow import WorkflowRunner +import json +import multiprocessing import os import sys -class FileSetupPipelineJob: - """An file setup pipeline job class +class PipelineJob: + ''' + File setup pipeline job class. - Each image containing input directory of the pipeline is represented as an - file setup pipeline job, which holds all necessary information for the - pipeline to process it. + Each input directory containing images is represented here together with + all necessary information for the pipeline to process it. 
Arguments: dir -- Path to the directory - output_dir -- Path to a directory, where job results a stored - """ + output_dir -- Path to a directory, where job results are stored + ''' def __init__(self, dir, output_dir): self.dir = dir @@ -31,134 +30,118 @@ class FileSetupPipelineJob: self.output_dir = output_dir -class FileSetupPipeline(WorkflowRunner): - def __init__(self, input_dir, output_dir, zip): +class CreatePDFWorkflow(WorkflowRunner): + def __init__(self, job): + self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # convert # + ' ################################################## + ''' + n_cores = min(2, self.getNCores()) + mem_mb = min(n_cores * 256, self.getMemMb()) + cmd = 'ls -dv "{}/"* > "{}"'.format( + os.path.join(self.job.dir), + os.path.join(self.job.output_dir, 'inputs.txt') + ) + cmd += ' && ' + cmd += 'convert "@{}" "{}"'.format( + os.path.join(self.job.output_dir, 'inputs.txt'), + os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name)) + ) + cmd += ' && ' + cmd += 'rm "{}"'.format(os.path.join(self.job.output_dir, 'inputs.txt')) # noqa + self.addTask( + 'convert', + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) + + +class MainWorkflow(WorkflowRunner): + def __init__(self, input_dir, output_dir): self.input_dir = input_dir self.output_dir = output_dir - self.zip = zip - self.jobs = collect_jobs(self.input_dir, self.output_dir) + self.jobs = [] + self.collect_jobs() + + def collect_jobs(self): + for dir in os.listdir(self.input_dir): + if not os.path.isdir(os.path.join(self.input_dir, dir)): + continue + if not os.listdir(os.path.join(self.input_dir, dir)): + continue + # TODO: Filter for file types within the directory + job = PipelineJob( + os.path.join(self.input_dir, dir), + os.path.join(self.output_dir, dir) + ) + self.jobs.append(job) def workflow(self): if not self.jobs: return + # Create output and temporary directories + for job in self.jobs: + os.mkdir(job.output_dir) + 
''' ' ################################################## - ' # setup output directory # + ' # create-pdf # ' ################################################## ''' - setup_output_directory_tasks = [] + create_pdf_tasks = [] for i, job in enumerate(self.jobs): - cmd = 'mkdir -p "{}"'.format(job.output_dir) - lbl = 'setup_output_directory_-_{}'.format(i) - task = self.addTask(command=cmd, label=lbl) - setup_output_directory_tasks.append(task) + task = self.addWorkflowTask( + 'create_pdf_-_{}'.format(i), + CreatePDFWorkflow(job) + ) + create_pdf_tasks.append(task) - ''' - ' ################################################## - ' # pre file setup # - ' ################################################## - ''' - pre_file_setup_tasks = [] - for i, job in enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa - cmd = 'ls -dQv "{}/"* >> "{}"'.format(job.dir, input_file) - deps = 'setup_output_directory_-_{}'.format(i) - lbl = 'pre_file_setup_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - pre_file_setup_tasks.append(task) - - ''' - ' ################################################## - ' # file setup # - ' ################################################## - ''' - file_setup_tasks = [] - n_cores = max(1, int(self.getNCores() / len(self.jobs))) - for i, job in enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa - output_file = os.path.join(job.output_dir, '{}.pdf'.format(job.name)) # noqa - cmd = 'convert "@{}" "{}"'.format(input_file, output_file) - deps = 'pre_file_setup_-_{}'.format(i) - lbl = 'file_setup_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl, - nCores=n_cores) - file_setup_tasks.append(task) - - ''' - ' ################################################## - ' # post file setup # - ' ################################################## - ''' - post_file_setup_tasks = [] - for i, job in 
enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa - cmd = 'rm "{}"'.format(input_file) - deps = 'file_setup_-_{}'.format(i) - lbl = 'post_file_setup_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - post_file_setup_tasks.append(task) - - ''' - ' ################################################## - ' # zip creation # - ' ################################################## - ''' - zip_creation_tasks = [] - if self.zip is not None: - cmd = 'cd "{}"'.format(self.output_dir) - cmd += ' && ' - cmd += 'zip' - cmd += ' -r' - cmd += ' "{}.zip" .'.format(self.zip) - cmd += ' -x "pyflow.data*"' - cmd += ' -i "*.pdf"' - cmd += ' && ' - cmd += 'cd -' - deps = file_setup_tasks - lbl = 'zip_creation' - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - zip_creation_tasks.append(task) - - -def collect_jobs(input_dir, output_dir): - jobs = [] - for dir in os.listdir(input_dir): - if not os.path.isdir(os.path.join(input_dir, dir)): - continue - # TODO: Filter for file types - if not os.listdir(os.path.join(input_dir, dir)): - continue - job = FileSetupPipelineJob(os.path.join(input_dir, dir), - os.path.join(output_dir, dir)) - jobs.append(job) - return jobs + self.waitForTasks() + for job in self.jobs: + # Track output files + relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa + self.output_files.append( + { + 'description': 'PDF file without text layer.', + 'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa + 'mimetype': 'application/pdf' + } + ) + with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa + json.dump(self.output_files, f, indent=4) def parse_args(): - parser = ArgumentParser(description='A file setup pipeline for image file merging', # noqa - prog='File setup pipeline') - parser.add_argument('-i', '--input-dir', - help='Input directory', - required=True) - parser.add_argument('-o', 
'--output-dir', - help='Output directory', - required=True) - parser.add_argument('--log-dir', - help='Logging directory') - parser.add_argument('--mem-mb', - help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))', # noqa - type=int) - parser.add_argument('--n-cores', - default=1, - help='Number of CPU threads to be used (Default: 1)', - type=int) - parser.add_argument('--zip', - help='Create one zip file per filetype') - parser.add_argument('-v', '--version', - action='version', - help='Returns the current version of the file setup pipeline', # noqa - version='%(prog)s {}'.format(__version__)) + parser = ArgumentParser(description='Pipeline for merging images') + parser.add_argument( + '-i', '--input-dir', help='Input directory', required=True) + parser.add_argument( + '-o', '--output-dir', help='Output directory', required=True) + parser.add_argument( + '--log-dir', help='Logging directory (Default: --output-dir)') + parser.add_argument( + '--mem-mb', + help='Amount of system memory to be used (Default: min(--n-cores * 256, available system memory))', # noqa + type=int + ) + parser.add_argument( + '--n-cores', + default=min(2, multiprocessing.cpu_count()), + help='Number of CPU threads to be used (Default: min(2, CPU count))', + type=int + ) + parser.add_argument( + '-v', '--version', + action='version', + help='Returns the current version of the file setup pipeline', + version='%(prog)s {}'.format(__version__) + ) args = parser.parse_args() # Set some tricky default values and check for insufficient input @@ -168,20 +151,17 @@ def parse_args(): raise Exception('--n-cores must be greater or equal 1') if args.mem_mb is None: max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0]) - args.mem_mb = min(args.n_cores * 2048, max_mem_mb) - if args.mem_mb < 2048: - raise Exception('--mem-mb must be greater or equal 2048') - if args.zip is not None and args.zip.lower().endswith('.zip'): - # Remove .zip file 
extension if provided - args.zip = args.zip[:-4] - args.zip = args.zip if args.zip else 'output' + args.mem_mb = min(args.n_cores * 256, max_mem_mb) + if args.mem_mb < 256: + raise Exception('--mem-mb must be greater or equal 256') return args def main(): args = parse_args() - file_setup_pipeline = FileSetupPipeline(args.input_dir, args.output_dir, args.zip) # noqa - retval = file_setup_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa + main_workflow = MainWorkflow(args.input_dir, args.output_dir) + retval = main_workflow.run( + dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa sys.exit(retval) diff --git a/wrapper/file-setup b/wrapper/file-setup index e3ceb8f..bb6c02e 100755 --- a/wrapper/file-setup +++ b/wrapper/file-setup @@ -6,7 +6,7 @@ import os import subprocess import sys -CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:1.0.0' +CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:v0.1.0' # noqa CONTAINER_INPUT_DIR = '/input' CONTAINER_OUTPUT_DIR = '/output' CONTAINER_LOG_DIR = '/logs' @@ -19,17 +19,17 @@ parser.add_argument('-o', '--output-dir') parser.add_argument('--log-dir') args, remaining_args = parser.parse_known_args() -cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] +cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}'] if args.input_dir is not None: - mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR + mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}' cmd += ['-v', mapping] remaining_args += ['-i', CONTAINER_INPUT_DIR] if args.output_dir is not None: - mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR + mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' cmd += ['-v', mapping] remaining_args += ['-o', CONTAINER_OUTPUT_DIR] if args.log_dir is not None: - mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR + mapping = 
f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'