From cdb4f1889c4c69e35bb791212e180f677431310a Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Tue, 18 Jan 2022 12:58:42 +0100
Subject: [PATCH] Update Pipeline structure
---
Dockerfile | 32 +++---
LICENSE | 21 ++++
README.md | 46 ++++-----
file-setup | 246 +++++++++++++++++++++------------------------
wrapper/file-setup | 10 +-
5 files changed, 175 insertions(+), 180 deletions(-)
create mode 100644 LICENSE
diff --git a/Dockerfile b/Dockerfile
index 9cccf1f..7efe836 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,38 +9,32 @@ ENV LANG=C.UTF-8
RUN apt-get update \
&& apt-get install --no-install-recommends --yes \
- wget
+ imagemagick \
+ procps \
+ wget \
+ && mv /etc/ImageMagick-6/policy.xml /etc/ImageMagick-6/policy.xml.bak
-# Install the NLP pipeline and it's dependencies #
+# Install the File setup pipeline and its dependencies #
## Install pyFlow ##
ENV PYFLOW_VERSION=1.1.20
RUN wget --no-check-certificate --quiet \
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
-&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
-&& cd "pyflow-${PYFLOW_VERSION}" \
-&& apt-get install --no-install-recommends --yes \
- python2.7 \
-&& python2.7 setup.py build install \
-&& cd .. \
-&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
+ && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
+ && cd "pyflow-${PYFLOW_VERSION}" \
+ && apt-get install --no-install-recommends --yes \
+ python2.7 \
+ && python2.7 setup.py build install \
+ && cd - > /dev/null \
+ && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
-## Further dependencies ##
-RUN apt-get install --no-install-recommends --yes \
- imagemagick \
- procps \
- python3.7 \
- zip \
- && mv /etc/ImageMagick-6/policy.xml /etc/ImageMagick-6/policy.xml.bak
+RUN rm -r /var/lib/apt/lists/*
## Install Pipeline ##
COPY file-setup /usr/local/bin
-RUN rm -r /var/lib/apt/lists/*
-
-
ENTRYPOINT ["file-setup"]
CMD ["--help"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a374dbc
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index ec53ab0..862e536 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,37 @@
# File setup
-This software implements a parallelized pipeline to setup image files. It is used for nopaque's File setup service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
+This software implements a parallelized pipeline to set up image files. It is used for nopaque's File setup service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
## Software used in this pipeline implementation
-- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
+ - Software from Debian Buster's free repositories
-## Use this image
+## Installation
-1. Create input and output directories for the pipeline.
-``` bash
-mkdir -p //input //output
+1. Install Docker and Python 3.
+2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup.git`
+3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:v0.1.0 file-setup`
+4. Add the wrapper script (`wrapper/file-setup` relative to this README file) to your `${PATH}`.
+5. Create working directories for the pipeline: `mkdir -p //{input,output}`.
+
+## Use the Pipeline
+
+1. Place your image files inside a subdirectory in `//input`. It should look similar to this:
+```
+.
+|-- input
+| |-- alice_in_wonderland
+| |-- page-1.png
+| |-- page-2.png
+| |-- ...
+| `-- page-x.png
+`-- output
```
-
-2. Place your images files inside a directory in `//input`.
3. Start the pipeline process. Check the pipeline help (`file-setup --help`) for more details.
-```
-# Option one: Use the wrapper script
-## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/raw/1.0.0/wrapper/file-setup, make it executeable and add it to your ${PATH}
+```bash
cd /
file-setup -i input -o output
-
-# Option two: Classic Docker style
-docker run \
- --rm \
- -it \
- -u $(id -u $USER):$(id -g $USER) \
- -v //input:/input \
- -v //output:/output \
- gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:1.0.0 \
- -i /input \
- -o /output
```
-
4. Check your results in the `//output` directory.
diff --git a/file-setup b/file-setup
index 8ccb27f..83ee94d 100755
--- a/file-setup
+++ b/file-setup
@@ -1,29 +1,28 @@
#!/usr/bin/env python2.7
# coding=utf-8
-"""A file setup pipeline for image file merging."""
-
-__author__ = 'Patrick Jentsch ,' \
- 'Stephan Porada '
-__version__ = '1.0.0'
+''' File setup pipeline for image file merging. '''
+__version__ = '0.1.0'
from argparse import ArgumentParser
from pyflow import WorkflowRunner
+import json
+import multiprocessing
import os
import sys
-class FileSetupPipelineJob:
- """An file setup pipeline job class
+class PipelineJob:
+ '''
+ File setup pipeline job class.
- Each image containing input directory of the pipeline is represented as an
- file setup pipeline job, which holds all necessary information for the
- pipeline to process it.
+ Each input directory containing images is represented here together with
+ all necessary information for the pipeline to process it.
Arguments:
dir -- Path to the directory
- output_dir -- Path to a directory, where job results a stored
- """
+ output_dir -- Path to a directory, where job results are stored
+ '''
def __init__(self, dir, output_dir):
self.dir = dir
@@ -31,134 +30,118 @@ class FileSetupPipelineJob:
self.output_dir = output_dir
-class FileSetupPipeline(WorkflowRunner):
- def __init__(self, input_dir, output_dir, zip):
+class CreatePDFWorkflow(WorkflowRunner):
+ def __init__(self, job):
+ self.job = job
+
+ def workflow(self):
+ '''
+ ' ##################################################
+ ' # convert #
+ ' ##################################################
+ '''
+ n_cores = min(2, self.getNCores())
+ mem_mb = min(n_cores * 256, self.getMemMb())
+ cmd = 'ls -dv "{}/"* > "{}"'.format(
+ os.path.join(self.job.dir),
+ os.path.join(self.job.output_dir, 'inputs.txt')
+ )
+ cmd += ' && '
+ cmd += 'convert "@{}" "{}"'.format(
+ os.path.join(self.job.output_dir, 'inputs.txt'),
+ os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name))
+ )
+ cmd += ' && '
+ cmd += 'rm "{}"'.format(os.path.join(self.job.output_dir, 'inputs.txt')) # noqa
+ self.addTask(
+ 'convert',
+ command=cmd,
+ memMb=mem_mb,
+ nCores=n_cores
+ )
+
+
+class MainWorkflow(WorkflowRunner):
+ def __init__(self, input_dir, output_dir):
self.input_dir = input_dir
self.output_dir = output_dir
- self.zip = zip
- self.jobs = collect_jobs(self.input_dir, self.output_dir)
+        self.jobs, self.output_files = [], []
+ self.collect_jobs()
+
+ def collect_jobs(self):
+ for dir in os.listdir(self.input_dir):
+ if not os.path.isdir(os.path.join(self.input_dir, dir)):
+ continue
+ if not os.listdir(os.path.join(self.input_dir, dir)):
+ continue
+ # TODO: Filter for file types within the directory
+ job = PipelineJob(
+ os.path.join(self.input_dir, dir),
+ os.path.join(self.output_dir, dir)
+ )
+ self.jobs.append(job)
def workflow(self):
if not self.jobs:
return
+ # Create output and temporary directories
+ for job in self.jobs:
+ os.mkdir(job.output_dir)
+
'''
' ##################################################
- ' # setup output directory #
+ ' # create-pdf #
' ##################################################
'''
- setup_output_directory_tasks = []
+ create_pdf_tasks = []
for i, job in enumerate(self.jobs):
- cmd = 'mkdir -p "{}"'.format(job.output_dir)
- lbl = 'setup_output_directory_-_{}'.format(i)
- task = self.addTask(command=cmd, label=lbl)
- setup_output_directory_tasks.append(task)
+ task = self.addWorkflowTask(
+ 'create_pdf_-_{}'.format(i),
+ CreatePDFWorkflow(job)
+ )
+ create_pdf_tasks.append(task)
- '''
- ' ##################################################
- ' # pre file setup #
- ' ##################################################
- '''
- pre_file_setup_tasks = []
- for i, job in enumerate(self.jobs):
- input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa
- cmd = 'ls -dQv "{}/"* >> "{}"'.format(job.dir, input_file)
- deps = 'setup_output_directory_-_{}'.format(i)
- lbl = 'pre_file_setup_-_{}'.format(i)
- task = self.addTask(command=cmd, dependencies=deps, label=lbl)
- pre_file_setup_tasks.append(task)
-
- '''
- ' ##################################################
- ' # file setup #
- ' ##################################################
- '''
- file_setup_tasks = []
- n_cores = max(1, int(self.getNCores() / len(self.jobs)))
- for i, job in enumerate(self.jobs):
- input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa
- output_file = os.path.join(job.output_dir, '{}.pdf'.format(job.name)) # noqa
- cmd = 'convert "@{}" "{}"'.format(input_file, output_file)
- deps = 'pre_file_setup_-_{}'.format(i)
- lbl = 'file_setup_-_{}'.format(i)
- task = self.addTask(command=cmd, dependencies=deps, label=lbl,
- nCores=n_cores)
- file_setup_tasks.append(task)
-
- '''
- ' ##################################################
- ' # post file setup #
- ' ##################################################
- '''
- post_file_setup_tasks = []
- for i, job in enumerate(self.jobs):
- input_file = os.path.join(job.output_dir, 'file_setup_input_files.txt') # noqa
- cmd = 'rm "{}"'.format(input_file)
- deps = 'file_setup_-_{}'.format(i)
- lbl = 'post_file_setup_-_{}'.format(i)
- task = self.addTask(command=cmd, dependencies=deps, label=lbl)
- post_file_setup_tasks.append(task)
-
- '''
- ' ##################################################
- ' # zip creation #
- ' ##################################################
- '''
- zip_creation_tasks = []
- if self.zip is not None:
- cmd = 'cd "{}"'.format(self.output_dir)
- cmd += ' && '
- cmd += 'zip'
- cmd += ' -r'
- cmd += ' "{}.zip" .'.format(self.zip)
- cmd += ' -x "pyflow.data*"'
- cmd += ' -i "*.pdf"'
- cmd += ' && '
- cmd += 'cd -'
- deps = file_setup_tasks
- lbl = 'zip_creation'
- task = self.addTask(command=cmd, dependencies=deps, label=lbl)
- zip_creation_tasks.append(task)
-
-
-def collect_jobs(input_dir, output_dir):
- jobs = []
- for dir in os.listdir(input_dir):
- if not os.path.isdir(os.path.join(input_dir, dir)):
- continue
- # TODO: Filter for file types
- if not os.listdir(os.path.join(input_dir, dir)):
- continue
- job = FileSetupPipelineJob(os.path.join(input_dir, dir),
- os.path.join(output_dir, dir))
- jobs.append(job)
- return jobs
+ self.waitForTasks()
+ for job in self.jobs:
+ # Track output files
+ relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
+ self.output_files.append(
+ {
+ 'description': 'PDF file without text layer.',
+ 'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
+ 'mimetype': 'application/pdf'
+ }
+ )
+ with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
+ json.dump(self.output_files, f, indent=4)
def parse_args():
- parser = ArgumentParser(description='A file setup pipeline for image file merging', # noqa
- prog='File setup pipeline')
- parser.add_argument('-i', '--input-dir',
- help='Input directory',
- required=True)
- parser.add_argument('-o', '--output-dir',
- help='Output directory',
- required=True)
- parser.add_argument('--log-dir',
- help='Logging directory')
- parser.add_argument('--mem-mb',
- help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))', # noqa
- type=int)
- parser.add_argument('--n-cores',
- default=1,
- help='Number of CPU threads to be used (Default: 1)',
- type=int)
- parser.add_argument('--zip',
- help='Create one zip file per filetype')
- parser.add_argument('-v', '--version',
- action='version',
- help='Returns the current version of the file setup pipeline', # noqa
- version='%(prog)s {}'.format(__version__))
+ parser = ArgumentParser(description='Pipeline for merging images')
+ parser.add_argument(
+ '-i', '--input-dir', help='Input directory', required=True)
+ parser.add_argument(
+ '-o', '--output-dir', help='Output directory', required=True)
+ parser.add_argument(
+ '--log-dir', help='Logging directory (Default: --output-dir)')
+ parser.add_argument(
+ '--mem-mb',
+ help='Amount of system memory to be used (Default: min(--n-cores * 256, available system memory))', # noqa
+ type=int
+ )
+ parser.add_argument(
+ '--n-cores',
+ default=min(2, multiprocessing.cpu_count()),
+ help='Number of CPU threads to be used (Default: min(2, CPU count))',
+ type=int
+ )
+ parser.add_argument(
+ '-v', '--version',
+ action='version',
+ help='Returns the current version of the file setup pipeline',
+ version='%(prog)s {}'.format(__version__)
+ )
args = parser.parse_args()
# Set some tricky default values and check for insufficient input
@@ -168,20 +151,17 @@ def parse_args():
raise Exception('--n-cores must be greater or equal 1')
if args.mem_mb is None:
max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
- args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
- if args.mem_mb < 2048:
- raise Exception('--mem-mb must be greater or equal 2048')
- if args.zip is not None and args.zip.lower().endswith('.zip'):
- # Remove .zip file extension if provided
- args.zip = args.zip[:-4]
- args.zip = args.zip if args.zip else 'output'
+ args.mem_mb = min(args.n_cores * 256, max_mem_mb)
+ if args.mem_mb < 256:
+ raise Exception('--mem-mb must be greater or equal 256')
return args
def main():
args = parse_args()
- file_setup_pipeline = FileSetupPipeline(args.input_dir, args.output_dir, args.zip) # noqa
- retval = file_setup_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa
+ main_workflow = MainWorkflow(args.input_dir, args.output_dir)
+ retval = main_workflow.run(
+ dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa
sys.exit(retval)
diff --git a/wrapper/file-setup b/wrapper/file-setup
index e3ceb8f..bb6c02e 100755
--- a/wrapper/file-setup
+++ b/wrapper/file-setup
@@ -6,7 +6,7 @@ import os
import subprocess
import sys
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/file-setup:v0.1.0' # noqa
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_LOG_DIR = '/logs'
@@ -19,17 +19,17 @@ parser.add_argument('-o', '--output-dir')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args()
-cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
if args.input_dir is not None:
- mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
+ mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-i', CONTAINER_INPUT_DIR]
if args.output_dir is not None:
- mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
+ mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.log_dir is not None:
- mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
+            mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
cmd += ['-v', mapping]
remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE)