Less complex, lower output file size.

This commit is contained in:
Patrick Jentsch 2020-09-18 11:17:29 +02:00
parent 09b40c47c5
commit 18e5ee21dc
4 changed files with 30 additions and 125 deletions

View File

@ -1,26 +1,24 @@
FROM debian:10-slim
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
ENV LANG=C.UTF-8
RUN mkdir /usr/share/man/man1/ # needed to install pdftk https://github.com/geerlingguy/ansible-role-java/issues/64
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
ca-certificates \
imagemagick \
python3.7 \
pdftk \
zip
zip \
&& rm -r /var/lib/apt/lists/*
RUN rm -rf /var/lib/apt/lists/*
RUN rm -f /etc/ImageMagick-6/policy.xml
RUN cat /etc/ImageMagick-6/policy.xml && rm /etc/ImageMagick-6/policy.xml
COPY file-setup /usr/local/bin
COPY policy.xml /etc/ImageMagick-6
ENTRYPOINT ["file-setup"]

View File

@ -3,20 +3,21 @@
"""
merge_images
Usage: For usage instructions run with option --help
Author: Stephan Porada <sporada@uni-bielefeld.de>
file-setup
Usage: For usage instructions run with option --help
Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
Stephan Porada <sporada@uni-bielefeld.de>
"""
from argparse import ArgumentParser
import os
import re
import subprocess
def parse_arguments():
parser = ArgumentParser(description='Merges images into one .pdf file.')
parser = ArgumentParser(description='Merge images (JPEG, PNG or TIFF) into one PDF file.')
parser.add_argument('-i', '--input-directory',
help='Input directory',
required=True)
@ -31,45 +32,37 @@ def parse_arguments():
return parser.parse_args()
def natural_sorted(iterable):
    """Return the items of *iterable* sorted the way humans expect.

    Runs of ASCII digits inside each string are compared by their numeric
    value, so 'img2.png' sorts before 'img10.png'. Everything else is
    compared as plain text.

    Args:
        iterable: An iterable of strings (e.g. file names).

    Returns:
        A new list containing the items in natural order.
    """
    def alphanum_key(key):
        # re.split() with a capturing group always yields the alternating
        # sequence text, digits, text, ... so every odd-indexed part is a
        # run of ASCII digits. Converting by position (instead of calling
        # str.isdigit() on every part) keeps text parts such as a lone
        # superscript digit from being passed to int(), and guarantees
        # that ints are only ever compared with ints.
        parts = re.split(r'([0-9]+)', key)
        return [int(part) if index % 2 else part
                for index, part in enumerate(parts)]

    return sorted(iterable, key=alphanum_key)
def merge_images(input_dir, output_dir, output_file_base, zip):
try:
os.mkdir(output_dir)
tmp_dir = os.path.join(input_dir, 'tmp')
os.mkdir(tmp_dir)
except FileExistsError:
pass
try:
tmp_dir = os.path.join(input_dir, 'tmp')
os.mkdir(tmp_dir)
except FileExistsError:
pass
# Sort filenames into a list ordered with version flag -v
cmd = 'ls -Q -v "{i}"/*.* > "{i}"/file_list.txt'.format(i=input_dir)
subprocess.run(cmd, shell=True)
# Convert all image files into pdf files
cmd = ('mogrify -compress LZW -format pdf '
+ '-path "{}" @"{}"/file_list.txt'.format(tmp_dir, input_dir))
subprocess.run(cmd, shell=True)
# remove file list
cmd = 'rm "{}"/file_list.txt'.format(input_dir)
subprocess.run(cmd, shell=True)
# join all pdfs into one pdf
cmd = ('pdftk "{}"/*.pdf cat '.format(tmp_dir)
+ 'output "{}"/"{}".pdf').format(output_dir, output_file_base)
subprocess.run(cmd, shell=True)
# remove single pdf files
cmd = 'rm -r "{}"'.format(tmp_dir)
files = filter(lambda x: x.lower().endswith(('.jpg', '.jpeg', '.png', '.tif', '.tiff')),
os.listdir(input_dir))
files = natural_sorted(files)
files = map(lambda x: os.path.join(input_dir, x), files)
output_file = os.path.join(output_dir, '{}.pdf'.format(output_file_base))
# Convert input files to a single PDF
cmd = 'convert "{}" "{}"'.format('" "'.join(files), output_file)
subprocess.run(cmd, shell=True)
# zip stuff
if zip is not None:
# Remove .zip file extension if provided
if zip.endswith('.zip'):
if zip.lower().endswith('.zip'):
zip = zip[:-4]
zip = zip if zip else 'output'
cmd = 'cd "{}"'.format(output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' "{}".zip "{}".pdf'.format(zip, output_file_base)
cmd += ' "{}.zip" "{}.pdf"'.format(zip, output_file_base)
cmd += ' && '
cmd += 'cd -'
subprocess.run(cmd, shell=True)

View File

@ -1,87 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE policymap [
<!ELEMENT policymap (policy)+>
<!ELEMENT policy (#PCDATA)>
<!ATTLIST policy domain (delegate|coder|filter|path|resource) #IMPLIED>
<!ATTLIST policy name CDATA #IMPLIED>
<!ATTLIST policy rights CDATA #IMPLIED>
<!ATTLIST policy pattern CDATA #IMPLIED>
<!ATTLIST policy value CDATA #IMPLIED>
]>
<!--
Configure ImageMagick policies.
Domains include system, delegate, coder, filter, path, or resource.
Rights include none, read, write, and execute. Use | to combine them,
for example: "read | write" to permit read from, or write to, a path.
Use a glob expression as a pattern.
Suppose we do not want users to process MPEG video images:
<policy domain="delegate" rights="none" pattern="mpeg:decode" />
Here we do not want users reading images from HTTP:
<policy domain="coder" rights="none" pattern="HTTP" />
Let's prevent users from executing any image filters:
<policy domain="filter" rights="none" pattern="*" />
The /repository file system is restricted to read only. We use a glob
expression to match all paths that start with /repository:
<policy domain="path" rights="read" pattern="/repository/*" />
Let's prevent possible exploits by removing the right to use indirect reads.
<policy domain="path" rights="none" pattern="@*" />
Any large image is cached to disk rather than memory:
<policy domain="resource" name="area" value="1GB"/>
Define arguments for the memory, map, area, width, height, and disk resources
with SI prefixes (e.g. 100MB). In addition, resource policies are maximums
for each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
exceeds policy maximum so memory limit is 1GB).
-->
<policymap>
<!-- <policy domain="system" name="shred" value="2"/> -->
<!-- <policy domain="system" name="precision" value="6"/> -->
<!-- <policy domain="system" name="memory-map" value="anonymous"/> -->
<!-- <policy domain="system" name="max-memory-request" value="256MiB"/> -->
<!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
<policy domain="resource" name="memory" value="256MiB"/>
<policy domain="resource" name="map" value="512MiB"/>
<policy domain="resource" name="width" value="16KP"/>
<policy domain="resource" name="height" value="16KP"/>
<!-- <policy domain="resource" name="list-length" value="128"/> -->
<policy domain="resource" name="area" value="128MB"/>
<policy domain="resource" name="disk" value="1GiB"/>
<!-- <policy domain="resource" name="file" value="768"/> -->
<!-- <policy domain="resource" name="thread" value="4"/> -->
<!-- <policy domain="resource" name="throttle" value="0"/> -->
<!-- <policy domain="resource" name="time" value="3600"/> -->
<!-- <policy domain="coder" rights="none" pattern="MVG" /> -->
<policy domain="module" rights="read|write" pattern="{PS,PDF,XPS}" />
<!-- <policy domain="delegate" rights="none" pattern="HTTPS" /> -->
<!-- <policy domain="path" rights="none" pattern="@*" /> -->
<!-- <policy domain="cache" name="memory-map" value="anonymous"/> -->
<!-- <policy domain="cache" name="synchronize" value="True"/> -->
<!-- <policy domain="cache" name="shared-secret" value="passphrase" stealth="true"/> -->
<!-- <policy domain="system" name="pixel-cache-memory" value="anonymous"/> -->
<!-- <policy domain="system" name="shred" value="2"/> -->
<!-- <policy domain="system" name="precision" value="6"/> -->
<!-- not needed due to the need to use explicitly by mvg: -->
<!-- <policy domain="delegate" rights="none" pattern="MVG" /> -->
<!-- use curl -->
<policy domain="delegate" rights="none" pattern="URL" />
<policy domain="delegate" rights="none" pattern="HTTPS" />
<policy domain="delegate" rights="none" pattern="HTTP" />
<!-- in order to avoid getting an image containing password text -->
<!-- <policy domain="path" rights="none" pattern="@*"/> -->
</policymap>

View File

@ -29,4 +29,5 @@ if args.input_directory is not None:
remaining_args.insert(0, '-i')
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args
subprocess.run(cmd)