rework how pdfs are created 2

This commit is contained in:
Stephan Porada 2020-06-03 10:34:42 +02:00
parent 194156d862
commit 4e17867248
4 changed files with 117 additions and 10 deletions

View File

@ -6,18 +6,21 @@ LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
ENV LANG=C.UTF-8
# The man1 directory has to exist before pdftk can be installed, see
# https://github.com/geerlingguy/ansible-role-java/issues/64
# (-p keeps the step from failing if the directory is already present).
RUN mkdir -p /usr/share/man/man1/
# Install the runtime tools and remove the apt package lists in the SAME layer:
# a cleanup done in a later RUN cannot shrink the layer that already stored them.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      ca-certificates \
      imagemagick \
      python3.7 \
      pdftk \
      zip \
 && rm -rf /var/lib/apt/lists/*
# Drop the distribution's ImageMagick policy; the project's own policy.xml is
# copied into place below.
RUN rm -f /etc/ImageMagick-6/policy.xml
COPY file-setup /usr/local/bin
COPY policy.xml /etc/ImageMagick-6
ENTRYPOINT ["file-setup"]

View File

@ -28,22 +28,39 @@ def parse_arguments():
def merge_images(input_dir, output_dir, output_file_base, zip):
try:
os.mkdir(output_dir)
tmp_dir_path = os.path.join(input_dir, 'tmp')
os.mkdir(tmp_dir_path)
tmp_dir = os.path.join(input_dir, 'tmp')
os.mkdir(tmp_dir)
except FileExistsError:
pass
try:
tmp_dir = os.path.join(input_dir, 'tmp')
os.mkdir(tmp_dir)
except FileExistsError:
pass
# Sort filenames into a list ordered with version flag -v
cmd = 'ls -Q -v "{i}"/*.* > "{i}"/file_list.txt'.format(i=input_dir)
subprocess.run(cmd, shell=True)
cmd = ('mogrify -compress LZW -format pdf ',
+ '-path @"{o}" "{i}"/file_list.txt').format(i=input_dir,
o=tmp_dir_path)
# Convert all image files into pdf files
cmd = ('mogrify -compress LZW -format pdf '
+ '-path "{o}" @"{i}"/file_list.txt').format(i=input_dir,
o=tmp_dir)
subprocess.run(cmd, shell=True)
# remove image files
cmd = 'xargs rm <{i}/file_list.txt'.format(i=input_dir)
subprocess.run(cmd, shell=True)
# remove file list
cmd = 'rm "{i}"/file_list.txt'.format(i=input_dir)
subprocess.run(cmd, shell=True)
cmd = ('pdftk "{tmp_dir_path}"/*.pdf cat ',
+ 'output "{o}"/"{ofb}".pdf').format(i=tmp_dir_path,
# join all pdfs into one pdf
cmd = ('pdftk "{tmp_i}"/*.pdf cat '
+ 'output "{o}"/"{ofb}".pdf').format(tmp_i=tmp_dir,
o=output_dir,
ofb=output_file_base)
subprocess.run(cmd, shell=True)
# remove single pdf files
cmd = 'rm -fr {i}/tmp'.format(i=input_dir)
subprocess.run(cmd, shell=True)
# zip stuff
if zip is not None:
# Remove .zip file extension if provided
if zip.endswith('.zip'):

87
policy.xml Normal file
View File

@ -0,0 +1,87 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE policymap [
<!ELEMENT policymap (policy)+>
<!ELEMENT policy (#PCDATA)>
<!ATTLIST policy domain (delegate|coder|filter|path|resource) #IMPLIED>
<!ATTLIST policy name CDATA #IMPLIED>
<!ATTLIST policy rights CDATA #IMPLIED>
<!ATTLIST policy pattern CDATA #IMPLIED>
<!ATTLIST policy value CDATA #IMPLIED>
]>
<!--
Configure ImageMagick policies.
Domains include system, delegate, coder, filter, path, or resource.
Rights include none, read, write, and execute. Use | to combine them,
for example: "read | write" to permit read from, or write to, a path.
Use a glob expression as a pattern.
Suppose we do not want users to process MPEG video images:
<policy domain="delegate" rights="none" pattern="mpeg:decode" />
Here we do not want users reading images from HTTP:
<policy domain="coder" rights="none" pattern="HTTP" />
Let's prevent users from executing any image filters:
<policy domain="filter" rights="none" pattern="*" />
The /repository file system is restricted to read only. We use a glob
expression to match all paths that start with /repository:
<policy domain="path" rights="read" pattern="/repository/*" />
Let's prevent possible exploits by removing the right to use indirect reads.
<policy domain="path" rights="none" pattern="@*" />
Any large image is cached to disk rather than memory:
<policy domain="resource" name="area" value="1GB"/>
Define arguments for the memory, map, area, width, height, and disk resources
with SI prefixes (e.g. 100MB). In addition, resource policies are maximums
for each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
exceeds policy maximum so memory limit is 1GB).
-->
<policymap>
<!-- <policy domain="system" name="shred" value="2"/> -->
<!-- <policy domain="system" name="precision" value="6"/> -->
<!-- <policy domain="system" name="memory-map" value="anonymous"/> -->
<!-- <policy domain="system" name="max-memory-request" value="256MiB"/> -->
<!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
<policy domain="resource" name="memory" value="256MiB"/>
<policy domain="resource" name="map" value="512MiB"/>
<policy domain="resource" name="width" value="16KP"/>
<policy domain="resource" name="height" value="16KP"/>
<!-- <policy domain="resource" name="list-length" value="128"/> -->
<policy domain="resource" name="area" value="128MB"/>
<policy domain="resource" name="disk" value="1GiB"/>
<!-- <policy domain="resource" name="file" value="768"/> -->
<!-- <policy domain="resource" name="thread" value="4"/> -->
<!-- <policy domain="resource" name="throttle" value="0"/> -->
<!-- <policy domain="resource" name="time" value="3600"/> -->
<!-- <policy domain="coder" rights="none" pattern="MVG" /> -->
<policy domain="module" rights="read|write" pattern="{PS,PDF,XPS}" />
<!-- <policy domain="delegate" rights="none" pattern="HTTPS" /> -->
<!-- <policy domain="path" rights="none" pattern="@*" /> -->
<!-- <policy domain="cache" name="memory-map" value="anonymous"/> -->
<!-- <policy domain="cache" name="synchronize" value="True"/> -->
<!-- <policy domain="cache" name="shared-secret" value="passphrase" stealth="true"/> -->
<!-- <policy domain="system" name="pixel-cache-memory" value="anonymous"/> -->
<!-- <policy domain="system" name="shred" value="2"/> -->
<!-- <policy domain="system" name="precision" value="6"/> -->
<!-- not needed due to the need to use explicitly by mvg: -->
<!-- <policy domain="delegate" rights="none" pattern="MVG" /> -->
<!-- use curl -->
<policy domain="delegate" rights="none" pattern="URL" />
<policy domain="delegate" rights="none" pattern="HTTPS" />
<policy domain="delegate" rights="none" pattern="HTTP" />
<!-- in order to avoid getting an image with password text -->
<!-- <policy domain="path" rights="none" pattern="@*"/> -->
</policymap>

View File

@ -14,6 +14,7 @@ GID = str(os.getgid())
parser = ArgumentParser(add_help=False)
parser.add_argument('-i')
parser.add_argument('-o')
parser.add_argument('-f')
args, remaining_args = parser.parse_known_args()
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
@ -25,5 +26,4 @@ if args.i is not None:
remaining_args.insert(0, CONTAINER_INPUT_DIR)
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args
subprocess.run(cmd)