diff --git a/Dockerfile b/Dockerfile
index b715aec..c848bad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,6 +19,7 @@ RUN apt-get update \
         python2.7 \
         python3.5 \
         wget \
+        zip \
     && rm -rf /var/lib/apt/lists/*
 
 ENV OCROPY_VERSION 1.3.3
diff --git a/ocr b/ocr
index 28ecbd9..4a8c6bf 100755
--- a/ocr
+++ b/ocr
@@ -16,6 +16,7 @@ import os
 import re
 import sys
 from pyflow import WorkflowRunner
+from zipfile import ZipFile
 
 
 def parse_arguments():
@@ -74,6 +75,7 @@ class OCRWorkflow(WorkflowRunner):
         self.keep_intermediates = args.keep_intermediates
         self.lang = args.lang
         self.n_cores = args.n_cores
+        self.output_dir = args.output_dir
 
     def workflow(self):
         if len(self.jobs) == 0:
@@ -384,6 +386,68 @@ class OCRWorkflow(WorkflowRunner):
                 )
             )
 
+        # Archive every PDF, TXT and XML result into all.zip. The three globs
+        # are spelled out because `*.{pdf,txt,xml}` relies on bash brace
+        # expansion, which a plain POSIX sh (e.g. dash) does not perform.
+        # NOTE(review): `index` in the labels below is defined outside this
+        # hunk -- confirm it holds the intended value at this point.
+        all_zip_jobs = []
+        all_zip_job_dependencies = (hocr_to_tei_jobs
+                                    + pdf_merge_jobs
+                                    + txt_merge_jobs)
+        cmd = 'cd "%s" && zip all.zip */*.pdf */*.txt */*.xml -x "pyflow.data*" && cd -' % (
+            self.output_dir
+        )
+        all_zip_jobs.append(
+            self.addTask(
+                command=cmd,
+                dependencies=all_zip_job_dependencies,
+                label='all_zip_job_-_%i' % (index)
+            )
+        )
+
+        # Archive the PDF files into pdf.zip; waits for the PDF merge jobs.
+        pdf_zip_jobs = []
+        pdf_zip_job_dependencies = pdf_merge_jobs
+        cmd = 'cd "%s" && zip pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
+            self.output_dir
+        )
+        pdf_zip_jobs.append(
+            self.addTask(
+                command=cmd,
+                dependencies=pdf_zip_job_dependencies,
+                label='pdf_zip_job_-_%i' % (index)
+            )
+        )
+
+        # Archive the TXT files into txt.zip; waits for the TXT merge jobs.
+        txt_zip_jobs = []
+        txt_zip_job_dependencies = txt_merge_jobs
+        cmd = 'cd "%s" && zip txt.zip */*.txt -x "pyflow.data*" && cd -' % (
+            self.output_dir
+        )
+        txt_zip_jobs.append(
+            self.addTask(
+                command=cmd,
+                dependencies=txt_zip_job_dependencies,
+                label='txt_zip_job_-_%i' % (index)
+            )
+        )
+
+        # Archive the XML files into xml.zip; waits for the hOCR-to-TEI jobs.
+        xml_zip_jobs = []
+        xml_zip_job_dependencies = hocr_to_tei_jobs
+        cmd = 'cd "%s" && zip xml.zip */*.xml -x "pyflow.data*" && cd -' % (
+            self.output_dir
+        )
+        xml_zip_jobs.append(
+            self.addTask(
+                command=cmd,
+                dependencies=xml_zip_job_dependencies,
+                label='xml_zip_job_-_%i' % (index)
+            )
+        )
+
         '''
         ' ##################################################
         ' # Cleanup                                        #