From 6c4a642cb708033076af9bdfc7ec566ce6fefdb8 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Mon, 3 Feb 2020 15:00:27 +0100 Subject: [PATCH] Add a switch for zip functionality --- ocr | 102 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/ocr b/ocr index 4a8c6bf..a25e4c2 100755 --- a/ocr +++ b/ocr @@ -65,6 +65,14 @@ def parse_arguments(): required=False, type=int ) + parser.add_argument( + '--zip', + action='store_true', + default=False, + dest='zip', + help='package result files in zip bundles', + required=False + ) return parser.parse_args() @@ -76,6 +84,7 @@ class OCRWorkflow(WorkflowRunner): self.lang = args.lang self.n_cores = args.n_cores self.output_dir = args.output_dir + self.zip = args.zip def workflow(self): if len(self.jobs) == 0: @@ -386,59 +395,60 @@ class OCRWorkflow(WorkflowRunner): ) ) - all_zip_jobs = [] - all_zip_job_dependencies = (hocr_to_tei_jobs - + pdf_merge_jobs - + txt_merge_jobs) - cmd = 'cd "%s" && zip all.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % ( - self.output_dir - ) - all_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=all_zip_job_dependencies, - label='all_zip_job_-_%i' % (index) + if self.zip: + all_zip_jobs = [] + all_zip_job_dependencies = (hocr_to_tei_jobs + + pdf_merge_jobs + + txt_merge_jobs) + cmd = 'cd "%s" && zip all.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % ( + self.output_dir + ) + all_zip_jobs.append( + self.addTask( + command=cmd, + dependencies=all_zip_job_dependencies, + label='all_zip_job' + ) ) - ) - pdf_zip_jobs = [] - pdf_zip_job_dependencies = pdf_merge_jobs - cmd = 'cd "%s" && zip pdf.zip */*.pdf -x "pyflow.data*" && cd -' % ( - self.output_dir - ) - pdf_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=pdf_zip_job_dependencies, - label='pdf_zip_job_-_%i' % (index) + pdf_zip_jobs = [] + pdf_zip_job_dependencies = all_zip_jobs + cmd = 'cd "%s" && zip -m pdf.zip */*.pdf -x "pyflow.data*" && cd -' % ( + self.output_dir + ) + pdf_zip_jobs.append( + self.addTask( + command=cmd, + dependencies=pdf_zip_job_dependencies, + label='pdf_zip_job' + ) ) - ) - txt_zip_jobs = [] - txt_zip_job_dependencies = txt_merge_jobs - cmd = 'cd "%s" && zip txt.zip */*.txt -x "pyflow.data*" && cd -' % ( - self.output_dir - ) - txt_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=txt_zip_job_dependencies, - label='txt_zip_job_-_%i' % (index) + txt_zip_jobs = [] + txt_zip_job_dependencies = all_zip_jobs + cmd = 'cd "%s" && zip -m txt.zip */*.txt -x "pyflow.data*" && cd -' % ( + self.output_dir + ) + txt_zip_jobs.append( + self.addTask( + command=cmd, + dependencies=txt_zip_job_dependencies, + label='txt_zip_job' + ) ) - ) - xml_zip_jobs = [] - xml_zip_job_dependencies = hocr_to_tei_jobs - cmd = 'cd "%s" && zip xml.zip */*.xml -x "pyflow.data*" && cd -' % ( - self.output_dir - ) - xml_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=xml_zip_job_dependencies, - label='xml_zip_job_-_%i' % (index) + xml_zip_jobs = [] + xml_zip_job_dependencies = all_zip_jobs + cmd = 'cd "%s" && zip -m xml.zip */*.xml -x "pyflow.data*" && cd -' % ( + self.output_dir + ) + xml_zip_jobs.append( + self.addTask( + command=cmd, + dependencies=xml_zip_job_dependencies, + label='xml_zip_job' + ) ) - ) ''' ' ##################################################