From 6c4a642cb708033076af9bdfc7ec566ce6fefdb8 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Mon, 3 Feb 2020 15:00:27 +0100
Subject: [PATCH] Add a switch for zip functionality
---
ocr | 102 +++++++++++++++++++++++++++++++++---------------------------
1 file changed, 56 insertions(+), 46 deletions(-)
diff --git a/ocr b/ocr
index 4a8c6bf..a25e4c2 100755
--- a/ocr
+++ b/ocr
@@ -65,6 +65,14 @@ def parse_arguments():
required=False,
type=int
)
+ parser.add_argument(
+ '--zip',
+ action='store_true',
+ default=False,
+ dest='zip',
+ help='package result files in zip bundles',
+ required=False
+ )
return parser.parse_args()
@@ -76,6 +84,7 @@ class OCRWorkflow(WorkflowRunner):
self.lang = args.lang
self.n_cores = args.n_cores
self.output_dir = args.output_dir
+ self.zip = args.zip
def workflow(self):
if len(self.jobs) == 0:
@@ -386,59 +395,60 @@ class OCRWorkflow(WorkflowRunner):
)
)
- all_zip_jobs = []
- all_zip_job_dependencies = (hocr_to_tei_jobs
- + pdf_merge_jobs
- + txt_merge_jobs)
- cmd = 'cd "%s" && zip all.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % (
- self.output_dir
- )
- all_zip_jobs.append(
- self.addTask(
- command=cmd,
- dependencies=all_zip_job_dependencies,
- label='all_zip_job_-_%i' % (index)
+ if self.zip:
+ all_zip_jobs = []
+ all_zip_job_dependencies = (hocr_to_tei_jobs
+ + pdf_merge_jobs
+ + txt_merge_jobs)
+ cmd = 'cd "%s" && zip all.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % (
+ self.output_dir
+ )
+ all_zip_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies=all_zip_job_dependencies,
+ label='all_zip_job'
+ )
)
- )
- pdf_zip_jobs = []
- pdf_zip_job_dependencies = pdf_merge_jobs
- cmd = 'cd "%s" && zip pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
- self.output_dir
- )
- pdf_zip_jobs.append(
- self.addTask(
- command=cmd,
- dependencies=pdf_zip_job_dependencies,
- label='pdf_zip_job_-_%i' % (index)
+ pdf_zip_jobs = []
+ pdf_zip_job_dependencies = all_zip_jobs
+ cmd = 'cd "%s" && zip -m pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
+ self.output_dir
+ )
+ pdf_zip_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies=pdf_zip_job_dependencies,
+ label='pdf_zip_job'
+ )
)
- )
- txt_zip_jobs = []
- txt_zip_job_dependencies = txt_merge_jobs
- cmd = 'cd "%s" && zip txt.zip */*.txt -x "pyflow.data*" && cd -' % (
- self.output_dir
- )
- txt_zip_jobs.append(
- self.addTask(
- command=cmd,
- dependencies=txt_zip_job_dependencies,
- label='txt_zip_job_-_%i' % (index)
+ txt_zip_jobs = []
+ txt_zip_job_dependencies = all_zip_jobs
+ cmd = 'cd "%s" && zip -m txt.zip */*.txt -x "pyflow.data*" && cd -' % (
+ self.output_dir
+ )
+ txt_zip_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies=txt_zip_job_dependencies,
+ label='txt_zip_job'
+ )
)
- )
- xml_zip_jobs = []
- xml_zip_job_dependencies = hocr_to_tei_jobs
- cmd = 'cd "%s" && zip xml.zip */*.xml -x "pyflow.data*" && cd -' % (
- self.output_dir
- )
- xml_zip_jobs.append(
- self.addTask(
- command=cmd,
- dependencies=xml_zip_job_dependencies,
- label='xml_zip_job_-_%i' % (index)
+ xml_zip_jobs = []
+ xml_zip_job_dependencies = all_zip_jobs
+ cmd = 'cd "%s" && zip -m xml.zip */*.xml -x "pyflow.data*" && cd -' % (
+ self.output_dir
+ )
+ xml_zip_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies=xml_zip_job_dependencies,
+ label='xml_zip_job'
+ )
)
- )
'''
' ##################################################