From dfc05be7db2afe1ed452936730ca808c56da9729 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Mon, 20 Jan 2020 15:04:55 +0100
Subject: [PATCH] add zip creation of results
---
Dockerfile | 1 +
ocr | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/Dockerfile b/Dockerfile
index b715aec..c848bad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,6 +19,7 @@ RUN apt-get update \
python2.7 \
python3.5 \
wget \
+ zip \
&& rm -rf /var/lib/apt/lists/*
ENV OCROPY_VERSION 1.3.3
diff --git a/ocr b/ocr
index 28ecbd9..4a8c6bf 100755
--- a/ocr
+++ b/ocr
@@ -16,6 +16,7 @@ import os
import re
import sys
from pyflow import WorkflowRunner
+from zipfile import ZipFile
def parse_arguments():
@@ -74,6 +75,7 @@ class OCRWorkflow(WorkflowRunner):
self.keep_intermediates = args.keep_intermediates
self.lang = args.lang
self.n_cores = args.n_cores
+ self.output_dir = args.output_dir
def workflow(self):
if len(self.jobs) == 0:
@@ -384,6 +386,60 @@ class OCRWorkflow(WorkflowRunner):
)
)
+ all_zip_jobs = []  # collects the addTask result for the all.zip task
+ all_zip_job_dependencies = (hocr_to_tei_jobs + pdf_merge_jobs
+ + txt_merge_jobs)
+ # Explicit globs: {pdf,txt,xml} brace expansion is not POSIX sh (dash).
+ cmd = 'cd "%s" && zip all.zip */*.pdf */*.txt */*.xml -x "pyflow.data*" && cd -' % (
+ self.output_dir
+ )
+ all_zip_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies=all_zip_job_dependencies,
+ label='all_zip_job_-_%i' % (index)  # NOTE(review): index is bound outside this hunk - confirm
+ )
+ )
+
+ pdf_zip_jobs = []  # collects the addTask result for the pdf.zip task
+ pdf_zip_job_dependencies = pdf_merge_jobs  # passed to addTask as dependencies
+ cmd = 'cd "%s" && zip pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
+ self.output_dir
+ )
+ pdf_zip_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies=pdf_zip_job_dependencies,
+ label='pdf_zip_job_-_%i' % (index)  # NOTE(review): index is bound outside this hunk - confirm
+ )
+ )
+
+ txt_zip_jobs = []  # collects the addTask result for the txt.zip task
+ txt_zip_job_dependencies = txt_merge_jobs  # passed to addTask as dependencies
+ cmd = 'cd "%s" && zip txt.zip */*.txt -x "pyflow.data*" && cd -' % (
+ self.output_dir
+ )
+ txt_zip_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies=txt_zip_job_dependencies,
+ label='txt_zip_job_-_%i' % (index)  # NOTE(review): index is bound outside this hunk - confirm
+ )
+ )
+
+ xml_zip_jobs = []  # collects the addTask result for the xml.zip task
+ xml_zip_job_dependencies = hocr_to_tei_jobs  # passed to addTask as dependencies
+ cmd = 'cd "%s" && zip xml.zip */*.xml -x "pyflow.data*" && cd -' % (
+ self.output_dir
+ )
+ xml_zip_jobs.append(
+ self.addTask(
+ command=cmd,
+ dependencies=xml_zip_job_dependencies,
+ label='xml_zip_job_-_%i' % (index)  # NOTE(review): index is bound outside this hunk - confirm
+ )
+ )
+
'''
' ##################################################
' # Cleanup #