Add creation of zip archives for the results

This commit is contained in:
Patrick Jentsch 2020-01-20 15:04:55 +01:00
parent 3a4cc16e5b
commit dfc05be7db
2 changed files with 57 additions and 0 deletions

View File

@ -19,6 +19,7 @@ RUN apt-get update \
python2.7 \
python3.5 \
wget \
zip \
&& rm -rf /var/lib/apt/lists/*
ENV OCROPY_VERSION 1.3.3

56
ocr
View File

@ -16,6 +16,7 @@ import os
import re
import sys
from pyflow import WorkflowRunner
from zipfile import ZipFile
def parse_arguments():
@ -74,6 +75,7 @@ class OCRWorkflow(WorkflowRunner):
self.keep_intermediates = args.keep_intermediates
self.lang = args.lang
self.n_cores = args.n_cores
self.output_dir = args.output_dir
def workflow(self):
if len(self.jobs) == 0:
@ -384,6 +386,60 @@ class OCRWorkflow(WorkflowRunner):
)
)
# Zip archive creation: four tasks are registered below, one per archive
# (all.zip, pdf.zip, txt.zip, xml.zip). Each task runs a shell command that
# changes into self.output_dir, calls `zip` on files matched one directory
# level down (`*/*.<ext>`), excludes names matching "pyflow.data*", and then
# returns to the previous directory with `cd -`.
#
# NOTE(review): `index` is used in every task label but is not assigned
# anywhere in these lines -- presumably it is left over from an earlier loop
# in this method; confirm it is always defined when this code runs.

# all.zip: PDF, TXT and XML results together. The task depends on all three
# upstream job lists, concatenated into one dependency list.
# NOTE(review): `*/*.{pdf,txt,xml}` relies on brace expansion, which bash
# provides but a plain POSIX sh (e.g. dash) does not -- confirm which shell
# executes task command strings.
all_zip_jobs = []
all_zip_job_dependencies = (hocr_to_tei_jobs
+ pdf_merge_jobs
+ txt_merge_jobs)
cmd = 'cd "%s" && zip all.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % (
self.output_dir
)
all_zip_jobs.append(
self.addTask(
command=cmd,
dependencies=all_zip_job_dependencies,
label='all_zip_job_-_%i' % (index)
)
)
# pdf.zip: only the *.pdf files; depends on pdf_merge_jobs.
pdf_zip_jobs = []
pdf_zip_job_dependencies = pdf_merge_jobs
cmd = 'cd "%s" && zip pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
self.output_dir
)
pdf_zip_jobs.append(
self.addTask(
command=cmd,
dependencies=pdf_zip_job_dependencies,
label='pdf_zip_job_-_%i' % (index)
)
)
# txt.zip: only the *.txt files; depends on txt_merge_jobs.
txt_zip_jobs = []
txt_zip_job_dependencies = txt_merge_jobs
cmd = 'cd "%s" && zip txt.zip */*.txt -x "pyflow.data*" && cd -' % (
self.output_dir
)
txt_zip_jobs.append(
self.addTask(
command=cmd,
dependencies=txt_zip_job_dependencies,
label='txt_zip_job_-_%i' % (index)
)
)
# xml.zip: only the *.xml files; depends on hocr_to_tei_jobs.
xml_zip_jobs = []
xml_zip_job_dependencies = hocr_to_tei_jobs
cmd = 'cd "%s" && zip xml.zip */*.xml -x "pyflow.data*" && cd -' % (
self.output_dir
)
xml_zip_jobs.append(
self.addTask(
command=cmd,
dependencies=xml_zip_job_dependencies,
label='xml_zip_job_-_%i' % (index)
)
)
'''
' ##################################################
' # Cleanup #