mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-13 00:30:34 +00:00
add zip creation of results
This commit is contained in:
parent
3a4cc16e5b
commit
dfc05be7db
@ -19,6 +19,7 @@ RUN apt-get update \
|
|||||||
python2.7 \
|
python2.7 \
|
||||||
python3.5 \
|
python3.5 \
|
||||||
wget \
|
wget \
|
||||||
|
zip \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
ENV OCROPY_VERSION 1.3.3
|
ENV OCROPY_VERSION 1.3.3
|
||||||
|
56
ocr
56
ocr
@ -16,6 +16,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
from pyflow import WorkflowRunner
|
from pyflow import WorkflowRunner
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
|
||||||
def parse_arguments():
|
def parse_arguments():
|
||||||
@ -74,6 +75,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
self.keep_intermediates = args.keep_intermediates
|
self.keep_intermediates = args.keep_intermediates
|
||||||
self.lang = args.lang
|
self.lang = args.lang
|
||||||
self.n_cores = args.n_cores
|
self.n_cores = args.n_cores
|
||||||
|
self.output_dir = args.output_dir
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
if len(self.jobs) == 0:
|
if len(self.jobs) == 0:
|
||||||
@ -384,6 +386,60 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
all_zip_jobs = []
|
||||||
|
all_zip_job_dependencies = (hocr_to_tei_jobs
|
||||||
|
+ pdf_merge_jobs
|
||||||
|
+ txt_merge_jobs)
|
||||||
|
cmd = 'cd "%s" && zip all.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % (
|
||||||
|
self.output_dir
|
||||||
|
)
|
||||||
|
all_zip_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=all_zip_job_dependencies,
|
||||||
|
label='all_zip_job_-_%i' % (index)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
pdf_zip_jobs = []
|
||||||
|
pdf_zip_job_dependencies = pdf_merge_jobs
|
||||||
|
cmd = 'cd "%s" && zip pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
|
||||||
|
self.output_dir
|
||||||
|
)
|
||||||
|
pdf_zip_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=pdf_zip_job_dependencies,
|
||||||
|
label='pdf_zip_job_-_%i' % (index)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
txt_zip_jobs = []
|
||||||
|
txt_zip_job_dependencies = txt_merge_jobs
|
||||||
|
cmd = 'cd "%s" && zip txt.zip */*.txt -x "pyflow.data*" && cd -' % (
|
||||||
|
self.output_dir
|
||||||
|
)
|
||||||
|
txt_zip_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=txt_zip_job_dependencies,
|
||||||
|
label='txt_zip_job_-_%i' % (index)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
xml_zip_jobs = []
|
||||||
|
xml_zip_job_dependencies = hocr_to_tei_jobs
|
||||||
|
cmd = 'cd "%s" && zip xml.zip */*.xml -x "pyflow.data*" && cd -' % (
|
||||||
|
self.output_dir
|
||||||
|
)
|
||||||
|
xml_zip_jobs.append(
|
||||||
|
self.addTask(
|
||||||
|
command=cmd,
|
||||||
|
dependencies=xml_zip_job_dependencies,
|
||||||
|
label='xml_zip_job_-_%i' % (index)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
' # Cleanup #
|
' # Cleanup #
|
||||||
|
Loading…
x
Reference in New Issue
Block a user