diff --git a/ocr b/ocr
index 17ab5c8..f26f8d4 100755
--- a/ocr
+++ b/ocr
@@ -24,33 +24,30 @@ TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por',
 
 
 def parse_args():
     parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.')
-    parser.add_argument('i', help='Input directory for OCR. One PDf equals one\
-                        job')
-    parser.add_argument('o', help='Output directory containing OCR results.')
+    parser.add_argument('-i', '--input-directory',
+                        help='Input directory (only PDF files get processed)',
+                        required=True)
+    parser.add_argument('-o', '--output-directory',
+                        help='Output directory',
+                        required=True)
     parser.add_argument('-l', '--language', choices=TESSERACT_MODELS,
                         required=True)
     parser.add_argument('--binarize', action='store_true',
                         help='Use ocropy binarisation as preprocessing step.')
-    parser.add_argument('--keep-intermediates',
+    parser.add_argument('--compress',
                         action='store_true',
-                        help='Keep intermediate files for debugging etc.',
-                        required=False)
+                        help='Compress the final PDF result file.')
+    parser.add_argument('--log-dir')
     parser.add_argument('--n-cores',
                         default=min(4, multiprocessing.cpu_count()),
                         help='Total number of cores available.',
-                        type=int,
-                        required=False)
-    parser.add_argument('--zip', help='Zips all results in different archives \
-                        depending on result types. Also zips \
-                        everything into one archive.',
-                        required=False)
-    parser.add_argument('-c', '--compress',
-                        help='Compress the final PDF result file.',
-                        required=False,
-                        action='store_true')
-    parser.add_argument('--log_dir')
+                        type=int)
+    parser.add_argument('--zip',
+                        help='Zips all results in different archives depending'
+                             ' on result types. Also zips everything into one '
+                             'archive.')
     return parser.parse_args()
 
 
@@ -62,11 +59,10 @@ class OCRPipelineJob:
 
 
 class OCRPipeline(WorkflowRunner):
-    def __init__(self, binarize, jobs, keep_intermediates, lang, n_cores,
-                 output_dir, zip, compress):
+    def __init__(self, binarize, jobs, lang, n_cores, output_dir, zip,
+                 compress):
         self.binarize = binarize
         self.jobs = jobs
-        self.keep_intermediates = keep_intermediates
         self.lang = lang
         self.n_cores = n_cores
         self.output_dir = output_dir
@@ -79,36 +75,26 @@ class OCRPipeline(WorkflowRunner):
 
         '''
         ' ##################################################
-        ' # mkdir_jobs                                     #
+        ' # setup output directory                         #
         ' ##################################################
         '''
-
-        mkdir_jobs = []
+        setup_output_directory_jobs = []
         for i, job in enumerate(self.jobs):
-            output_dir = os.path.join(job.output_dir, 'tmp')
-            poco_dir = os.path.join(job.output_dir, 'PoCo')
+            intermediate_dir = os.path.join(job.output_dir, 'tmp')
             cmd = 'mkdir'
             cmd += ' -p'
-            cmd += ' "{}"'.format(output_dir)
-            cmd += ' "{}"'.format(os.path.join(poco_dir, 'hocr'))
-            cmd += ' "{}"'.format(os.path.join(poco_dir, 'tiff'))
-            if self.keep_intermediates:
-                cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr'))
-                cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf'))
-                cmd += ' "{}"'.format(os.path.join(output_dir, 'tiff'))
-                cmd += ' "{}"'.format(os.path.join(output_dir, 'txt'))
-                if self.binarize:
-                    cmd += ' "{}"'.format(os.path.join(output_dir, 'bin.png'))
-                    cmd += ' "{}"'.format(os.path.join(output_dir, 'nrm.png'))
-            lbl = 'mkdir_job_-_{}'.format(i)
-            mkdir_jobs.append(self.addTask(command=cmd, label=lbl))
+            cmd += ' "{}"'.format(intermediate_dir)
+            cmd += ' "{}"'.format(os.path.join(job.output_dir, 'poco'))
+            lbl = 'setup_output_directory_-_{}'.format(i)
+            setup_output_directory_jobs.append(self.addTask(command=cmd,
+                                                            label=lbl))
 
         '''
         ' ##################################################
-        ' # pdftoppm_jobs                                  #
+        ' # split input                                    #
         ' ##################################################
         '''
-        pdftoppm_jobs = []
+        split_input_jobs = []
         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
         for i, job in enumerate(self.jobs):
             output_dir = os.path.join(job.output_dir, 'tmp')
@@ -118,24 +104,26 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -tiff'
             cmd += ' -tiffcompression lzw'
             cmd += ' "{}" "{}"'.format(job.file, output_file_base)
-            deps = mkdir_jobs
-            lbl = 'pdftoppm_job_-_{}'.format(i)
-            pdftoppm_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                              label=lbl, nCores=n_cores))
+            deps = 'setup_output_directory_-_{}'.format(i)
+            lbl = 'split_input_-_{}'.format(i)
+            split_input_jobs.append(self.addTask(command=cmd,
+                                                 dependencies=deps,
+                                                 label=lbl,
+                                                 nCores=n_cores))
 
         if self.binarize:
             '''
-            ' The ocropus_nlbin_jobs list is created based on the output files
-            ' of the pdftoppm_jobs. So wait until they are finished.
+            ' The binarization_jobs list is created based on the output files
+            ' of the split_jobs. So wait until they are finished.
             '''
             self.waitForTasks()
 
            '''
            ' ##################################################
-            ' # ocropus_nlbin_jobs                             #
+            ' # binarization                                   #
            ' ##################################################
            '''
-            ocropus_nlbin_jobs = []
+            binarization_jobs = []
            '''
            ' We run ocropus-nlbin with either four or, if there are less then
            ' four cores available for this workflow, the available core
@@ -152,24 +140,25 @@ class OCRPipeline(WorkflowRunner):
                 cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files))
                 cmd += ' -o "{}"'.format(output_dir)
                 cmd += ' -Q "{}"'.format(n_cores)
-                deps = pdftoppm_jobs
-                lbl = 'ocropus_nlbin_job_-_{}'.format(i)
-                ocropus_nlbin_jobs.append(
-                    self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                 nCores=n_cores))
+                deps = 'split_input_-_{}'.format(i)
+                lbl = 'binarization_-_{}'.format(i)
+                binarization_jobs.append(self.addTask(command=cmd,
+                                                      dependencies=deps,
+                                                      label=lbl,
+                                                      nCores=n_cores))
 
             '''
-            ' The post_ocropus_nlbin_jobs are created based on the output files
-            ' of the ocropus_nlbin_jobs. So wait until they are finished.
+            ' The post_binarization_jobs are created based on the output files
+            ' of the binarization_jobs. So wait until they are finished.
            '''
            self.waitForTasks()
 
            '''
            ' ##################################################
-            ' # post_ocropus_nlbin_jobs                        #
+            ' # post binarization                              #
            ' ##################################################
            '''
-            post_ocropus_nlbin_jobs = []
+            post_binarization_jobs = []
            for i, job in enumerate(self.jobs):
                input_dir = os.path.join(job.output_dir, 'tmp')
                output_dir = input_dir
@@ -182,26 +171,26 @@ class OCRPipeline(WorkflowRunner):
                     output_file = os.path.join(output_dir, 'page-{}.bin.png'.format(int(file.split('.', 1)[0])))  # noqa
                     cmd = 'mv "{}" "{}"'.format(os.path.join(output_dir, file),
                                                 output_file)
-                    deps = ocropus_nlbin_jobs
-                    lbl = 'post_ocropus_nlbin_job_-_{}-{}'.format(i, number)
-                    post_ocropus_nlbin_jobs.append(
-                        self.addTask(command=cmd, dependencies=deps,
-                                     label=lbl))
+                    deps = 'binarization_-_{}'.format(i)
+                    lbl = 'post_binarization_-_{}-{}'.format(i, number)
+                    post_binarization_jobs.append(
+                        self.addTask(command=cmd, dependencies=deps, label=lbl)
+                    )
                     number += 1
 
         '''
-        ' The tesseract_jobs are created based of the output files of either
-        ' the pdftoppm_jobs or post_ocropus_nlbin_jobs. So wait until they are
+        ' The ocr_jobs are created based of the output files of either the
+        ' split_jobs or post_binarization_jobs. So wait until they are
        ' finished.
        '''
        self.waitForTasks()
 
        '''
        ' ##################################################
-        ' # tesseract_jobs                                 #
+        ' # ocr                                            #
        ' ##################################################
        '''
-        tesseract_jobs = []
+        ocr_jobs = []
        '''
        ' Tesseract runs fastest with four cores. So we run it with either four
        ' or, if there are less then four cores available for this workflow,
@@ -211,8 +200,7 @@ class OCRPipeline(WorkflowRunner):
         for i, job in enumerate(self.jobs):
             input_dir = os.path.join(job.output_dir, 'tmp')
             output_dir = input_dir
-            files = filter(lambda x: x.endswith('.bin.png' if self.binarize else '.tif'),  # noqa
-                           os.listdir(input_dir))
+            files = filter(lambda x: x.endswith('.bin.png' if self.binarize else '.tif'), os.listdir(input_dir))  # noqa
             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
             files = map(lambda x: os.path.join(input_dir, x), files)
             number = 0
@@ -221,14 +209,17 @@ class OCRPipeline(WorkflowRunner):
                 cmd = 'tesseract "{}" "{}"'.format(file, output_file_base)
                 cmd += ' -l "{}"'.format(self.lang)
                 cmd += ' hocr pdf txt'
+                cmd += ' && '
+                cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base)  # noqa
                 if self.binarize:
-                    deps = post_ocropus_nlbin_jobs
+                    deps = 'post_binarization_-_{}-{}'.format(i, number)
                 else:
-                    deps = pdftoppm_jobs
-                label = 'tesseract_jobs_-_{}-{}'.format(i, number)
-                tesseract_jobs.append(
-                    self.addTask(command=cmd, dependencies=deps, label=label,
-                                 nCores=n_cores))
+                    deps = 'split_input_-_{}'.format(i)
+                label = 'ocr_-_{}-{}'.format(i, number)
+                ocr_jobs.append(self.addTask(command=cmd,
+                                             dependencies=deps,
+                                             label=label,
+                                             nCores=n_cores))
                 number += 1
 
         '''
@@ -239,190 +230,171 @@ class OCRPipeline(WorkflowRunner):
 
         '''
         ' ##################################################
-        ' # hocrtotei_jobs                                 #
+        ' # combined pdf creation                          #
         ' ##################################################
         '''
-        hocrtotei_jobs = []
+        combined_pdf_creation_jobs = []
         for i, job in enumerate(self.jobs):
             input_dir = os.path.join(job.output_dir, 'tmp')
-            files = filter(lambda x: x.endswith('.hocr'),
+            files = filter(lambda x: x.endswith('.pdf'),
                            os.listdir(input_dir))
             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
             files = map(lambda x: os.path.join(input_dir, x), files)
-            output_file = os.path.join(job.output_dir,
-                                       '{}.xml'.format(job.name))
-            cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files), output_file)
-            deps = tesseract_jobs
-            lbl = 'hocrtotei_job_-_{}'.format(i)
-            hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                               label=lbl))
-
-        '''
-        ' ##################################################
-        ' # hocr_poco_jobs                                 #
-        ' ##################################################
-        '''
-
-        hocr_poco_jobs = []
-        for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
-            files = filter(lambda x: x.endswith('.hocr'),
-                           os.listdir(input_dir))
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(lambda x: os.path.join(input_dir, x), files)
-            # set relative file paths into hocr
-            relative_files = map(lambda x: os.path.join('..',
-                                                        'tiff',
-                                                        os.path.basename(x).replace('.hocr', '.tif')),  # noqa
-                                 files)
-            for file, relative_file in zip(files, relative_files):
-                with open(file, 'r+') as f:
-                    html = f.read()
-                    html = html.replace(file.replace('.hocr', '.tif'),
-                                        relative_file)
-                    f.seek(0)
-                    f.truncate(0)  # deletes content of file to write new html
-                    f.write(html)
-            output_path_base = os.path.join(job.output_dir, 'PoCo')
-            output_path = os.path.join(output_path_base, 'hocr')
-            cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
-            deps = tesseract_jobs
-            lbl = 'hocr_poco_jobs-_{}'.format(i)
-            hocr_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                               label=lbl))
-        '''
-        ' ##################################################
-        ' # tiff_poco_jobs                                 #
-        ' ##################################################
-        '''
-
-        tiff_poco_jobs = []
-        for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
-            files = filter(lambda x: x.endswith('.tif'),
-                           os.listdir(input_dir))
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(lambda x: os.path.join(input_dir, x), files)
-            output_path_base = os.path.join(job.output_dir, 'PoCo')
-            output_path = os.path.join(output_path_base, 'tiff')
-            cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
-            deps = tesseract_jobs
-            lbl = 'tiff_poco_jobs-_{}'.format(i)
-            tiff_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                               label=lbl))
-
-        '''
-        ' ##################################################
-        ' # pdfunite_jobs                                  #
-        ' ##################################################
-        '''
-        pdfunite_jobs = []
-        for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
-            files = filter(lambda x: x.endswith('.pdf'), os.listdir(input_dir))
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(lambda x: os.path.join(input_dir, x), files)
             output_file = os.path.join(job.output_dir,
                                        '{}.pdf'.format(job.name))
             cmd = 'pdfunite "{}" "{}"'.format('" "'.join(files), output_file)
-            deps = tesseract_jobs
-            lbl = 'pdfunite_job_-_{}'.format(i)
-            pdfunite_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                              label=lbl))
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)),
+                          ocr_jobs)
+            lbl = 'combined_pdf_creation_-_{}'.format(i)
+            combined_pdf_creation_jobs.append(self.addTask(command=cmd,
+                                                           dependencies=deps,
+                                                           label=lbl))
 
         '''
         ' ##################################################
-        ' # cat_jobs                                       #
+        ' # combined txt creation                          #
         ' ##################################################
         '''
-        cat_jobs = []
+        combined_txt_creation_jobs = []
         for i, job in enumerate(self.jobs):
             input_dir = os.path.join(job.output_dir, 'tmp')
             files = filter(lambda x: x.endswith('.txt'),
                            os.listdir(input_dir))
             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
             files = map(lambda x: os.path.join(input_dir, x), files)
-            output_file = os.path.join(job.output_dir,
-                                       '{}.txt'.format(job.name))
+            output_file = os.path.join(job.output_dir, '{}.txt'.format(job.name))  # noqa
             cmd = 'cat "{}" > "{}"'.format('" "'.join(files), output_file)
-            deps = tesseract_jobs
-            lbl = 'cat_job_-_{}'.format(i)
-            cat_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                         label=lbl))
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)),
+                          ocr_jobs)
+            lbl = 'combined_txt_creation_-_{}'.format(i)
+            combined_txt_creation_jobs.append(self.addTask(command=cmd,
+                                                           dependencies=deps,
+                                                           label=lbl))
+
+        '''
+        ' ##################################################
+        ' # tei p5 creation                                #
+        ' ##################################################
+        '''
+        tei_p5_creation_jobs = []
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            files = filter(lambda x: x.endswith('.hocr'), os.listdir(input_dir))  # noqa
+            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            output_file = os.path.join(job.output_dir, '{}.xml'.format(job.name))  # noqa
+            cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files), output_file)
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)),
+                          ocr_jobs)
+            lbl = 'tei_p5_creation_-_{}'.format(i)
+            tei_p5_creation_jobs.append(self.addTask(command=cmd,
+                                                     dependencies=deps,
+                                                     label=lbl))
+
+        '''
+        ' ##################################################
+        ' # poco bundle creation                           #
+        ' ##################################################
+        '''
+        poco_bundle_creation_jobs = []
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            output_dir = os.path.join(job.output_dir, 'poco')
+            cmd = 'mv "{}"/*.hocr "{}"'.format(input_dir, output_dir)
+            cmd += ' && '
+            cmd += 'mv "{}"/*.{} "{}"'.format(input_dir, 'bin.png' if self.binarize else 'tif', output_dir)  # noqa
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)),
+                          ocr_jobs)
+            deps.append('tei_p5_creation_-_{}'.format(i))
+            lbl = 'poco_bundle_creation_-_{}'.format(i)
+            poco_bundle_creation_jobs.append(self.addTask(command=cmd,
+                                                          dependencies=deps,
+                                                          label=lbl))
+
         '''
         ' The following jobs are created based of the output files of the
-        ' pdfunite_jobs. So wait until they are finished.
+        ' combined_pdf_creation_jobs. So wait until they are finished.
         '''
         self.waitForTasks()
 
         '''
         ' ##################################################
-        ' # compress_jobs                                  #
+        ' # pdf compression                                #
         ' ##################################################
         '''
-        compress_jobs = []
+        pdf_compression_jobs = []
         if self.compress:
             for i, job in enumerate(self.jobs):
-                print(os.listdir(job.output_dir))
-                file = filter(lambda x: x.endswith('.pdf'),
-                              os.listdir(job.output_dir))[0]
+                file = filter(lambda x: x.endswith('.pdf'), os.listdir(job.output_dir))[0]  # noqa
                 original_file = os.path.join(job.output_dir, file)
                 compressed_file = os.path.join(job.output_dir, 'c_' + file)
-                cmd = ('gs '
-                       + '-sDEVICE=pdfwrite '
-                       + '-dCompatibilityLevel=1.4 '
-                       + '-dPDFSETTINGS=/ebook '
-                       + '-dNOPAUSE '
-                       + '-dQUIET '
-                       + '-dBATCH '
-                       + '-sOutputFile={o} {i} ').format(o=compressed_file,
-                                                         i=original_file)
-                cmd += '&& rm {original_f} '.format(original_f=original_file)
-                cmd += ('&& mv {compressed_f} '
-                        + '{original_f} ').format(compressed_f=compressed_file,
-                                                  original_f=original_file)
-                deps = (hocrtotei_jobs
-                        + tesseract_jobs
-                        + pdfunite_jobs
-                        + cat_jobs
-                        + hocr_poco_jobs
-                        + tiff_poco_jobs)
-                lbl = 'compress_job_-_{}'.format(i)
-                compress_jobs.append(self.addTask(command=cmd,
-                                                  dependencies=deps,
-                                                  label=lbl))
+                cmd = 'gs'
+                cmd += ' -sDEVICE=pdfwrite'
+                cmd += ' -dCompatibilityLevel=1.4'
+                cmd += ' -dPDFSETTINGS=/ebook'
+                cmd += ' -dNOPAUSE'
+                cmd += ' -dQUIET'
+                cmd += ' -dBATCH'
+                cmd += ' -sOutputFile="{}"'.format(compressed_file)
+                cmd += ' "{}"'.format(original_file)
+                cmd += ' && '
+                cmd += 'mv "{}" "{}"'.format(compressed_file, original_file)
+                deps = 'combined_pdf_creation_-_{}'.format(i)
+                lbl = 'pdf_compression_-_{}'.format(i)
+                pdf_compression_jobs.append(self.addTask(command=cmd,
+                                                         dependencies=deps,
+                                                         label=lbl))
 
         '''
         ' ##################################################
-        ' # zip_jobs                                       #
+        ' # cleanup                                        #
         ' ##################################################
         '''
-        zip_jobs = []
-        deps = (hocrtotei_jobs
-                + tesseract_jobs
-                + pdfunite_jobs
-                + cat_jobs
-                + hocr_poco_jobs
-                + tiff_poco_jobs
-                + compress_jobs)
+        cleanup_jobs = []
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            cmd = 'rm -r "{}"'.format(input_dir)
+            if self.compress:
+                deps = ['pdf_compression_-_{}'.format(i)]
+            else:
+                deps = ['combined_pdf_creation_-_{}'.format(i)]
+            deps.append('combined_txt_creation_-_{}'.format(i))
+            deps.append('poco_bundle_creation_-_{}'.format(i))
+            deps.append('tei_p5_creation_-_{}'.format(i))
+            lbl = 'cleanup_-_{}'.format(i)
+            cleanup_jobs.append(self.addTask(command=cmd,
+                                             dependencies=deps,
+                                             label=lbl))
+
+        '''
+        ' ##################################################
+        ' # zip creation                                   #
+        ' ##################################################
+        '''
+        zip_creation_jobs = []
         if self.zip is not None:
             # Remove .zip file extension if provided
             if self.zip.endswith('.zip'):
                 self.zip = self.zip[:-4]
                 self.zip = self.zip if self.zip else 'output'
-            # zip ALL
+            # zip all files
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
             cmd += ' "{}".all.zip .'.format(self.zip)
             cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.tif"'
+            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
             cmd += ' && '
             cmd += 'cd -'
-            lbl = 'zip_job_-_all'
-            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                         label=lbl))
-            # zip PDFs
+            deps = (pdf_compression_jobs if self.compress else
+                    combined_pdf_creation_jobs)
+            deps += combined_txt_creation_jobs
+            deps += poco_bundle_creation_jobs
+            lbl = 'zip_creation_-_all'
+            zip_creation_jobs.append(self.addTask(command=cmd,
+                                                  dependencies=deps,
+                                                  label=lbl))
+            # zip PDF files
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
@@ -432,11 +404,13 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -i "*.pdf"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = deps + ['zip_job_-_all']
-            lbl = 'zip_job_-_pdf'
-            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                         label=lbl))
-            # zip TXTs
+            deps = (pdf_compression_jobs if self.compress else
+                    combined_pdf_creation_jobs)
+            lbl = 'zip_creation_-_pdf'
+            zip_creation_jobs.append(self.addTask(command=cmd,
+                                                  dependencies=deps,
+                                                  label=lbl))
+            # zip TXT files
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
@@ -446,11 +420,12 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -i "*.txt"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = deps + ['zip_job_-_all']
-            lbl = 'zip_job_-_txt'
-            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                         label=lbl))
-            # zip XMLs
+            deps = combined_txt_creation_jobs
+            lbl = 'zip_creation_-_txt'
+            zip_creation_jobs.append(self.addTask(command=cmd,
+                                                  dependencies=deps,
+                                                  label=lbl))
+            # zip XML files
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
@@ -460,80 +435,26 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -i "*.xml"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = deps + ['zip_job_-_all']
-            lbl = 'zip_job_-_xml'
-            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                         label=lbl))
-            # zip PoCo files
-            poco_paths = []
-            poco_names = []
-            for i, job in enumerate(self.jobs):
-                poco_paths.append(os.path.join(os.path.basename(job.output_dir),  # noqa
-                                               'PoCo'))
-                poco_names.append(job.output_dir)
-
+            deps = tei_p5_creation_jobs
+            lbl = 'zip_creation_-_xml'
+            zip_creation_jobs.append(self.addTask(command=cmd,
+                                                  dependencies=deps,
+                                                  label=lbl))
+            # zip PoCo bundles
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
-            cmd += ' "{}".poco.zip'.format(self.zip)
-            cmd += ' "{}"'.format('" "'.join(poco_paths))
+            cmd += ' "{}".poco.zip .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
             cmd += ' && '
             cmd += 'cd -'
-            deps = deps + ['zip_job_-_all']
-            lbl = 'zip_job_-_poco_{}'.format(i)
-            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                         label=lbl))
-
-        '''
-        ' ##################################################
-        ' # mv_jobs                                        #
-        ' ##################################################
-        '''
-        mv_jobs = []
-        if self.keep_intermediates:
-            for i, job in enumerate(self.jobs):
-                input_dir = os.path.join(job.output_dir, 'tmp')
-                output_dir = input_dir
-                cmd = 'mv "{}"/*.hocr "{}"'.format(
-                    input_dir, os.path.join(output_dir, 'hocr'))
-                cmd += ' && '
-                cmd += 'mv "{}"/*.pdf "{}"'.format(input_dir, os.path.join(output_dir, 'pdf'))  # noqa
-                cmd += ' && '
-                cmd += 'mv "{}"/*.tif "{}"'.format(input_dir, os.path.join(output_dir, 'tiff'))  # noqa
-                cmd += ' && '
-                cmd += 'mv "{}"/*.txt "{}"'.format(input_dir, os.path.join(output_dir, 'txt'))  # noqa
-                if self.binarize:
-                    cmd += ' && '
-                    cmd += 'mv "{}"/*.bin.png "{}"'.format(input_dir, os.path.join(output_dir, 'bin.png'))  # noqa
-                    cmd += ' && '
-                    cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png'))  # noqa
-                deps = (hocrtotei_jobs
-                        + tesseract_jobs
-                        + pdfunite_jobs
-                        + cat_jobs
-                        + hocr_poco_jobs
-                        + tiff_poco_jobs,
-                        + compress_jobs
-                        + zip_jobs)
-                lbl = 'mv_job_-_{}'.format(i)
-                mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                            label=lbl))
-        else:
-            for i, job in enumerate(self.jobs):
-                input_dir = os.path.join(job.output_dir, 'tmp')
-                cmd = 'rm -r "{}"'.format(input_dir)
-                deps = (hocrtotei_jobs
-                        + tesseract_jobs
-                        + pdfunite_jobs
-                        + cat_jobs
-                        + hocr_poco_jobs
-                        + tiff_poco_jobs
-                        + compress_jobs
-                        + zip_jobs)
-                lbl = 'mv_job_-_{}'.format(i)
-                mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                            label=lbl))
+            deps = poco_bundle_creation_jobs
+            lbl = 'zip_creation_-_poco'
+            zip_creation_jobs.append(self.addTask(command=cmd,
+                                                  dependencies=deps,
+                                                  label=lbl))
 
 
 def collect_jobs(input_dir, output_dir):
@@ -550,12 +471,14 @@ def main():
     args = parse_args()
-    jobs = collect_jobs(args.i, args.o)
-    ocr_pipeline = OCRPipeline(args.binarize, jobs, args.keep_intermediates,
-                               args.language, args.n_cores, args.o, args.zip,
+    jobs = collect_jobs(args.input_directory, args.output_directory)
+    ocr_pipeline = OCRPipeline(args.binarize, jobs, args.language,
+                               args.n_cores, args.output_directory, args.zip,
                                args.compress)
-    retval = ocr_pipeline.run(dataDirRoot=(args.log_dir or args.o),
-                              nCores=args.n_cores)
+    retval = ocr_pipeline.run(
+        dataDirRoot=(args.log_dir or args.output_directory),
+        nCores=args.n_cores
+    )
     sys.exit(retval)
 
 
diff --git a/wrapper/ocr b/wrapper/ocr
index 4c38f25..5b3e68f 100755
--- a/wrapper/ocr
+++ b/wrapper/ocr
@@ -12,17 +12,21 @@ UID = str(os.getuid())
 GID = str(os.getgid())
 
 parser = ArgumentParser(add_help=False)
-parser.add_argument('-i')
-parser.add_argument('-o')
+parser.add_argument('-i', '--input-directory')
+parser.add_argument('-o', '--output-directory')
 args, remaining_args = parser.parse_known_args()
 
 cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
-if args.o is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)]
+if args.output_directory is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
+                                 CONTAINER_OUTPUT_DIR)]
     remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
-if args.i is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)]
+    remaining_args.insert(0, '-o')
+if args.input_directory is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
+                                 CONTAINER_INPUT_DIR)]
     remaining_args.insert(0, CONTAINER_INPUT_DIR)
+    remaining_args.insert(0, '-i')
 
 cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args
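
Usage sketch (not part of the patch; the directory names, language model and archive prefix below are placeholders, shown only to illustrate the renamed options):

    # Input and output directories are now required flags instead of positionals;
    # -c was dropped in favour of --compress, --log_dir became --log-dir, and
    # --zip takes an archive name prefix.
    ./ocr -i ./input_pdfs -o ./ocr_results -l eng --binarize --compress --zip results

    # Through the Docker wrapper, which mounts both directories and injects the
    # container-internal -i/-o paths before forwarding the remaining arguments:
    ./wrapper/ocr -i ./input_pdfs -o ./ocr_results -l eng --compress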