One thread per page ocr patch

This commit is contained in:
Stephan Porada 2020-10-07 13:46:22 +02:00
parent 4b84488fe6
commit cb68d6de2d

8
ocr
View File

@ -203,12 +203,6 @@ class OCRPipeline(WorkflowRunner):
' ################################################## ' ##################################################
''' '''
ocr_tasks = [] ocr_tasks = []
'''
' Tesseract runs fastest with four cores, so we run it with either four
' cores or, if fewer than four cores are available for this workflow,
' the number of cores that are available.
'''
n_cores = min(4, self.n_cores)
for i, job in enumerate(self.jobs): for i, job in enumerate(self.jobs):
input_dir = job.intermediate_dir input_dir = job.intermediate_dir
output_dir = job.intermediate_dir output_dir = job.intermediate_dir
@ -232,7 +226,7 @@ class OCRPipeline(WorkflowRunner):
cmd += ' && ' cmd += ' && '
cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base) # noqa cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base) # noqa
lbl = 'ocr_-_{}-{}'.format(i, j) lbl = 'ocr_-_{}-{}'.format(i, j)
task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores) # noqa task = self.addTask(command=cmd, dependencies=deps, label=lbl, env={"OMP_THREAD_LIMIT": "1"}) # noqa
ocr_tasks.append(task) ocr_tasks.append(task)
''' '''