mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-27 10:34:18 +00:00
Sort all lists before processing
This commit is contained in:
parent
b9dba80d7f
commit
03b1054560
43
ocr
43
ocr
@ -13,6 +13,7 @@ Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
|
|||||||
import argparse
|
import argparse
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
from pyflow import WorkflowRunner
|
from pyflow import WorkflowRunner
|
||||||
|
|
||||||
@ -35,7 +36,6 @@ def parse_arguments():
|
|||||||
requirements: imagemagick, ocropus, pdftoppm, pdfunite, \
|
requirements: imagemagick, ocropus, pdftoppm, pdfunite, \
|
||||||
poppler-utils, pyflow, python2.7, python3.5, tesseract'
|
poppler-utils, pyflow, python2.7, python3.5, tesseract'
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-l',
|
'-l',
|
||||||
dest='lang',
|
dest='lang',
|
||||||
@ -92,12 +92,6 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
self.nCores = args.nCores
|
self.nCores = args.nCores
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
'''
|
|
||||||
' Starting workflow...
|
|
||||||
'''
|
|
||||||
for index, job in enumerate(self.jobs):
|
|
||||||
print('%i: %s' % (index, job))
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' Creating output directories...
|
' Creating output directories...
|
||||||
'''
|
'''
|
||||||
@ -114,9 +108,8 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(job['output_dir'], 'tmp', 'txt')
|
os.path.join(job['output_dir'], 'tmp', 'txt')
|
||||||
)
|
)
|
||||||
if not self.skipBinarisation:
|
if not self.skipBinarisation:
|
||||||
cmd += ' "%s" "%s"' % (
|
cmd += ' "%s"' % (
|
||||||
os.path.join(job['output_dir'], 'tmp', 'bin.png'),
|
os.path.join(job['output_dir'], 'tmp', 'bin.png')
|
||||||
os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
|
|
||||||
)
|
)
|
||||||
create_output_directories_jobs.append(
|
create_output_directories_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
@ -186,7 +179,13 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
post_binarisation_jobs = []
|
post_binarisation_jobs = []
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
number = 0
|
number = 0
|
||||||
for file in filter(lambda x: x.endswith(('.bin.png', '.nrm.png')), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
|
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
||||||
|
files = filter(lambda x: x.endswith('.bin.png'), files)
|
||||||
|
files = sorted(
|
||||||
|
files,
|
||||||
|
key=lambda x: int(re.search(r'\d+', x).group(0))
|
||||||
|
)
|
||||||
|
for file in files:
|
||||||
cmd = 'mv "%s" "%s"' % (
|
cmd = 'mv "%s" "%s"' % (
|
||||||
os.path.join(job['output_dir'], 'tmp', file),
|
os.path.join(job['output_dir'], 'tmp', file),
|
||||||
os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % (
|
os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % (
|
||||||
@ -210,7 +209,6 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
' Performing OCR...
|
' Performing OCR...
|
||||||
'''
|
'''
|
||||||
self.waitForTasks()
|
self.waitForTasks()
|
||||||
print(self)
|
|
||||||
ocr_jobs = []
|
ocr_jobs = []
|
||||||
'''
|
'''
|
||||||
' Tesseract runs fastest with four cores. So we run it with either four
|
' Tesseract runs fastest with four cores. So we run it with either four
|
||||||
@ -226,7 +224,16 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
ocr_job_nCores = 1
|
ocr_job_nCores = 1
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
number = 0
|
number = 0
|
||||||
for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
|
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
||||||
|
if self.skipBinarisation:
|
||||||
|
files = filter(lambda x: x.endswith('.tif'), files)
|
||||||
|
else:
|
||||||
|
files = filter(lambda x: x.endswith('.bin.png'), files)
|
||||||
|
files = sorted(
|
||||||
|
files,
|
||||||
|
key=lambda x: int(re.search(r'\d+', x).group(0))
|
||||||
|
)
|
||||||
|
for file in files:
|
||||||
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
|
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
|
||||||
os.path.join(job['output_dir'], 'tmp', file),
|
os.path.join(job['output_dir'], 'tmp', file),
|
||||||
os.path.join(
|
os.path.join(
|
||||||
@ -240,8 +247,9 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
ocr_job_dependencies = 'split_job_-_%i' % (index)
|
ocr_job_dependencies = 'split_job_-_%i' % (index)
|
||||||
else:
|
else:
|
||||||
ocr_job_dependencies = filter(
|
ocr_job_dependencies = filter(
|
||||||
lambda x: x.startswith(
|
lambda x: x == 'post_binarisation_job_-_%i-%i' % (
|
||||||
'post_binarisation_job_-_%i' % (index)
|
index,
|
||||||
|
number
|
||||||
),
|
),
|
||||||
post_binarisation_jobs
|
post_binarisation_jobs
|
||||||
)
|
)
|
||||||
@ -357,9 +365,8 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(job['output_dir'], 'tmp'),
|
os.path.join(job['output_dir'], 'tmp'),
|
||||||
os.path.join(job['output_dir'], 'tmp', 'bin.png'),
|
os.path.join(job['output_dir'], 'tmp', 'bin.png'),
|
||||||
)
|
)
|
||||||
cmd += ' && mv "%s"/*.nrm.png "%s"' % (
|
cmd += ' && rm "%s"/*.nrm.png' % (
|
||||||
os.path.join(job['output_dir'], 'tmp'),
|
os.path.join(job['output_dir'], 'tmp')
|
||||||
os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
|
|
||||||
)
|
)
|
||||||
cleanup_jobs.append(
|
cleanup_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
|
Loading…
Reference in New Issue
Block a user