diff --git a/ocr_pyflow b/ocr_pyflow index 725d8ee..d54e5ab 100755 --- a/ocr_pyflow +++ b/ocr_pyflow @@ -72,7 +72,7 @@ class OCRWorkflow(WorkflowRunner): ### mkdir_jobs = [] mkdir_job_number = 0 - for job in jobs: + for job in self.jobs: mkdir_job_number += 1 cmd = 'mkdir -p "%s" "%s" "%s" "%s"' % ( os.path.join(job["output_dir"], "hocr_files"), @@ -88,7 +88,7 @@ class OCRWorkflow(WorkflowRunner): ### split_jobs = [] split_job_number = 0 - for job in jobs: + for job in self.jobs: split_job_number += 1 if job["basename"].endswith(".tif") or job["basename"].endswith(".tiff"): # TODO: Make the following command work @@ -118,7 +118,7 @@ class OCRWorkflow(WorkflowRunner): ### ocropusnlbin_jobs = [] ocropusnlbin_job_number = 0 - for job in jobs: + for job in self.jobs: ocropusnlbin_job_number += 1 cmd = 'ocropus-nlbin -o "%s" "%s"' % ( os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), @@ -133,7 +133,7 @@ class OCRWorkflow(WorkflowRunner): self.waitForTasks() tesseract_jobs = [] tesseract_job_number = 0 - for job in jobs: + for job in self.jobs: # This list is empty if you don't wait for ocropus_nlbin_jobs to complete for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))): tesseract_job_number += 1 @@ -150,7 +150,7 @@ class OCRWorkflow(WorkflowRunner): ### hocr_to_teip5_jobs = [] hocr_to_teip5_job_number = 0 - for job in jobs: + for job in self.jobs: hocr_to_teip5_job_number += 1 cmd = 'parse_hocr "%s" "%s"' % ( os.path.join(job["output_dir"], "tmp", "tesseract"), @@ -164,7 +164,7 @@ class OCRWorkflow(WorkflowRunner): ### move_hocr_jobs = [] move_hocr_job_number = 0 - for job in jobs: + for job in self.jobs: move_hocr_job_number += 1 cmd = 'mv "%s"/*.hocr "%s"' % ( os.path.join(job["output_dir"], "tmp", "tesseract"), @@ -178,7 +178,7 @@ class OCRWorkflow(WorkflowRunner): ### pdf_merge_jobs = [] pdf_merge_job_number = 0 - for job in jobs: + for job in self.jobs: pdf_merge_job_number += 1 cmd = 'pdftk "%s"/*.pdf cat output "%s"' % ( os.path.join(job["output_dir"], "tmp", "tesseract"), @@ -192,7 +192,7 @@ class OCRWorkflow(WorkflowRunner): ### pdf_to_txt_jobs = [] pdf_to_txt_job_number = 0 - for job in jobs: + for job in self.jobs: pdf_to_txt_job_number += 1 cmd = 'pdftotext -raw "%s"' % ( os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".pdf") @@ -206,7 +206,7 @@ class OCRWorkflow(WorkflowRunner): cleanup_jobs = [] cleanup_job_counter = 0 if not self.keepIntermediates: - for job in jobs: + for job in self.jobs: cleanup_job_counter += 1 cmd = 'rm -r "%s"' % ( os.path.join(job["output_dir"], "tmp")