mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:23:14 +00:00 
			
		
		
		
	Some renaming and cleanup.
This commit is contained in:
		| @@ -55,9 +55,7 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata | ||||
|  | ||||
| RUN mkdir files_for_ocr files_from_ocr | ||||
|  | ||||
| COPY ocr_pyflow /usr/local/bin | ||||
| COPY parse_hocr /usr/local/bin | ||||
| COPY ocr /usr/local/bin | ||||
| COPY hocr2tei /usr/local/bin | ||||
|  | ||||
| CMD ["/bin/bash"] | ||||
| @@ -4,7 +4,7 @@ | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| ocr_pyflow.py | ||||
| ocr | ||||
| 
 | ||||
| Usage:  For usage instructions run with option --help | ||||
| Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de> | ||||
| @@ -148,15 +148,15 @@ class OCRWorkflow(WorkflowRunner): | ||||
|         # Task "hocr_to_tei_job": create TEI P5 file from hocr files | ||||
|         # Dependencies: tesseract_jobs | ||||
|         ### | ||||
|         hocr_to_teip5_jobs = [] | ||||
|         hocr_to_teip5_job_number = 0 | ||||
|         hocr_to_tei_jobs = [] | ||||
|         hocr_to_tei_job_number = 0 | ||||
|         for job in self.jobs: | ||||
|             hocr_to_teip5_job_number += 1 | ||||
|             cmd = 'parse_hocr "%s" "%s"' % ( | ||||
|             hocr_to_tei_job_number += 1 | ||||
|             cmd = 'hocrtotei "%s" "%s"' % ( | ||||
|                 os.path.join(job["output_dir"], "tmp", "tesseract"), | ||||
|                 os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml") | ||||
|             ) | ||||
|             hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=tesseract_jobs)) | ||||
|             hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=tesseract_jobs)) | ||||
| 
 | ||||
|         ### | ||||
|         # Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files | ||||
| @@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner): | ||||
|                 os.path.join(job["output_dir"], "tmp", "tesseract"), | ||||
|                 os.path.join(job["output_dir"], "hocr_files") | ||||
|             ) | ||||
|             move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_teip5_jobs)) | ||||
|             move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_tei_jobs)) | ||||
| 
 | ||||
|         ### | ||||
|         # Task "pdf_merge_job": Merge PDF files | ||||
| @@ -211,7 +211,7 @@ class OCRWorkflow(WorkflowRunner): | ||||
|                 cmd = 'rm -r "%s"' % ( | ||||
|                     os.path.join(job["output_dir"], "tmp") | ||||
|                 ) | ||||
|                 cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs)) | ||||
|                 cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs)) | ||||
| 
 | ||||
| 
 | ||||
| def analyze_jobs(inputDir, outputDir, level=1): | ||||
		Reference in New Issue
	
	Block a user