mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:33:15 +00:00 
			
		
		
		
	Correct order for output files.
This commit is contained in:
		
							
								
								
									
										20
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								Dockerfile
									
									
									
									
									
								
							| @@ -1,6 +1,6 @@ | ||||
| FROM debian:stretch-slim | ||||
|  | ||||
| MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de> | ||||
| LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de" | ||||
|  | ||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||
| ENV LANG=C.UTF-8 | ||||
| @@ -11,34 +11,31 @@ RUN apt-get update && \ | ||||
|     ca-certificates \ | ||||
|     gnupg2 \ | ||||
|     imagemagick \ | ||||
|     pdftk \ | ||||
|     poppler-utils \ | ||||
|     python2.7 \ | ||||
|     python3.5 \ | ||||
|     python-numpy \ | ||||
|     wget | ||||
|  | ||||
| WORKDIR /root | ||||
|  | ||||
| # Install ocropy | ||||
| ENV OCROPY_VERSION 1.3.3 | ||||
| RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \ | ||||
|     tar -xzf v"$OCROPY_VERSION".tar.gz && \ | ||||
|     rm v"$OCROPY_VERSION".tar.gz && \ | ||||
|     cd ocropy-"$OCROPY_VERSION" && \ | ||||
|     apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \ | ||||
|     wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \ | ||||
|     python2.7 setup.py install && \ | ||||
|     cd .. | ||||
|     cd .. && \ | ||||
|     rm -r v"$OCROPY_VERSION".tar.gz ocropy-"$OCROPY_VERSION" | ||||
|  | ||||
| # Install pyFlow | ||||
| ENV PYFLOW_VERSION 1.1.20 | ||||
| RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \ | ||||
|     tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \ | ||||
|     rm pyflow-"$PYFLOW_VERSION".tar.gz && \ | ||||
|     cd pyflow-"$PYFLOW_VERSION" && \ | ||||
|     python2.7 setup.py build install && \ | ||||
|     cd .. | ||||
|     cd .. && \ | ||||
|     rm -r pyflow-"$PYFLOW_VERSION".tar.gz pyflow-"$PYFLOW_VERSION" | ||||
|  | ||||
| # Install Tesseract OCR and Data Files | ||||
| RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \ | ||||
| @@ -52,11 +49,12 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata | ||||
|  | ||||
| COPY ocr /usr/local/bin | ||||
| COPY hocrtotei /usr/local/bin | ||||
| COPY ocr /usr/local/bin | ||||
|  | ||||
| CMD ["/bin/bash"] | ||||
| ENTRYPOINT ["ocr"] | ||||
| CMD ["--help"] | ||||
|   | ||||
							
								
								
									
										45
									
								
								hocrtotei
									
									
									
									
									
								
							
							
						
						
									
										45
									
								
								hocrtotei
									
									
									
									
									
								
							| @@ -7,22 +7,31 @@ import os | ||||
| import re | ||||
| import sys | ||||
|  | ||||
| input_files = filter(lambda x: x.endswith(".hocr"), sorted(os.listdir(sys.argv[1]))) | ||||
| input_files = sorted( | ||||
|     filter( | ||||
|         lambda x: x.endswith(".hocr"), | ||||
|         os.listdir(sys.argv[1]) | ||||
|     ), | ||||
|     key=lambda x: int(re.search(r'\d+', x).group(0)) | ||||
| ) | ||||
| # Sort numerically by the first run of digits in the file name, e.g. "page-1.hocr" -> 1 | ||||
| output_file = open(sys.argv[2], "w") | ||||
|  | ||||
| output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n' + | ||||
|            '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n' + | ||||
|            '    <teiHeader>\n' + | ||||
|            '        <fileDesc>\n' + | ||||
|            '            <titleStmt/>\n' + | ||||
|            '            <publicationStmt/>\n' + | ||||
|            '            <sourceDesc/>\n' + | ||||
|            '        </fileDesc>\n' + | ||||
|            '        <encodingDesc/>\n' + | ||||
|            '        <profileDesc/>\n' + | ||||
|            '    </teiHeader>\n' + | ||||
|            '    <text>\n' + | ||||
|            '        <body>\n') | ||||
| output_file.write( | ||||
|       '<?xml version="1.0" encoding="UTF-8"?>\n' | ||||
|     + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n' | ||||
|     + '    <teiHeader>\n' | ||||
|     + '        <fileDesc>\n' | ||||
|     + '            <titleStmt/>\n' | ||||
|     + '            <publicationStmt/>\n' | ||||
|     + '            <sourceDesc/>\n' | ||||
|     + '        </fileDesc>\n' | ||||
|     + '        <encodingDesc/>\n' | ||||
|     + '        <profileDesc/>\n' | ||||
|     + '    </teiHeader>\n' | ||||
|     + '    <text>\n' | ||||
|     + '        <body>\n' | ||||
| ) | ||||
|  | ||||
| for input_file in input_files: | ||||
|     tree = ET.parse(os.path.join(sys.argv[1], input_file)) | ||||
| @@ -40,7 +49,9 @@ for input_file in input_files: | ||||
|                 output_file.write('<lb/>\n') | ||||
|         output_file.write('            </p>\n') | ||||
|  | ||||
| output_file.write('        </body>\n' + | ||||
|            '    </text>\n' + | ||||
|            '</TEI>') | ||||
| output_file.write( | ||||
|       '        </body>\n' | ||||
|     + '    </text>\n' | ||||
|     + '</TEI>') | ||||
|  | ||||
| output_file.close() | ||||
|   | ||||
							
								
								
									
										189
									
								
								ocr
									
									
									
									
									
								
							
							
						
						
									
										189
									
								
								ocr
									
									
									
									
									
								
							| @@ -19,32 +19,27 @@ from pyflow import WorkflowRunner | ||||
|  | ||||
| ''' TODO: | ||||
| ' Implement --end-page: Last page to ocr | ||||
| ' Implement --memMb: Total amount of memory (RAM) available for this workflow. Default: 2048 * nCores | ||||
| ' Implement --memMb: Total amount of memory (RAM) available for this workflow. | ||||
| '                    Default: 2048 * nCores | ||||
| ' Implement --rotate: Rotate pages from input (90, 180, 270) | ||||
| ' Implement --split-pages: Split pages in half after possible rotation | ||||
| ' Implement --start-page: First page to ocr | ||||
| ''' | ||||
|  | ||||
|  | ||||
| def parse_arguments(): | ||||
|     parser = argparse.ArgumentParser( | ||||
|         "Performs OCR of (historical) documents utilizing OCRopus for \ | ||||
|         preprocessing and Tesseract OCR for OCR. Available outputs are HOCR, \ | ||||
|         PDF, shrinked PDF, and simple DTAbf (TEI P5 compliant). Software \ | ||||
|         requirements: imagemagick, ocropus, pdftk, pdftoppm, poppler-utils, \ | ||||
|         pyflow, python2.7, tesseract" | ||||
|         pyflow, python2.7, python3.5, tesseract" | ||||
|     ) | ||||
|  | ||||
|     parser.add_argument("-i", | ||||
|                         dest="inputDir", | ||||
|                         help="Input directory.", | ||||
|                         required=True) | ||||
|     parser.add_argument("-l", | ||||
|                         dest='lang', | ||||
|                         help="Language for OCR", | ||||
|                         required=True) | ||||
|     parser.add_argument("-o", | ||||
|                         dest="outputDir", | ||||
|                         help="Output directory.", | ||||
|                         required=True) | ||||
|     parser.add_argument("--skip-binarization", | ||||
|                         action='store_true', | ||||
|                         default=False, | ||||
| @@ -67,14 +62,16 @@ def parse_arguments(): | ||||
|  | ||||
|  | ||||
| class OCRWorkflow(WorkflowRunner): | ||||
|     def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores): | ||||
|         self.jobs = jobs | ||||
|         self.skipBinarization = skipBinarization | ||||
|         self.keepIntermediates = keepIntermediates | ||||
|         self.lang = lang | ||||
|         self.nCores = nCores | ||||
|         self.defaultNCores = min(nCores, max(1, int(nCores / len(jobs)))) | ||||
|  | ||||
|     def __init__(self, args): | ||||
|         self.jobs = analyze_jobs() | ||||
|         self.skipBinarization = args.skipBinarization | ||||
|         self.keepIntermediates = args.keepIntermediates | ||||
|         self.lang = args.lang | ||||
|         self.nCores = args.nCores | ||||
|         self.defaultNCores = min( | ||||
|             self.nCores, | ||||
|             max(1, int(self.nCores / len(self.jobs))) | ||||
|         ) | ||||
|  | ||||
|     def workflow(self): | ||||
|         ### | ||||
| @@ -93,10 +90,17 @@ class OCRWorkflow(WorkflowRunner): | ||||
|             ) | ||||
|             if not self.skipBinarization: | ||||
|                 cmd += ' "%s" "%s"' % ( | ||||
|                     os.path.join(job["output_dir"], "tmp", "binarized_png"), | ||||
|                     os.path.join(job["output_dir"], "tmp", "normalized_png"), | ||||
|                     os.path.join(job["output_dir"], "tmp", "bin.png"), | ||||
|                     os.path.join(job["output_dir"], "tmp", "nrm.png"), | ||||
|                 ) | ||||
|             create_output_directories_jobs.append( | ||||
|                 self.addTask( | ||||
|                     command=cmd, | ||||
|                     label="create_output_directories_job_-_%i" % ( | ||||
|                         create_output_directories_job_number | ||||
|                     ), | ||||
|                     nCores=self.defaultNCores) | ||||
|                 ) | ||||
|             create_output_directories_jobs.append(self.addTask(label="create_output_directories_job_-_%i" % (create_output_directories_job_number), command=cmd, nCores=self.defaultNCores)) | ||||
|  | ||||
|         ### | ||||
|         # Task "split_job": split input file into one tiff file per page | ||||
| @@ -116,7 +120,14 @@ class OCRWorkflow(WorkflowRunner): | ||||
|                     job["path"], | ||||
|                     os.path.join(job["output_dir"], "tmp", "page") | ||||
|                 ) | ||||
|             split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=create_output_directories_jobs, nCores=self.defaultNCores)) | ||||
|             split_jobs.append( | ||||
|                 self.addTask( | ||||
|                     command=cmd, | ||||
|                     dependencies=create_output_directories_jobs, | ||||
|                     label="split_job_-_%i" % (split_job_number), | ||||
|                     nCores=self.defaultNCores | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|         ### | ||||
|         # Task "ocropus_nlbin_job": binarize tiff files from previous split | ||||
| @@ -132,12 +143,21 @@ class OCRWorkflow(WorkflowRunner): | ||||
|         if not self.skipBinarization: | ||||
|             for job in self.jobs: | ||||
|                 binarization_job_number += 1 | ||||
|                 cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls "%s"/*.tif | sort -V)' % ( | ||||
|                 cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls --quoting-style=shell-escape -v "%s"/*.tif)' % ( | ||||
|                     os.path.join(job["output_dir"], "tmp"), | ||||
|                     binarization_job_nCores, | ||||
|                     os.path.join(job["output_dir"], "tmp") | ||||
|                 ) | ||||
|                 binarization_jobs.append(self.addTask(label="binarization_job_-_%i" % (binarization_job_number), command=cmd, dependencies=split_jobs, nCores=binarization_job_nCores)) | ||||
|                 binarization_jobs.append( | ||||
|                     self.addTask( | ||||
|                         command=cmd, | ||||
|                         dependencies=split_jobs, | ||||
|                         label="binarization_job_-_%i" % ( | ||||
|                             binarization_job_number | ||||
|                         ), | ||||
|                         nCores=binarization_job_nCores | ||||
|                     ) | ||||
|                 ) | ||||
|  | ||||
|         ### | ||||
|         # Task "post_binarization_job": Normalize file names from binarization | ||||
| @@ -152,9 +172,21 @@ class OCRWorkflow(WorkflowRunner): | ||||
|                     post_binarization_job_number += 1 | ||||
|                     cmd = 'mv "%s" "%s"' % ( | ||||
|                         os.path.join(job["output_dir"], "tmp", file), | ||||
|                         os.path.join(job["output_dir"], "tmp", "page-%i.%s" % (int(file.split(".", 1)[0]), file.split(".", 1)[1])), | ||||
|                         os.path.join(job["output_dir"], "tmp", "page-%i.%s" % ( | ||||
|                             int(file.split(".", 1)[0]), | ||||
|                             file.split(".", 1)[1]) | ||||
|                         ), | ||||
|                     ) | ||||
|                     post_binarization_jobs.append( | ||||
|                         self.addTask( | ||||
|                             command=cmd, | ||||
|                             dependencies=binarization_jobs, | ||||
|                             label="post_binarization_job_-_%i" % ( | ||||
|                                 post_binarization_job_number | ||||
|                             ), | ||||
|                             nCores=self.defaultNCores | ||||
|                         ) | ||||
|                     ) | ||||
|                     post_binarization_jobs.append(self.addTask(label="post_binarization_job_-_%i" % (post_binarization_job_number), command=cmd, dependencies=binarization_jobs, nCores=self.defaultNCores)) | ||||
|  | ||||
|         ### | ||||
|         # Task "ocr_job": perform OCR | ||||
| @@ -165,8 +197,8 @@ class OCRWorkflow(WorkflowRunner): | ||||
|         ocr_job_number = 0 | ||||
|         ''' | ||||
|         ' Tesseract runs fastest with four cores. So we run it with either four | ||||
|         ' or, if there are fewer than four cores available for this workflow, the | ||||
|         ' available core number. | ||||
|         ' or, if there are fewer than four cores available for this workflow, | ||||
|         ' the available core number. | ||||
|         ''' | ||||
|         ocr_job_nCores = min(4, self.nCores) | ||||
|         ''' | ||||
| @@ -183,7 +215,14 @@ class OCRWorkflow(WorkflowRunner): | ||||
|                     os.path.join(job["output_dir"], "tmp", file.rsplit(".", 1 if self.skipBinarization else 2)[0]), | ||||
|                     self.lang | ||||
|                 ) | ||||
|                 ocr_jobs.append(self.addTask(label="ocr_job_-_%i" % (ocr_job_number), command=cmd, dependencies=post_binarization_jobs, nCores=ocr_job_nCores)) | ||||
|                 ocr_jobs.append( | ||||
|                     self.addTask( | ||||
|                         command=cmd, | ||||
|                         dependencies=post_binarization_jobs, | ||||
|                         label="ocr_job_-_%i" % (ocr_job_number), | ||||
|                         nCores=ocr_job_nCores | ||||
|                     ) | ||||
|                 ) | ||||
|  | ||||
|         ### | ||||
|         # Task "hocr_to_tei_job": create TEI P5 file from hocr files | ||||
| @@ -197,7 +236,14 @@ class OCRWorkflow(WorkflowRunner): | ||||
|                 os.path.join(job["output_dir"], "tmp"), | ||||
|                 os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".xml") | ||||
|             ) | ||||
|             hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores)) | ||||
|             hocr_to_tei_jobs.append( | ||||
|                 self.addTask( | ||||
|                     command=cmd, | ||||
|                     dependencies=ocr_jobs, | ||||
|                     label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), | ||||
|                     nCores=self.defaultNCores | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|         ### | ||||
|         # Task "pdf_merge_job": Merge PDF files | ||||
| @@ -207,11 +253,18 @@ class OCRWorkflow(WorkflowRunner): | ||||
|         pdf_merge_job_number = 0 | ||||
|         for job in self.jobs: | ||||
|             pdf_merge_job_number += 1 | ||||
|             cmd = 'pdftk $(ls "%s"/*.pdf | sort -V) cat output "%s"' % ( | ||||
|             cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % ( | ||||
|                 os.path.join(job["output_dir"], "tmp"), | ||||
|                 os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".pdf") | ||||
|             ) | ||||
|             pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores)) | ||||
|             pdf_merge_jobs.append( | ||||
|                 self.addTask( | ||||
|                     command=cmd, | ||||
|                     dependencies=ocr_jobs, | ||||
|                     label="pdf_merge_job_-_%i" % (pdf_merge_job_number), | ||||
|                     nCores=self.defaultNCores | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|         ### | ||||
|         # Task "txt_merge_job": Merge .txt files | ||||
| @@ -221,11 +274,18 @@ class OCRWorkflow(WorkflowRunner): | ||||
|         txt_merge_job_number = 0 | ||||
|         for job in self.jobs: | ||||
|             txt_merge_job_number += 1 | ||||
|             cmd = 'cat $(ls "%s"/*.txt | sort -V) > "%s"' % ( | ||||
|             cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % ( | ||||
|                 os.path.join(job["output_dir"], "tmp"), | ||||
|                 os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".txt") | ||||
|             ) | ||||
|             txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores)) | ||||
|             txt_merge_jobs.append( | ||||
|                 self.addTask( | ||||
|                     command=cmd, | ||||
|                     dependencies=ocr_jobs, | ||||
|                     label="txt_merge_job_-_%i" % (txt_merge_job_number), | ||||
|                     nCores=self.defaultNCores | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|         ### | ||||
|         # Task "cleanup_job": remove temporary files | ||||
| @@ -236,35 +296,59 @@ class OCRWorkflow(WorkflowRunner): | ||||
|         if self.keepIntermediates: | ||||
|             for job in self.jobs: | ||||
|                 cleanup_job_counter += 1 | ||||
|                 cmd = 'mv "%s"/*.hocr "%s" && mv "%s"/*.pdf "%s" && mv "%s"/*.tif "%s" && mv "%s"/*.txt "%s"' % ( | ||||
|                 cmd = 'mv "%s"/*.hocr "%s"' % ( | ||||
|                     os.path.join(job["output_dir"], "tmp"), | ||||
|                     os.path.join(job["output_dir"], "tmp", "hocr"), | ||||
|                 ) | ||||
|                 cmd += ' && mv "%s"/*.pdf "%s"' % ( | ||||
|                     os.path.join(job["output_dir"], "tmp"), | ||||
|                     os.path.join(job["output_dir"], "tmp", "pdf"), | ||||
|                 ) | ||||
|                 cmd += ' && mv "%s"/*.tif "%s"' % ( | ||||
|                     os.path.join(job["output_dir"], "tmp"), | ||||
|                     os.path.join(job["output_dir"], "tmp", "tiff"), | ||||
|                 ) | ||||
|                 cmd += ' && mv "%s"/*.txt "%s"' % ( | ||||
|                     os.path.join(job["output_dir"], "tmp"), | ||||
|                     os.path.join(job["output_dir"], "tmp", "txt") | ||||
|                     os.path.join(job["output_dir"], "tmp", "txt"), | ||||
|                 ) | ||||
|                 if not self.skipBinarization: | ||||
|                     cmd += ' && mv "%s"/*.bin.png "%s" && mv "%s"/*.nrm.png "%s"' % ( | ||||
|                     cmd += ' && mv "%s"/*.bin.png "%s"' % ( | ||||
|                         os.path.join(job["output_dir"], "tmp"), | ||||
|                         os.path.join(job["output_dir"], "tmp", "binarized_png"), | ||||
|                         os.path.join(job["output_dir"], "tmp"), | ||||
|                         os.path.join(job["output_dir"], "tmp", "normalized_png"), | ||||
|                         os.path.join(job["output_dir"], "tmp", "bin.png"), | ||||
|                     ) | ||||
|                 cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs, nCores=self.defaultNCores)) | ||||
|                     cmd += ' && mv "%s"/*.nrm.png "%s"' % ( | ||||
|                         os.path.join(job["output_dir"], "tmp"), | ||||
|                         os.path.join(job["output_dir"], "tmp", "nrm.png"), | ||||
|                     ) | ||||
|             cleanup_jobs.append( | ||||
|                 self.addTask( | ||||
|                     command=cmd, | ||||
|                     dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs, | ||||
|                     label="cleanup_job_-_%i" % (cleanup_job_counter), | ||||
|                     nCores=self.defaultNCores | ||||
|                 ) | ||||
|             ) | ||||
|         else: | ||||
|             for job in self.jobs: | ||||
|                 cleanup_job_counter += 1 | ||||
|                 cmd = 'rm -r "%s"' % ( | ||||
|                     os.path.join(job["output_dir"], "tmp") | ||||
|                 ) | ||||
|                 cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs), nCores=self.defaultNCores) | ||||
|                 cleanup_jobs.append( | ||||
|                     self.addTask( | ||||
|                         label="cleanup_job_-_%i" % (cleanup_job_counter), | ||||
|                         command=cmd, | ||||
|                         dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs, | ||||
|                         nCores=self.defaultNCores | ||||
|                     ) | ||||
|                 ) | ||||
|  | ||||
|  | ||||
| def analyze_jobs(inputDir, outputDir): | ||||
| def analyze_jobs(): | ||||
|     inputDir = "/files_for_ocr" | ||||
|     jobs = [] | ||||
|     outputDir = "/files_from_ocr" | ||||
|  | ||||
|     for file in os.listdir(inputDir): | ||||
|         if os.path.isdir(os.path.join(inputDir, file)): | ||||
| @@ -273,7 +357,13 @@ def analyze_jobs(inputDir, outputDir): | ||||
|                 os.path.join(outputDir, file) | ||||
|             ) | ||||
|         elif file.endswith((".pdf", ".tif", ".tiff")): | ||||
|             jobs.append({"filename": file, "output_dir": os.path.join(outputDir, file), "path": os.path.join(inputDir, file)}) | ||||
|             jobs.append( | ||||
|                 { | ||||
|                     "filename": file, | ||||
|                     "output_dir": os.path.join(outputDir, file), | ||||
|                     "path": os.path.join(inputDir, file) | ||||
|                 } | ||||
|             ) | ||||
|  | ||||
|     return jobs | ||||
|  | ||||
| @@ -281,15 +371,10 @@ def analyze_jobs(inputDir, outputDir): | ||||
| def main(): | ||||
|     args = parse_arguments() | ||||
|  | ||||
|     wflow = OCRWorkflow( | ||||
|         analyze_jobs(args.inputDir, args.outputDir), | ||||
|         args.skipBinarization, | ||||
|         args.keepIntermediates, | ||||
|         args.lang, | ||||
|         args.nCores | ||||
|     ) | ||||
|     wflow = OCRWorkflow(args) | ||||
|  | ||||
|     retval = wflow.run(dataDirRoot="/files_from_ocr", nCores=args.nCores) | ||||
|  | ||||
|     retval = wflow.run(nCores=args.nCores) | ||||
|     sys.exit(retval) | ||||
|  | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user