mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-22 22:35:27 +00:00 
			
		
		
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			a0760487ae
			...
			8a3816121c
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 8a3816121c | ||
|  | e1b78b6ba4 | 
							
								
								
									
										39
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										39
									
								
								Dockerfile
									
									
									
									
									
								
							| @@ -9,8 +9,14 @@ ENV LANG=C.UTF-8 | |||||||
|  |  | ||||||
| RUN apt-get update \ | RUN apt-get update \ | ||||||
|  && apt-get install --no-install-recommends --yes \ |  && apt-get install --no-install-recommends --yes \ | ||||||
|       wget |       ghostscript \ | ||||||
|  |       procps \ | ||||||
|  |       python3.7 \ | ||||||
|  |       python3-pip \ | ||||||
|  |       rename \ | ||||||
|  |       wget \ | ||||||
|  |       zip \ | ||||||
|  |  && python3 -m pip install lxml | ||||||
|  |  | ||||||
| # Install the OCR pipeline and its dependencies # | # Install the OCR pipeline and its dependencies # | ||||||
| ## Install pyFlow ## | ## Install pyFlow ## | ||||||
| @@ -43,7 +49,7 @@ RUN wget --no-check-certificate --quiet \ | |||||||
|  |  | ||||||
|  |  | ||||||
| ## Install Tesseract OCR ## | ## Install Tesseract OCR ## | ||||||
| ENV TESSERACT_VERSION=4.1.1 | ENV TESSERACT_VERSION=5.0.0 | ||||||
| RUN wget --no-check-certificate --quiet \ | RUN wget --no-check-certificate --quiet \ | ||||||
|       "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \ |       "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \ | ||||||
|  && tar -xzf "${TESSERACT_VERSION}.tar.gz" \ |  && tar -xzf "${TESSERACT_VERSION}.tar.gz" \ | ||||||
| @@ -61,37 +67,20 @@ RUN wget --no-check-certificate --quiet \ | |||||||
|       pkg-config \ |       pkg-config \ | ||||||
|       zlib1g-dev \ |       zlib1g-dev \ | ||||||
|  && ./autogen.sh \ |  && ./autogen.sh \ | ||||||
|  && ./configure \ |  && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \ | ||||||
|  && make \ |  && make \ | ||||||
|  && make install \ |  && make install \ | ||||||
|  && ldconfig \ |  && ldconfig \ | ||||||
|  && cd - > /dev/null \ |  && cd - > /dev/null \ | ||||||
|  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz" |  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz" | ||||||
|  |  | ||||||
| ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa" |  | ||||||
| ENV TESSDATA_BEST_VERSION=4.1.0 |  | ||||||
| RUN wget --no-check-certificate --quiet \ |  | ||||||
|       "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \ |  | ||||||
|  && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \ |  | ||||||
|  && for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \ |  | ||||||
|  && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ## Further dependencies ## |  | ||||||
| RUN apt-get install --no-install-recommends --yes \ |  | ||||||
|       procps \ |  | ||||||
|       ghostscript \ |  | ||||||
|       python3.7 \ |  | ||||||
|       rename \ |  | ||||||
|       zip |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ## Install Pipeline ## |  | ||||||
| COPY hocrtotei ocr /usr/local/bin/ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| RUN rm -r /var/lib/apt/lists/* | RUN rm -r /var/lib/apt/lists/* | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ## Install Pipeline ## | ||||||
|  | COPY hocr2tei hocr-combine ocr /usr/local/bin/ | ||||||
|  |  | ||||||
|  |  | ||||||
| ENTRYPOINT ["ocr"] | ENTRYPOINT ["ocr"] | ||||||
| CMD ["--help"] | CMD ["--help"] | ||||||
|   | |||||||
							
								
								
									
										21
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | |||||||
|  | MIT License | ||||||
|  |  | ||||||
|  | Copyright (c) 2021 Bielefeld University - CRC 1288 - INF | ||||||
|  |  | ||||||
|  | Permission is hereby granted, free of charge, to any person obtaining a copy | ||||||
|  | of this software and associated documentation files (the "Software"), to deal | ||||||
|  | in the Software without restriction, including without limitation the rights | ||||||
|  | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||||||
|  | copies of the Software, and to permit persons to whom the Software is | ||||||
|  | furnished to do so, subject to the following conditions: | ||||||
|  |  | ||||||
|  | The above copyright notice and this permission notice shall be included in all | ||||||
|  | copies or substantial portions of the Software. | ||||||
|  |  | ||||||
|  | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||||
|  | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||||
|  | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||||||
|  | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||||
|  | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||||||
|  | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||||
|  | SOFTWARE. | ||||||
							
								
								
									
										43
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										43
									
								
								README.md
									
									
									
									
									
								
							| @@ -1,6 +1,6 @@ | |||||||
| # OCR - Optical Character Recognition | # OCR - Optical Character Recognition | ||||||
|  |  | ||||||
| This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone; for that purpose a convenient wrapper script is provided. | This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone; for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed. | ||||||
|  |  | ||||||
| ## Software used in this pipeline implementation | ## Software used in this pipeline implementation | ||||||
|  |  | ||||||
| @@ -8,37 +8,26 @@ This software implements a heavily parallelized pipeline to recognize text in PD | |||||||
|   - Software from Debian Buster's free repositories |   - Software from Debian Buster's free repositories | ||||||
| - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3 | - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3 | ||||||
| - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 | - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 | ||||||
| - Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1 | - Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0 | ||||||
| - tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0 |  | ||||||
|  |  | ||||||
| ## Use this image | ## Installation | ||||||
|  |  | ||||||
| 1. Create input and output directories for the pipeline. | 1. Install Docker and Python 3. | ||||||
| ``` bash | 2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git` | ||||||
| mkdir -p /<my_data_location>/input /<my_data_location>/output | 3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr` | ||||||
| ``` | 4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`. | ||||||
|  | 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`. | ||||||
|  | 6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`. | ||||||
|  |  | ||||||
| 2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language. | ## Use the Pipeline | ||||||
|  |  | ||||||
|  | 1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language. | ||||||
|  | 2. Clear your `/<my_data_location>/output` directory. | ||||||
| 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details. | 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details. | ||||||
| ``` | ```bash | ||||||
| # Option one: Use the wrapper script |  | ||||||
| ## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executable and add it to your ${PATH} |  | ||||||
| cd /<my_data_location> | cd /<my_data_location> | ||||||
| ocr -i input -l <language_code> -o output <optional_pipeline_arguments> | ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments> | ||||||
|  | # or | ||||||
| # Option two: Classic Docker style | ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments> | ||||||
| docker run \ |  | ||||||
|     --rm \ |  | ||||||
|     -it \ |  | ||||||
|     -u $(id -u $USER):$(id -g $USER) \ |  | ||||||
|     -v /<my_data_location>/input:/input \ |  | ||||||
|     -v /<my_data_location>/output:/output \ |  | ||||||
|     gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \ |  | ||||||
|         -i /ocr_pipeline/input \ |  | ||||||
|         -l <language_code> \ |  | ||||||
|         -o /ocr_pipeline/output \ |  | ||||||
|         <optional_pipeline_arguments> |  | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| 4. Check your results in the `/<my_data_location>/output` directory. | 4. Check your results in the `/<my_data_location>/output` directory. | ||||||
|   | |||||||
							
								
								
									
										35
									
								
								hocr-combine
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								hocr-combine
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | |||||||
|  | #!/usr/bin/env python3.7 | ||||||
|  | # coding=utf-8 | ||||||
|  |  | ||||||
|  | """"Combine multiple hOCR files.""" | ||||||
|  |  | ||||||
|  | from argparse import ArgumentParser | ||||||
|  | from lxml import html | ||||||
|  |  | ||||||
|  |  | ||||||
|  | parser = ArgumentParser(description='Combine multiple hOCR files.') | ||||||
|  | parser.add_argument('file', help='Input file(s)', nargs='+') | ||||||
|  | parser.add_argument('-o', '--output-file', help='Output file', required=True) | ||||||
|  | args = parser.parse_args() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | for file in args.file: | ||||||
|  |     files = [] | ||||||
|  |     if file.startswith('@'): | ||||||
|  |         with open(file[1:], 'r') as f: | ||||||
|  |             files += [x for x in f.read().split("\n") if x != ''] | ||||||
|  |     else: | ||||||
|  |         files.append(file) | ||||||
|  | if len(files) == 0: | ||||||
|  |     exit(1) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | hocr = html.parse(files[0]) | ||||||
|  | hocr_body = hocr.find('body') | ||||||
|  | for file in files[1:]: | ||||||
|  |     for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'): | ||||||
|  |         hocr_body.append(ocr_page) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | with open(args.output_file, 'wb') as f: | ||||||
|  |     hocr.write(f, encoding='UTF-8', method='html') | ||||||
							
								
								
									
										39
									
								
								hocrtotei → hocr2tei
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										39
									
								
								hocrtotei → hocr2tei
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							| @@ -3,16 +3,18 @@ | |||||||
| 
 | 
 | ||||||
| """"Convert hOCR to TEI XML.""" | """"Convert hOCR to TEI XML.""" | ||||||
| 
 | 
 | ||||||
| from xml.sax.saxutils import escape |  | ||||||
| from argparse import ArgumentParser | from argparse import ArgumentParser | ||||||
|  | from lxml import html | ||||||
|  | from xml.sax.saxutils import escape | ||||||
| import re | import re | ||||||
| import xml.etree.ElementTree as ET | 
 | ||||||
| 
 | 
 | ||||||
| parser = ArgumentParser(description='Convert hOCR to TEI XML.') | parser = ArgumentParser(description='Convert hOCR to TEI XML.') | ||||||
| parser.add_argument('input', metavar='Path to hOCR input file') | parser.add_argument('file', help='Input file') | ||||||
| parser.add_argument('output', metavar='Path to TEI output file') | parser.add_argument('-o', '--output-file', help='Output file', required=True) | ||||||
| args = parser.parse_args() | args = parser.parse_args() | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| tei = '' | tei = '' | ||||||
| tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n' | tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n' | ||||||
| tei += '  <teiHeader>\n' | tei += '  <teiHeader>\n' | ||||||
| @@ -30,28 +32,27 @@ tei += '    </fileDesc>\n' | |||||||
| tei += '  </teiHeader>\n' | tei += '  </teiHeader>\n' | ||||||
| tei += '  <text>\n' | tei += '  <text>\n' | ||||||
| tei += '    <body>\n' | tei += '    <body>\n' | ||||||
| # Conversion start | hocr = html.parse(args.file) | ||||||
| hocr = ET.parse(args.input) | for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): | ||||||
| for page in hocr.findall('.//*[@class="ocr_page"]'): |     ocr_page_title_attrib = ocr_page.attrib.get('title') | ||||||
|     page_properties = page.attrib.get('title') |     facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1) | ||||||
|     facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1) |     page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1) | ||||||
|     page_number = re.search(r'ppageno (\d+)', page_properties).group(1) |     tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n' | ||||||
|     tei += '      <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number) |     for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'): | ||||||
|     for para in page.findall('.//*[@class="ocr_par"]'): |  | ||||||
|         tei += '      <p>\n' |         tei += '      <p>\n' | ||||||
|         for line in para.findall('.//*[@class="ocr_line"]'): |         for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'): | ||||||
|             tei += '        <lb/>' |             tei += '        <lb/>' | ||||||
|             indent = '' |             indent = '' | ||||||
|             for word in line.findall('.//*[@class="ocrx_word"]'): |             for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'): | ||||||
|                 if word.text is not None: |                 if ocrx_word.text is not None: | ||||||
|                     tei += indent + escape(word.text.strip()) |                     tei += indent + escape(ocrx_word.text) | ||||||
|                     indent = ' ' |                     indent = ' ' | ||||||
|             tei += '\n' |             tei += '\n' | ||||||
|         tei += '      </p>\n' |         tei += '      </p>\n' | ||||||
| # Conversion end |  | ||||||
| tei += '    </body>\n' | tei += '    </body>\n' | ||||||
| tei += '  </text>\n' | tei += '  </text>\n' | ||||||
| tei += '</TEI>\n' | tei += '</TEI>\n' | ||||||
| 
 | 
 | ||||||
| with open(args.output, 'w') as tei_file: | 
 | ||||||
|     tei_file.write(tei) | with open(args.output_file, 'w') as f: | ||||||
|  |     f.write(tei) | ||||||
							
								
								
									
										711
									
								
								ocr
									
									
									
									
									
								
							
							
						
						
									
										711
									
								
								ocr
									
									
									
									
									
								
							| @@ -1,11 +1,9 @@ | |||||||
| #!/usr/bin/env python2.7 | #!/usr/bin/env python2.7 | ||||||
| # coding=utf-8 | # coding=utf-8 | ||||||
|  |  | ||||||
| """OCR pipeline for PDF file processing.""" | ''' OCR pipeline for PDF file processing. ''' | ||||||
|  | __version__ = '0.1.0' | ||||||
|  |  | ||||||
| __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \ |  | ||||||
|              'Stephan Porada <porada@posteo.de>' |  | ||||||
| __version__ = '1.0.0' |  | ||||||
|  |  | ||||||
| from argparse import ArgumentParser | from argparse import ArgumentParser | ||||||
| from pyflow import WorkflowRunner | from pyflow import WorkflowRunner | ||||||
| @@ -14,145 +12,402 @@ import os | |||||||
| import sys | import sys | ||||||
|  |  | ||||||
|  |  | ||||||
| class OCRPipelineJob: | class PipelineJob: | ||||||
|     """An OCR pipeline job class |     ''' | ||||||
|  |     OCR pipeline job class. | ||||||
|  |  | ||||||
|     Each input file of the pipeline is represented as an OCR pipeline job, |     Each input file of the pipeline is represented as an OCR pipeline job, | ||||||
|     which holds all necessary information for the pipeline to process it. |     which holds all necessary information for the pipeline to process it. | ||||||
|  |  | ||||||
|     Arguments: |     Arguments: | ||||||
|     file -- Path to the file |     file -- Path to the file | ||||||
|     output_dir -- Path to a directory, where job results a stored |     output_dir -- Path to a directory, where job results are stored | ||||||
|     """ |     ''' | ||||||
|  |  | ||||||
|     def __init__(self, file, output_dir): |     def __init__(self, file, output_dir): | ||||||
|         self.file = file |         self.file = file | ||||||
|         self.name = os.path.basename(file).rsplit('.', 1)[0] |         self.name = os.path.basename(file)[:-4] | ||||||
|         self.output_dir = output_dir |         self.output_dir = output_dir | ||||||
|         self.page_dir = os.path.join(output_dir, 'pages') |         self.tmp_dir = os.path.join(output_dir, 'tmp') | ||||||
|  |  | ||||||
|  |  | ||||||
| class OCRPipeline(WorkflowRunner): | class SplitInputWorkflow(WorkflowRunner): | ||||||
|     def __init__(self, input_dir, lang, output_dir, binarize, zip): |     def __init__(self, job): | ||||||
|  |         self.job = job | ||||||
|  |  | ||||||
|  |     def workflow(self): | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # gs                                             # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         n_cores = min(2, self.getNCores()) | ||||||
|  |         mem_mb = min(n_cores * 512, self.getMemMb()) | ||||||
|  |         cmd = 'gs' | ||||||
|  |         cmd += ' -dBATCH' | ||||||
|  |         cmd += ' -dNOPAUSE' | ||||||
|  |         cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000) | ||||||
|  |         cmd += ' -dNumRenderingThreads={}'.format(n_cores) | ||||||
|  |         cmd += ' -dQUIET' | ||||||
|  |         cmd += ' -r300' | ||||||
|  |         cmd += ' -sDEVICE=png16m' | ||||||
|  |         cmd += ' -sOutputFile="{}/page-%d.png"'.format( | ||||||
|  |             os.path.join(self.job.tmp_dir, 'images') | ||||||
|  |         ) | ||||||
|  |         cmd += ' "{}"'.format(self.job.file) | ||||||
|  |         self.addTask( | ||||||
|  |             'gs', | ||||||
|  |             command=cmd, | ||||||
|  |             memMb=mem_mb, | ||||||
|  |             nCores=n_cores | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class BinarizationWorkflow(WorkflowRunner): | ||||||
|  |     def __init__(self, job): | ||||||
|  |         self.job = job | ||||||
|  |  | ||||||
|  |     def workflow(self): | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # ocropus-nlbin                                  # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         # TODO: Update to newer ocropus-nlbin and start one task per page | ||||||
|  |         n_cores = self.getNCores() | ||||||
|  |         mem_mb = min(512 * n_cores, self.getMemMb()) | ||||||
|  |         cmd = 'ls -dv "{}/"* > "{}"'.format( | ||||||
|  |             os.path.join(self.job.tmp_dir, 'images'), | ||||||
|  |             os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') | ||||||
|  |         ) | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa | ||||||
|  |         cmd += ' --nocheck' | ||||||
|  |         cmd += ' --output "{}"'.format( | ||||||
|  |             os.path.join(self.job.tmp_dir, 'images')) | ||||||
|  |         cmd += ' --parallel "{}"'.format(n_cores) | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa | ||||||
|  |         ocropus_nlbin_task = self.addTask( | ||||||
|  |             'ocropus_nlbin', | ||||||
|  |             command=cmd, | ||||||
|  |             memMb=mem_mb, | ||||||
|  |             nCores=n_cores | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # cleanup                                        # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         n_cores = 1 | ||||||
|  |         mem_mb = min(128, self.getMemMb()) | ||||||
|  |         cmd = 'cd "{}"'.format(os.path.join(self.job.tmp_dir, 'images')) | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'mkdir tmp' | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'mv *.bin.png tmp' | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'rm *.png' | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'mv tmp/* .' | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'rmdir tmp' | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'rename \'s/^0*/page-/\' *' | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'rename \'s/.bin.png$/.png/\' *' | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'cd -' | ||||||
|  |         self.addTask( | ||||||
|  |             'cleanup', | ||||||
|  |             command=cmd, | ||||||
|  |             dependencies=ocropus_nlbin_task, | ||||||
|  |             memMb=mem_mb, | ||||||
|  |             nCores=n_cores | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OCRWorkflow(WorkflowRunner): | ||||||
|  |     def __init__(self, job, lang): | ||||||
|  |         self.job = job | ||||||
|  |         self.lang = lang | ||||||
|  |  | ||||||
|  |     def workflow(self): | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # tesseract                                      # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         tesseract_tasks = [] | ||||||
|  |         n_cores = 1 | ||||||
|  |         mem_mb = min(512, self.getMemMb()) | ||||||
|  |         for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'images'))):  # noqa | ||||||
|  |             cmd = 'tesseract "{}" "{}"'.format( | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'images', file), | ||||||
|  |                 os.path.join(self.job.tmp_dir, file[:-4]) | ||||||
|  |             ) | ||||||
|  |             cmd += ' -l "{}"'.format(self.lang) | ||||||
|  |             cmd += ' hocr pdf txt' | ||||||
|  |             cmd += ' || ' | ||||||
|  |             cmd += 'echo "${?}"' | ||||||
|  |             task = self.addTask( | ||||||
|  |                 'tesseract_-_{}'.format(i), | ||||||
|  |                 command=cmd, | ||||||
|  |                 env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)}, | ||||||
|  |                 memMb=mem_mb, | ||||||
|  |                 nCores=n_cores | ||||||
|  |             ) | ||||||
|  |             tesseract_tasks.append(task) | ||||||
|  |  | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # move_files                                     # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         n_cores = 1 | ||||||
|  |         mem_mb = min(128, self.getMemMb()) | ||||||
|  |         for i, file_extension in enumerate(['hocr', 'pdf', 'txt']): | ||||||
|  |             cmd = 'mv "{}/"*.{} "{}"'.format( | ||||||
|  |                 self.job.tmp_dir, | ||||||
|  |                 file_extension, | ||||||
|  |                 os.path.join(self.job.tmp_dir, file_extension) | ||||||
|  |             ) | ||||||
|  |             self.addTask( | ||||||
|  |                 'move_{}_files'.format(file_extension), | ||||||
|  |                 command=cmd, | ||||||
|  |                 dependencies=tesseract_tasks, | ||||||
|  |                 memMb=mem_mb, | ||||||
|  |                 nCores=n_cores | ||||||
|  |             ) | ||||||
|  |         cmd = 'mv "{}" "{}"'.format( | ||||||
|  |             os.path.join(self.job.tmp_dir, 'images'), | ||||||
|  |             os.path.join(self.job.output_dir) | ||||||
|  |         ) | ||||||
|  |         self.addTask( | ||||||
|  |             'move_image_files', | ||||||
|  |             command=cmd, | ||||||
|  |             dependencies=tesseract_tasks, | ||||||
|  |             memMb=mem_mb, | ||||||
|  |             nCores=n_cores | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class CreateHOCRWorkflow(WorkflowRunner): | ||||||
|  |     def __init__(self, job): | ||||||
|  |         self.job = job | ||||||
|  |  | ||||||
|  |     def workflow(self): | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # fix-hocr                                       # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         fix_hocr_tasks = [] | ||||||
|  |         n_cores = 1 | ||||||
|  |         mem_mb = min(256, self.getMemMb()) | ||||||
|  |         for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'hocr'))):  # noqa | ||||||
|  |             cmd = 'sed -i \'s>{}>images>g\' "{}"'.format( | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'images'), | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'hocr', file) | ||||||
|  |             ) | ||||||
|  |             cmd += ' && ' | ||||||
|  |             cmd += 'sed -i \'s>ppageno [0-9]\\+>ppageno {}>g\' "{}"'.format( | ||||||
|  |                 file[5:-5], | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'hocr', file) | ||||||
|  |             ) | ||||||
|  |             cmd += ' && ' | ||||||
|  |             cmd += 'sed -i \'s>page_[0-9]\\+>page_{}>g\' "{}"'.format( | ||||||
|  |                 file[5:-5], | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'hocr', file) | ||||||
|  |             ) | ||||||
|  |             cmd += ' && ' | ||||||
|  |             cmd += 'sed -i \'s>block_[0-9]\\+>block_{}>g\' "{}"'.format( | ||||||
|  |                 file[5:-5], | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'hocr', file) | ||||||
|  |             ) | ||||||
|  |             cmd += ' && ' | ||||||
|  |             cmd += 'sed -i \'s>par_[0-9]\\+>par_{}>g\' "{}"'.format( | ||||||
|  |                 file[5:-5], | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'hocr', file) | ||||||
|  |             ) | ||||||
|  |             cmd += ' && ' | ||||||
|  |             cmd += 'sed -i \'s>line_[0-9]\\+>line_{}>g\' "{}"'.format( | ||||||
|  |                 file[5:-5], | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'hocr', file) | ||||||
|  |             ) | ||||||
|  |             cmd += ' && ' | ||||||
|  |             cmd += 'sed -i \'s>word_[0-9]\\+>word_{}>g\' "{}"'.format( | ||||||
|  |                 file[5:-5], | ||||||
|  |                 os.path.join(self.job.tmp_dir, 'hocr', file) | ||||||
|  |             ) | ||||||
|  |             task = self.addTask( | ||||||
|  |                 'fix-hocr_-_{}'.format(i), | ||||||
|  |                 command=cmd, | ||||||
|  |                 memMb=mem_mb, | ||||||
|  |                 nCores=n_cores | ||||||
|  |             ) | ||||||
|  |             fix_hocr_tasks.append(task) | ||||||
|  |  | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # hocr-combine                                   # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         n_cores = 1 | ||||||
|  |         mem_mb = min(512, self.getMemMb()) | ||||||
|  |         cmd = 'ls -dv "{}/"* > "{}"'.format( | ||||||
|  |             os.path.join(self.job.tmp_dir, 'hocr'), | ||||||
|  |             os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') | ||||||
|  |         ) | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'hocr-combine "@{}"'.format( | ||||||
|  |             os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') | ||||||
|  |         ) | ||||||
|  |         cmd += ' --output-file "{}.hocr"'.format( | ||||||
|  |             os.path.join(self.job.output_dir, self.job.name) | ||||||
|  |         ) | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'hocr')) | ||||||
|  |         self.addTask( | ||||||
|  |             'hocr_combine', | ||||||
|  |             command=cmd, | ||||||
|  |             dependencies=fix_hocr_tasks, | ||||||
|  |             memMb=mem_mb, | ||||||
|  |             nCores=n_cores | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
class CreatePDFWorkflow(WorkflowRunner):
    '''
    Workflow that merges everything found in the job's temporary "pdf"
    directory into one "<job name>.pdf" inside the job's output directory
    and removes the temporary "pdf" directory afterwards.
    '''

    def __init__(self, job):
        # The workflow reads job.tmp_dir, job.output_dir and job.name
        self.job = job

    def workflow(self):
        '''
        Schedule the single "pdf_combine" task.

        The task asks for at most 2 cores and 256 MB per core, both capped
        by what the runner reports as available.
        '''
        n_cores = min(2, self.getNCores())
        mem_mb = min(n_cores * 256, self.getMemMb())
        pdf_dir = os.path.join(self.job.tmp_dir, 'pdf')
        output_base = os.path.join(self.job.output_dir, self.job.name)
        # ls -v yields the entries in natural (version) order and -Q quotes
        # them, xargs then appends them as the input files of gs.
        gs_call = ' '.join([
            'xargs gs',
            '-dBATCH',
            '-dNOPAUSE',
            # presumably Ghostscript expects bytes here - TODO confirm
            '-dBufferSpace={}'.format(mem_mb * 1000000),
            '-dNumRenderingThreads={}'.format(n_cores),
            '-dPDFSETTINGS=/ebook',
            '-dQUIET',
            '-sDEVICE=pdfwrite',
            '-sOutputFile="{}.pdf"'.format(output_base),
        ])
        # The directory is only removed when the combine step succeeded
        cmd = 'ls -dQv "{}"/* | {} && rm -r "{}"'.format(
            pdf_dir, gs_call, pdf_dir
        )
        self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
|  |  | ||||||
|  |  | ||||||
class CreateTEIWorkflow(WorkflowRunner):
    '''
    Workflow that runs hocr2tei on "<job name>.hocr" in the job's output
    directory and writes the result next to it as "<job name>.xml".
    '''

    def __init__(self, job):
        # The workflow reads job.output_dir and job.name
        self.job = job

    def workflow(self):
        '''
        Schedule the single "hocr2tei" task (1 core, at most 512 MB).
        '''
        n_cores = 1
        mem_mb = min(512, self.getMemMb())
        # Input and output share the same base path, only the extension
        # differs.
        base_path = os.path.join(self.job.output_dir, self.job.name)
        cmd = 'hocr2tei "{0}.hocr" --output-file "{0}.xml"'.format(base_path)
        self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
|  |  | ||||||
|  |  | ||||||
class CreateTxtWorkflow(WorkflowRunner):
    '''
    Workflow that concatenates everything found in the job's temporary
    "txt" directory into one "<job name>.txt" inside the job's output
    directory and removes the temporary "txt" directory afterwards.
    '''

    def __init__(self, job):
        # The workflow reads job.tmp_dir, job.output_dir and job.name
        self.job = job

    def workflow(self):
        '''
        Schedule the single "txt_combine" task (1 core, at most 512 MB).
        '''
        n_cores = 1
        mem_mb = min(512, self.getMemMb())
        txt_dir = os.path.join(self.job.tmp_dir, 'txt')
        output_file = '{}.txt'.format(
            os.path.join(self.job.output_dir, self.job.name)
        )
        steps = [
            # ls -v keeps the natural (version) order, -Q quotes the names
            # so that xargs passes each of them to cat as one argument.
            'ls -dQv "{}"/* | xargs cat > "{}"'.format(txt_dir, output_file),
            # Only reached when the concatenation succeeded
            'rm -r "{}"'.format(txt_dir),
        ]
        cmd = ' && '.join(steps)
        self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
|  |  | ||||||
|  |  | ||||||
|  | class MainWorkflow(WorkflowRunner): | ||||||
|  |     def __init__(self, input_dir, lang, output_dir, binarize): | ||||||
|         self.input_dir = input_dir |         self.input_dir = input_dir | ||||||
|         self.lang = lang |         self.lang = lang | ||||||
|         self.output_dir = output_dir |         self.output_dir = output_dir | ||||||
|         self.binarize = binarize |         self.binarize = binarize | ||||||
|         self.zip = zip |         self.jobs = self.collect_jobs() | ||||||
|         self.jobs = collect_jobs(self.input_dir, self.output_dir) |  | ||||||
|  |     def collect_jobs(self): | ||||||
|  |         jobs = [] | ||||||
|  |         for file in os.listdir(self.input_dir): | ||||||
|  |             if os.path.isdir(os.path.join(self.input_dir, file)): | ||||||
|  |                 continue | ||||||
|  |             if file.lower().endswith('.pdf'): | ||||||
|  |                 job = PipelineJob( | ||||||
|  |                     os.path.join(self.input_dir, file), | ||||||
|  |                     os.path.join(self.output_dir, file) | ||||||
|  |                 ) | ||||||
|  |                 jobs.append(job) | ||||||
|  |         return jobs | ||||||
|  |  | ||||||
|     def workflow(self): |     def workflow(self): | ||||||
|         if not self.jobs: |         if not self.jobs: | ||||||
|             return |             return | ||||||
|  |  | ||||||
|         ''' |         # Create output and temporary directories | ||||||
|         ' ################################################## |         for job in self.jobs: | ||||||
|         ' # setup output directory                         # |             os.mkdir(job.output_dir) | ||||||
|         ' ################################################## |             os.mkdir(job.tmp_dir) | ||||||
|         ''' |             os.mkdir(os.path.join(job.tmp_dir, 'hocr')) | ||||||
|         setup_output_directory_tasks = [] |             os.mkdir(os.path.join(job.tmp_dir, 'pdf')) | ||||||
|         for i, job in enumerate(self.jobs): |             os.mkdir(os.path.join(job.tmp_dir, 'images')) | ||||||
|             cmd = 'mkdir -p "{}"'.format(job.page_dir) |             os.mkdir(os.path.join(job.tmp_dir, 'txt')) | ||||||
|             lbl = 'setup_output_directory_-_{}'.format(i) |  | ||||||
|             task = self.addTask(command=cmd, label=lbl) |  | ||||||
|             setup_output_directory_tasks.append(task) |  | ||||||
|  |  | ||||||
|         ''' |         ''' | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ' # split input                                    # |         ' # split-input                                    # | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ''' |         ''' | ||||||
|         split_input_tasks = [] |  | ||||||
|         n_cores = max(1, int(self.getNCores() / len(self.jobs))) |  | ||||||
|         for i, job in enumerate(self.jobs): |         for i, job in enumerate(self.jobs): | ||||||
|             input_file = job.file |             self.addWorkflowTask( | ||||||
|             output_file = '{}/page-%d.tif'.format(job.page_dir) |                 'split_input_-_{}'.format(i), | ||||||
|             cmd = 'gs' |                 SplitInputWorkflow(job) | ||||||
|             cmd += ' -dBATCH' |             ) | ||||||
|             cmd += ' -dNOPAUSE' |  | ||||||
|             cmd += ' -dNumRenderingThreads={}'.format(n_cores) |  | ||||||
|             cmd += ' -dQUIET' |  | ||||||
|             cmd += ' -r300' |  | ||||||
|             cmd += ' -sDEVICE=tiff24nc' |  | ||||||
|             cmd += ' -sCompression=lzw' |  | ||||||
|             cmd += ' "-sOutputFile={}"'.format(output_file) |  | ||||||
|             cmd += ' "{}"'.format(input_file) |  | ||||||
|             deps = 'setup_output_directory_-_{}'.format(i) |  | ||||||
|             lbl = 'split_input_-_{}'.format(i) |  | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl, |  | ||||||
|                                 nCores=n_cores) |  | ||||||
|             split_input_tasks.append(task) |  | ||||||
|  |  | ||||||
|         if self.binarize: |         if self.binarize: | ||||||
|             ''' |  | ||||||
|             ' ################################################## |  | ||||||
|             ' # pre binarization                               # |  | ||||||
|             ' ################################################## |  | ||||||
|             ''' |  | ||||||
|             pre_binarization_tasks = [] |  | ||||||
|             for i, job in enumerate(self.jobs): |  | ||||||
|                 input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa |  | ||||||
|                 cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file) |  | ||||||
|                 deps = 'split_input_-_{}'.format(i) |  | ||||||
|                 lbl = 'pre_binarization_-_{}'.format(i) |  | ||||||
|                 task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|                 pre_binarization_tasks.append(task) |  | ||||||
|  |  | ||||||
|             ''' |             ''' | ||||||
|             ' ################################################## |             ' ################################################## | ||||||
|             ' # binarization                                   # |             ' # binarization                                   # | ||||||
|             ' ################################################## |             ' ################################################## | ||||||
|             ''' |             ''' | ||||||
|             binarization_tasks = [] |  | ||||||
|             n_cores = self.getNCores() |  | ||||||
|             mem_mb = self.getMemMb() |  | ||||||
|             for i, job in enumerate(self.jobs): |             for i, job in enumerate(self.jobs): | ||||||
|                 input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa |                 self.addWorkflowTask( | ||||||
|                 cmd = 'ocropus-nlbin "@{}"'.format(input_file) |                     'binarization_-_{}'.format(i), | ||||||
|                 cmd += ' --nocheck' |                     BinarizationWorkflow(job), | ||||||
|                 cmd += ' --output "{}"'.format(job.page_dir) |                     dependencies='split_input_-_{}'.format(i) | ||||||
|                 cmd += ' --parallel "{}"'.format(n_cores) |                 ) | ||||||
|                 deps = 'pre_binarization_-_{}'.format(i) |  | ||||||
|                 lbl = 'binarization_-_{}'.format(i) |  | ||||||
|                 task = self.addTask(command=cmd, dependencies=deps, label=lbl, |  | ||||||
|                                     memMb=mem_mb, nCores=n_cores) |  | ||||||
|                 binarization_tasks.append(task) |  | ||||||
|  |  | ||||||
|             ''' |  | ||||||
|             ' ################################################## |  | ||||||
|             ' # post binarization                              # |  | ||||||
|             ' ################################################## |  | ||||||
|             ''' |  | ||||||
|             post_binarization_tasks = [] |  | ||||||
|             for i, job in enumerate(self.jobs): |  | ||||||
|                 input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa |  | ||||||
|                 cmd = 'rm "{}"'.format(input_file) |  | ||||||
|                 cmd += ' && ' |  | ||||||
|                 cmd += 'cd "{}"'.format(job.page_dir) |  | ||||||
|                 cmd += ' && ' |  | ||||||
|                 cmd += 'rm *.{nrm.png,tif}' |  | ||||||
|                 cmd += ' && ' |  | ||||||
|                 cmd += 'rename \'s/^0*/page-/\' *' |  | ||||||
|                 cmd += ' && ' |  | ||||||
|                 cmd += 'cd -' |  | ||||||
|                 deps = 'binarization_-_{}'.format(i) |  | ||||||
|                 lbl = 'post_binarization_-_{}'.format(i) |  | ||||||
|                 task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|                 post_binarization_tasks.append(task) |  | ||||||
|  |  | ||||||
|         ''' |  | ||||||
|         ' ################################################## |  | ||||||
|         ' # pre ocr                                        # |  | ||||||
|         ' ################################################## |  | ||||||
|         ''' |  | ||||||
|         pre_ocr_tasks = [] |  | ||||||
|         for i, job in enumerate(self.jobs): |  | ||||||
|             input_file = os.path.join(job.output_dir, 'ocr_input_files.txt') |  | ||||||
|             cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file) |  | ||||||
|             deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i)  # noqa |  | ||||||
|             lbl = 'pre_ocr_-_{}'.format(i) |  | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|             pre_ocr_tasks.append(task) |  | ||||||
|  |  | ||||||
|         ''' |         ''' | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
| @@ -160,175 +415,117 @@ class OCRPipeline(WorkflowRunner): | |||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ''' |         ''' | ||||||
|         ocr_tasks = [] |         ocr_tasks = [] | ||||||
|         n_cores = min(4, self.getNCores()) |  | ||||||
|         mem_mb = min(n_cores * 2048, self.getMemMb()) |  | ||||||
|         for i, job in enumerate(self.jobs): |         for i, job in enumerate(self.jobs): | ||||||
|             input_file = os.path.join(job.output_dir, 'ocr_input_files.txt') |             if self.binarize: | ||||||
|             output_file_base = os.path.join(job.output_dir, job.name) |                 deps = 'binarization_-_{}'.format(i) | ||||||
|             cmd = 'tesseract "{}" "{}"'.format(input_file, output_file_base) |             else: | ||||||
|             cmd += ' -l "{}"'.format(self.lang) |                 deps = 'split_input_-_{}'.format(i) | ||||||
|             cmd += ' hocr pdf txt' |             task = self.addWorkflowTask( | ||||||
|             deps = 'pre_ocr_-_{}'.format(i) |                 'ocr_-_{}'.format(i), | ||||||
|             lbl = 'ocr_-_{}'.format(i) |                 OCRWorkflow(job, self.lang), | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, |                 dependencies=deps | ||||||
|                                 env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)}, |             ) | ||||||
|                                 label=lbl, memMb=mem_mb, nCores=n_cores) |  | ||||||
|             ocr_tasks.append(task) |             ocr_tasks.append(task) | ||||||
|  |  | ||||||
|         ''' |         ''' | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ' # post ocr                                       # |         ' # create-hocr                                    # | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ''' |         ''' | ||||||
|         post_ocr_tasks = [] |         create_hocr_tasks = [] | ||||||
|         for i, job in enumerate(self.jobs): |         for i, job in enumerate(self.jobs): | ||||||
|             input_file = os.path.join(job.output_dir, 'ocr_input_files.txt') |             task = self.addWorkflowTask( | ||||||
|             output_file_base = os.path.join(job.output_dir, job.name) |                 'create_hocr_-_{}'.format(i), | ||||||
|             cmd = 'rm "{}"'.format(input_file) |                 CreateHOCRWorkflow(job), | ||||||
|             cmd += ' && ' |                 dependencies='ocr_-_{}'.format(i) | ||||||
|             cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base)  # noqa |             ) | ||||||
|             deps = 'ocr_-_{}'.format(i) |             create_hocr_tasks.append(task) | ||||||
|             lbl = 'post_ocr_-_{}'.format(i) |  | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|             post_ocr_tasks.append(task) |  | ||||||
|  |  | ||||||
|         ''' |         ''' | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ' # hocr to tei                                    # |         ' # create-pdf                                     # | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ''' |         ''' | ||||||
|         hocr_to_tei_tasks = [] |         create_pdf_tasks = [] | ||||||
|         for i, job in enumerate(self.jobs): |         for i, job in enumerate(self.jobs): | ||||||
|             output_file_base = os.path.join(job.output_dir, job.name) |             task = self.addWorkflowTask( | ||||||
|             cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base)  # noqa |                 'create_pdf_-_{}'.format(i), | ||||||
|             deps = 'post_ocr_-_{}'.format(i) |                 CreatePDFWorkflow(job), | ||||||
|             lbl = 'hocr_to_tei_-_{}'.format(i) |                 dependencies='ocr_-_{}'.format(i) | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) |             ) | ||||||
|             hocr_to_tei_tasks.append(task) |             create_pdf_tasks.append(task) | ||||||
|  |  | ||||||
|         ''' |         ''' | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ' # zip creation                                   # |         ' # create-tei                                     # | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ''' |         ''' | ||||||
|         zip_creation_tasks = [] |         create_tei_tasks = [] | ||||||
|         if self.zip is not None: |         for i, job in enumerate(self.jobs): | ||||||
|             # zip all files |             task = self.addWorkflowTask( | ||||||
|             cmd = 'cd "{}"'.format(self.output_dir) |                 'create_tei_-_{}'.format(i), | ||||||
|             cmd += ' && ' |                 CreateTEIWorkflow(job), | ||||||
|             cmd += 'zip' |                 dependencies='create_hocr_-_{}'.format(i) | ||||||
|             cmd += ' -r' |             ) | ||||||
|             cmd += ' "{}.all.zip" .'.format(self.zip) |             create_tei_tasks.append(task) | ||||||
|             cmd += ' -x "pyflow.data*" "*tmp*"' |  | ||||||
|             cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'cd -' |  | ||||||
|             deps = hocr_to_tei_tasks |  | ||||||
|             lbl = 'zip_creation_-_all' |  | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|             zip_creation_tasks.append(task) |  | ||||||
|             # zip PDF files |  | ||||||
|             cmd = 'cd "{}"'.format(self.output_dir) |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'zip' |  | ||||||
|             cmd += ' -r' |  | ||||||
|             cmd += ' "{}.pdf.zip" .'.format(self.zip) |  | ||||||
|             cmd += ' -x "pyflow.data*" "*tmp*"' |  | ||||||
|             cmd += ' -i "*.pdf"' |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'cd -' |  | ||||||
|             deps = ocr_tasks |  | ||||||
|             lbl = 'zip_creation_-_pdf' |  | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|             zip_creation_tasks.append(task) |  | ||||||
|             # zip TXT files |  | ||||||
|             cmd = 'cd "{}"'.format(self.output_dir) |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'zip' |  | ||||||
|             cmd += ' -r' |  | ||||||
|             cmd += ' "{}.txt.zip" .'.format(self.zip) |  | ||||||
|             cmd += ' -x "pyflow.data*" "*tmp*"' |  | ||||||
|             cmd += ' -i "*.txt"' |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'cd -' |  | ||||||
|             deps = ocr_tasks |  | ||||||
|             lbl = 'zip_creation_-_txt' |  | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|             zip_creation_tasks.append(task) |  | ||||||
|             # zip XML files |  | ||||||
|             cmd = 'cd "{}"'.format(self.output_dir) |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'zip' |  | ||||||
|             cmd += ' -r' |  | ||||||
|             cmd += ' "{}.xml.zip" .'.format(self.zip) |  | ||||||
|             cmd += ' -x "pyflow.data*" "*tmp*"' |  | ||||||
|             cmd += ' -i "*.xml"' |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'cd -' |  | ||||||
|             deps = hocr_to_tei_tasks |  | ||||||
|             lbl = 'zip_creation_-_xml' |  | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|             zip_creation_tasks.append(task) |  | ||||||
|             # zip PoCo bundles |  | ||||||
|             cmd = 'cd "{}"'.format(self.output_dir) |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'zip' |  | ||||||
|             cmd += ' -r' |  | ||||||
|             cmd += ' "{}.poco.zip" .'.format(self.zip) |  | ||||||
|             cmd += ' -x "pyflow.data*" "*tmp*"' |  | ||||||
|             cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa |  | ||||||
|             cmd += ' && ' |  | ||||||
|             cmd += 'cd -' |  | ||||||
|             deps = post_ocr_tasks |  | ||||||
|             lbl = 'zip_creation_-_poco' |  | ||||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) |  | ||||||
|             zip_creation_tasks.append(task) |  | ||||||
|  |  | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # create-txt                                     # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         create_txt_tasks = [] | ||||||
|  |         for i, job in enumerate(self.jobs): | ||||||
|  |             task = self.addWorkflowTask( | ||||||
|  |                 'create_txt_-_{}'.format(i), | ||||||
|  |                 CreateTxtWorkflow(job), | ||||||
|  |                 dependencies='ocr_-_{}'.format(i) | ||||||
|  |             ) | ||||||
|  |             create_txt_tasks.append(task) | ||||||
|  |  | ||||||
| def collect_jobs(input_dir, output_dir): |         # Remove temporary directories when all tasks are completed | ||||||
|     jobs = [] |         self.waitForTasks() | ||||||
|     for file in os.listdir(input_dir): |         for job in self.jobs: | ||||||
|         if os.path.isdir(os.path.join(input_dir, file)): |             os.rmdir(job.tmp_dir) | ||||||
|             continue |  | ||||||
|         if file.lower().endswith('.pdf'): |  | ||||||
|             job = OCRPipelineJob(os.path.join(input_dir, file), |  | ||||||
|                                  os.path.join(output_dir, file)) |  | ||||||
|             jobs.append(job) |  | ||||||
|     return jobs |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def parse_args(): | def parse_args(): | ||||||
|     parser = ArgumentParser(description='OCR pipeline for PDF file processing', |     parser = ArgumentParser(description='OCR pipeline for PDF file processing') | ||||||
|                             prog='OCR pipeline') |     parser.add_argument( | ||||||
|     parser.add_argument('-i', '--input-dir', |         '-i', '--input-dir', help='Input directory', required=True) | ||||||
|                         help='Input directory', |     parser.add_argument( | ||||||
|                         required=True) |         '-o', '--output-dir', help='Output directory', required=True) | ||||||
|     parser.add_argument('-o', '--output-dir', |     parser.add_argument( | ||||||
|                         help='Output directory', |         '-l', '--language', | ||||||
|                         required=True) |         choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata') | ||||||
|     parser.add_argument('-l', '--language', |                  if x.endswith('.traineddata') and len(x) > 12], | ||||||
|                         choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))),  # noqa |         help='Language of the input (3-character ISO 639-2 language codes)', | ||||||
|                         help='Language of the input ' |         required=True | ||||||
|                              '(3-character ISO 639-2 language codes)', |     ) | ||||||
|                         required=True) |     parser.add_argument( | ||||||
|     parser.add_argument('--binarize', |         '--binarize', | ||||||
|         action='store_true', |         action='store_true', | ||||||
|                         help='Add binarization as a preprocessing step') |         help='Add binarization as a preprocessing step' | ||||||
|     parser.add_argument('--log-dir', |     ) | ||||||
|                         help='Logging directory') |     parser.add_argument( | ||||||
|     parser.add_argument('--mem-mb', |         '--log-dir', help='Logging directory (Default: --output-dir)') | ||||||
|                         help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa |     parser.add_argument( | ||||||
|                         type=int) |         '--mem-mb', | ||||||
|     parser.add_argument('--n-cores', |         help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))',  # noqa | ||||||
|  |         type=int | ||||||
|  |     ) | ||||||
|  |     parser.add_argument( | ||||||
|  |         '--n-cores', | ||||||
|         default=min(4, multiprocessing.cpu_count()), |         default=min(4, multiprocessing.cpu_count()), | ||||||
|                         help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa |         help='Number of CPU threads to be used (Default: min(4, CPU count))', | ||||||
|                         type=int) |         type=int | ||||||
|     parser.add_argument('--zip', |     ) | ||||||
|                         help='Create one zip file per filetype') |     parser.add_argument( | ||||||
|     parser.add_argument('-v', '--version', |         '-v', '--version', | ||||||
|         action='version', |         action='version', | ||||||
|         help='Returns the current version of the OCR pipeline', |         help='Returns the current version of the OCR pipeline', | ||||||
|                         version='%(prog)s {}'.format(__version__)) |         version='%(prog)s {}'.format(__version__) | ||||||
|  |     ) | ||||||
|     args = parser.parse_args() |     args = parser.parse_args() | ||||||
|  |  | ||||||
|     # Set some tricky default values and check for insufficient input |     # Set some tricky default values and check for insufficient input | ||||||
| @@ -338,20 +535,18 @@ def parse_args(): | |||||||
|         raise Exception('--n-cores must be greater or equal 1') |         raise Exception('--n-cores must be greater or equal 1') | ||||||
|     if args.mem_mb is None: |     if args.mem_mb is None: | ||||||
|         max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0]) |         max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0]) | ||||||
|         args.mem_mb = min(args.n_cores * 2048, max_mem_mb) |         args.mem_mb = min(args.n_cores * 512, max_mem_mb) | ||||||
|     if args.mem_mb < 2048: |     if args.mem_mb < 512: | ||||||
|         raise Exception('--mem-mb must be greater or equal 2048') |         raise Exception('--mem-mb must be greater or equal 512') | ||||||
|     if args.zip is not None and args.zip.lower().endswith('.zip'): |  | ||||||
|         # Remove .zip file extension if provided |  | ||||||
|         args.zip = args.zip[:-4] |  | ||||||
|         args.zip = args.zip if args.zip else 'output' |  | ||||||
|     return args |     return args | ||||||
|  |  | ||||||
|  |  | ||||||
| def main(): | def main(): | ||||||
|     args = parse_args() |     args = parse_args() | ||||||
|     ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip)  # noqa |     ocr_pipeline = MainWorkflow( | ||||||
|     retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa |         args.input_dir, args.language, args.output_dir, args.binarize) | ||||||
|  |     retval = ocr_pipeline.run( | ||||||
|  |         dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) | ||||||
|     sys.exit(retval) |     sys.exit(retval) | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										16
									
								
								wrapper/ocr
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								wrapper/ocr
									
									
									
									
									
								
							| @@ -6,9 +6,10 @@ import os | |||||||
| import subprocess | import subprocess | ||||||
| import sys | import sys | ||||||
|  |  | ||||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0' | CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0' | ||||||
| CONTAINER_INPUT_DIR = '/input' | CONTAINER_INPUT_DIR = '/input' | ||||||
| CONTAINER_OUTPUT_DIR = '/output' | CONTAINER_OUTPUT_DIR = '/output' | ||||||
|  | CONTAINER_MODELS_DIR = '/usr/local/share/tessdata' | ||||||
| CONTAINER_LOG_DIR = '/logs' | CONTAINER_LOG_DIR = '/logs' | ||||||
| UID = str(os.getuid()) | UID = str(os.getuid()) | ||||||
| GID = str(os.getgid()) | GID = str(os.getgid()) | ||||||
| @@ -16,20 +17,25 @@ GID = str(os.getgid()) | |||||||
| parser = ArgumentParser(add_help=False) | parser = ArgumentParser(add_help=False) | ||||||
| parser.add_argument('-i', '--input-dir') | parser.add_argument('-i', '--input-dir') | ||||||
| parser.add_argument('-o', '--output-dir') | parser.add_argument('-o', '--output-dir') | ||||||
|  | parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+') | ||||||
| parser.add_argument('--log-dir') | parser.add_argument('--log-dir') | ||||||
| args, remaining_args = parser.parse_known_args() | args, remaining_args = parser.parse_known_args() | ||||||
|  |  | ||||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] | cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}'] | ||||||
| if args.input_dir is not None: | if args.input_dir is not None: | ||||||
|     mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR |     mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}' | ||||||
|     cmd += ['-v', mapping] |     cmd += ['-v', mapping] | ||||||
|     remaining_args += ['-i', CONTAINER_INPUT_DIR] |     remaining_args += ['-i', CONTAINER_INPUT_DIR] | ||||||
| if args.output_dir is not None: | if args.output_dir is not None: | ||||||
|     mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR |     mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' | ||||||
|     cmd += ['-v', mapping] |     cmd += ['-v', mapping] | ||||||
|     remaining_args += ['-o', CONTAINER_OUTPUT_DIR] |     remaining_args += ['-o', CONTAINER_OUTPUT_DIR] | ||||||
|  | if args.models is not None: | ||||||
|  |     for model in args.models: | ||||||
|  |         mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa | ||||||
|  |         cmd += ['-v', mapping] | ||||||
if args.log_dir is not None:
    # Bind mount the host logging directory into the container and let the
    # pipeline inside the container log to the mounted path.
    # The f prefix is required here: without it the placeholder text itself
    # would be handed to docker as the volume mapping instead of
    # "<absolute host path>:<container log dir>".
    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
| cmd.append(CONTAINER_IMAGE) | cmd.append(CONTAINER_IMAGE) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user