mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 07:32:44 +00:00 
			
		
		
		
	Codestyle enhancements
This commit is contained in:
		
							
								
								
									
										22
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										22
									
								
								README.md
									
									
									
									
									
								
							| @@ -26,8 +26,24 @@ This software implements a heavily parallelized pipeline to recognize text in PD | ||||
| 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details. | ||||
| ```bash | ||||
| cd /<my_data_location> | ||||
| ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments> | ||||
| # or | ||||
| ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments> | ||||
| # <model_code> is the model filename without the ".traineddata" suffix | ||||
| ocr \ | ||||
|   --input-dir input \ | ||||
|   --output-dir output \ | ||||
|   --model-file models/<model> | ||||
|   -m <model_code> <optional_pipeline_arguments> | ||||
| # More than one model | ||||
| ocr \ | ||||
|   --input-dir input \ | ||||
|   --output-dir output \ | ||||
|   --model-file models/<model1> | ||||
|   --model-file models/<model2> | ||||
|   -m <model1_code>+<model2_code> <optional_pipeline_arguments> | ||||
| # Instead of multiple --model-file statements, you can also use | ||||
| ocr \ | ||||
|   --input-dir input \ | ||||
|   --output-dir output \ | ||||
|   --model-file models/* | ||||
|   -m <model1_code>+<model2_code> <optional_pipeline_arguments> | ||||
| ``` | ||||
| 4. Check your results in the `/<my_data_location>/output` directory. | ||||
|   | ||||
							
								
								
									
										37
									
								
								hocr-combine
									
									
									
									
									
								
							
							
						
						
									
										37
									
								
								hocr-combine
									
									
									
									
									
								
							| @@ -1,33 +1,42 @@ | ||||
| #!/usr/bin/env python3.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| """"Combine multiple hOCR files.""" | ||||
| ''' Combine multiple hOCR files. ''' | ||||
|  | ||||
| from argparse import ArgumentParser | ||||
| from lxml import html | ||||
|  | ||||
|  | ||||
| parser = ArgumentParser(description='Combine multiple hOCR files.') | ||||
| parser.add_argument('file', help='Input file(s)', nargs='+') | ||||
| parser.add_argument('-o', '--output-file', help='Output file', required=True) | ||||
| parser.add_argument( | ||||
|     '-i', '--input-file', | ||||
|     help='Input file', | ||||
|     nargs='+', | ||||
|     required=True | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-o', '--output-file', | ||||
|     help='Output file', | ||||
|     required=True | ||||
| ) | ||||
| args = parser.parse_args() | ||||
| print(args) | ||||
|  | ||||
|  | ||||
| for file in args.file: | ||||
|     files = [] | ||||
|     if file.startswith('@'): | ||||
|         with open(file[1:], 'r') as f: | ||||
|             files += [x for x in f.read().split("\n") if x != ''] | ||||
| for input_file in args.input_file: | ||||
|     input_files = [] | ||||
|     if input_file.startswith('@'): | ||||
|         with open(input_file[1:], 'r') as f: | ||||
|             input_files += [x for x in f.read().split("\n") if x != ''] | ||||
|     else: | ||||
|         files.append(file) | ||||
| if len(files) == 0: | ||||
|         input_files.append(input_file) | ||||
| if len(input_files) == 0: | ||||
|     exit(1) | ||||
|  | ||||
|  | ||||
| hocr = html.parse(files[0]) | ||||
| hocr = html.parse(input_files[0]) | ||||
| hocr_body = hocr.find('body') | ||||
| for file in files[1:]: | ||||
|     for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'): | ||||
| for input_file in input_files[1:]: | ||||
|     for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'): | ||||
|         hocr_body.append(ocr_page) | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										23
									
								
								hocr2tei
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								hocr2tei
									
									
									
									
									
								
							| @@ -1,7 +1,7 @@ | ||||
| #!/usr/bin/env python3.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| """"Convert hOCR to TEI XML.""" | ||||
| ''' Convert hOCR to TEI XML. ''' | ||||
|  | ||||
| from argparse import ArgumentParser | ||||
| from lxml import html | ||||
| @@ -10,8 +10,15 @@ import re | ||||
|  | ||||
|  | ||||
| parser = ArgumentParser(description='Convert hOCR to TEI XML.') | ||||
| parser.add_argument('file', help='Input file') | ||||
| parser.add_argument('-o', '--output-file', help='Output file', required=True) | ||||
| parser.add_argument( | ||||
|     '-i', '--input-file', | ||||
|     help='Input file' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-o', '--output-file', | ||||
|     help='Output file', | ||||
|     required=True | ||||
| ) | ||||
| args = parser.parse_args() | ||||
|  | ||||
|  | ||||
| @@ -32,7 +39,7 @@ tei += '    </fileDesc>\n' | ||||
| tei += '  </teiHeader>\n' | ||||
| tei += '  <text>\n' | ||||
| tei += '    <body>\n' | ||||
| hocr = html.parse(args.file) | ||||
| hocr = html.parse(args.input_file) | ||||
| for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): | ||||
|     ocr_page_title_attrib = ocr_page.attrib.get('title') | ||||
|     facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1) | ||||
| @@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): | ||||
|         tei += '      <p>\n' | ||||
|         for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'): | ||||
|             tei += '        <lb/>' | ||||
|             indent = '' | ||||
|             is_first_word_in_line = True | ||||
|             for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'): | ||||
|                 if ocrx_word.text is not None: | ||||
|                     tei += indent + escape(ocrx_word.text) | ||||
|                     indent = ' ' | ||||
|                     if not is_first_word_in_line: | ||||
|                         tei += ' ' | ||||
|                     tei += escape(ocrx_word.text) | ||||
|                     is_first_word_in_line = False | ||||
|             tei += '\n' | ||||
|         tei += '      </p>\n' | ||||
| tei += '    </body>\n' | ||||
|   | ||||
							
								
								
									
										179
									
								
								ocr
									
									
									
									
									
								
							
							
						
						
									
										179
									
								
								ocr
									
									
									
									
									
								
							| @@ -8,7 +8,6 @@ __version__ = '0.1.0' | ||||
| from argparse import ArgumentParser | ||||
| from pyflow import WorkflowRunner | ||||
| import json | ||||
| import multiprocessing | ||||
| import os | ||||
| import sys | ||||
|  | ||||
| @@ -52,8 +51,8 @@ class SplitInputWorkflow(WorkflowRunner): | ||||
|         cmd += ' -dQUIET' | ||||
|         cmd += ' -r300' | ||||
|         cmd += ' -sDEVICE=png16m' | ||||
|         cmd += ' -sOutputFile="{}/page-%d.png"'.format( | ||||
|             os.path.join(self.job.tmp_dir, 'images') | ||||
|         cmd += ' -sOutputFile="{}"'.format( | ||||
|             os.path.join(self.job.tmp_dir, 'images', 'page-%d.png') | ||||
|         ) | ||||
|         cmd += ' "{}"'.format(self.job.file) | ||||
|         self.addTask( | ||||
| @@ -82,13 +81,18 @@ class BinarizationWorkflow(WorkflowRunner): | ||||
|             os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') | ||||
|         ) | ||||
|         cmd += ' && ' | ||||
|         cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa | ||||
|         cmd += 'ocropus-nlbin "@{}"'.format( | ||||
|             os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') | ||||
|         ) | ||||
|         cmd += ' --nocheck' | ||||
|         cmd += ' --output "{}"'.format( | ||||
|             os.path.join(self.job.tmp_dir, 'images')) | ||||
|             os.path.join(self.job.tmp_dir, 'images') | ||||
|         ) | ||||
|         cmd += ' --parallel "{}"'.format(n_cores) | ||||
|         cmd += ' && ' | ||||
|         cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa | ||||
|         cmd += 'rm "{}"'.format( | ||||
|             os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') | ||||
|         ) | ||||
|         ocropus_nlbin_task = self.addTask( | ||||
|             'ocropus_nlbin', | ||||
|             command=cmd, | ||||
| @@ -130,9 +134,9 @@ class BinarizationWorkflow(WorkflowRunner): | ||||
|  | ||||
|  | ||||
| class OCRWorkflow(WorkflowRunner): | ||||
|     def __init__(self, job, lang): | ||||
|     def __init__(self, job, model): | ||||
|         self.job = job | ||||
|         self.lang = lang | ||||
|         self.model = model | ||||
|  | ||||
|     def workflow(self): | ||||
|         ''' | ||||
| @@ -148,7 +152,7 @@ class OCRWorkflow(WorkflowRunner): | ||||
|                 os.path.join(self.job.tmp_dir, 'images', file), | ||||
|                 os.path.join(self.job.tmp_dir, file[:-4]) | ||||
|             ) | ||||
|             cmd += ' -l "{}"'.format(self.lang) | ||||
|             cmd += ' -l "{}"'.format(self.model) | ||||
|             cmd += ' hocr pdf txt' | ||||
|             cmd += ' || ' | ||||
|             cmd += 'echo "${?}"' | ||||
| @@ -166,6 +170,7 @@ class OCRWorkflow(WorkflowRunner): | ||||
|         ' # move_files                                     # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         move_files_tasks = [] | ||||
|         n_cores = 1 | ||||
|         mem_mb = min(128, self.getMemMb()) | ||||
|         for i, file_extension in enumerate(['hocr', 'pdf', 'txt']): | ||||
| @@ -174,24 +179,26 @@ class OCRWorkflow(WorkflowRunner): | ||||
|                 file_extension, | ||||
|                 os.path.join(self.job.tmp_dir, file_extension) | ||||
|             ) | ||||
|             self.addTask( | ||||
|             task = self.addTask( | ||||
|                 'move_{}_files'.format(file_extension), | ||||
|                 command=cmd, | ||||
|                 dependencies=tesseract_tasks, | ||||
|                 memMb=mem_mb, | ||||
|                 nCores=n_cores | ||||
|             ) | ||||
|             move_files_tasks.append(task) | ||||
|         cmd = 'mv "{}" "{}"'.format( | ||||
|             os.path.join(self.job.tmp_dir, 'images'), | ||||
|             os.path.join(self.job.output_dir) | ||||
|         ) | ||||
|         self.addTask( | ||||
|         task = self.addTask( | ||||
|             'move_image_files', | ||||
|             command=cmd, | ||||
|             dependencies=tesseract_tasks, | ||||
|             memMb=mem_mb, | ||||
|             nCores=n_cores | ||||
|         ) | ||||
|         move_files_tasks.append(task) | ||||
|  | ||||
|  | ||||
| class CreateHOCRWorkflow(WorkflowRunner): | ||||
| @@ -256,13 +263,14 @@ class CreateHOCRWorkflow(WorkflowRunner): | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         n_cores = 1 | ||||
|         mem_mb = min(512, self.getMemMb()) | ||||
|         mem_mb = min(256, self.getMemMb()) | ||||
|         cmd = 'ls -dv "{}/"* > "{}"'.format( | ||||
|             os.path.join(self.job.tmp_dir, 'hocr'), | ||||
|             os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') | ||||
|         ) | ||||
|         cmd += ' && ' | ||||
|         cmd += 'hocr-combine "@{}"'.format( | ||||
|         cmd += 'hocr-combine' | ||||
|         cmd += ' --input-file "@{}"'.format( | ||||
|             os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') | ||||
|         ) | ||||
|         cmd += ' --output-file "{}.hocr"'.format( | ||||
| @@ -301,12 +309,17 @@ class CreatePDFWorkflow(WorkflowRunner): | ||||
|         cmd += ' -dPDFSETTINGS=/ebook' | ||||
|         cmd += ' -dQUIET' | ||||
|         cmd += ' -sDEVICE=pdfwrite' | ||||
|         cmd += ' -sOutputFile="{}.pdf"'.format( | ||||
|             os.path.join(self.job.output_dir, self.job.name) | ||||
|         cmd += ' -sOutputFile="{}"'.format( | ||||
|             os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name)) | ||||
|         ) | ||||
|         cmd += ' && ' | ||||
|         cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf')) | ||||
|         self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores) | ||||
|         self.addTask( | ||||
|             'pdf_combine', | ||||
|             command=cmd, | ||||
|             memMb=mem_mb, | ||||
|             nCores=n_cores | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class CreateTEIWorkflow(WorkflowRunner): | ||||
| @@ -320,14 +333,23 @@ class CreateTEIWorkflow(WorkflowRunner): | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         n_cores = 1 | ||||
|         mem_mb = min(512, self.getMemMb()) | ||||
|         cmd = 'hocr2tei "{}.hocr"'.format( | ||||
|             os.path.join(self.job.output_dir, self.job.name) | ||||
|         mem_mb = min(256, self.getMemMb()) | ||||
|         cmd = 'hocr2tei' | ||||
|         cmd += ' --input-file "{}"'.format( | ||||
|             os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name)) | ||||
|         ) | ||||
|         cmd += ' --output-file "{}.tei.xml"'.format( | ||||
|             os.path.join(self.job.output_dir, self.job.name) | ||||
|         cmd += ' --output-file "{}"'.format( | ||||
|             os.path.join( | ||||
|                 self.job.output_dir, | ||||
|                 '{}.tei.xml'.format(self.job.name) | ||||
|             ) | ||||
|         ) | ||||
|         self.addTask( | ||||
|             'hocr2tei', | ||||
|             command=cmd, | ||||
|             memMb=mem_mb, | ||||
|             nCores=n_cores | ||||
|         ) | ||||
|         self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores) | ||||
|  | ||||
|  | ||||
| class CreatePoCoZipWorkflow(WorkflowRunner): | ||||
| @@ -354,7 +376,12 @@ class CreatePoCoZipWorkflow(WorkflowRunner): | ||||
|         cmd += 'rm -r images' | ||||
|         cmd += ' && ' | ||||
|         cmd += 'cd -' | ||||
|         task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores) | ||||
|         task = self.addTask( | ||||
|             'zip', | ||||
|             command=cmd, | ||||
|             memMb=mem_mb, | ||||
|             nCores=n_cores | ||||
|         ) | ||||
|         zip_tasks.append(task) | ||||
|  | ||||
|  | ||||
| @@ -377,13 +404,18 @@ class CreateTxtWorkflow(WorkflowRunner): | ||||
|         cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name))  # noqa | ||||
|         cmd += ' && ' | ||||
|         cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt')) | ||||
|         self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores) | ||||
|         self.addTask( | ||||
|             'txt_combine', | ||||
|             command=cmd, | ||||
|             memMb=mem_mb, | ||||
|             nCores=n_cores | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class MainWorkflow(WorkflowRunner): | ||||
|     def __init__(self, input_dir, lang, output_dir, binarize): | ||||
|     def __init__(self, input_dir, model, output_dir, binarize): | ||||
|         self.input_dir = input_dir | ||||
|         self.lang = lang | ||||
|         self.model = model | ||||
|         self.output_dir = output_dir | ||||
|         self.binarize = binarize | ||||
|         self.jobs = [] | ||||
| @@ -419,11 +451,13 @@ class MainWorkflow(WorkflowRunner): | ||||
|         ' # split-input                                    # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         split_input_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             self.addWorkflowTask( | ||||
|             task = self.addWorkflowTask( | ||||
|                 'split_input_-_{}'.format(i), | ||||
|                 SplitInputWorkflow(job) | ||||
|             ) | ||||
|             split_input_tasks.append(task) | ||||
|  | ||||
|         if self.binarize: | ||||
|             ''' | ||||
| @@ -431,12 +465,14 @@ class MainWorkflow(WorkflowRunner): | ||||
|             ' # binarization                                   # | ||||
|             ' ################################################## | ||||
|             ''' | ||||
|             binarization_tasks = [] | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 self.addWorkflowTask( | ||||
|                 task = self.addWorkflowTask( | ||||
|                     'binarization_-_{}'.format(i), | ||||
|                     BinarizationWorkflow(job), | ||||
|                     dependencies='split_input_-_{}'.format(i) | ||||
|                 ) | ||||
|                 binarization_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
| @@ -451,7 +487,7 @@ class MainWorkflow(WorkflowRunner): | ||||
|                 deps = 'split_input_-_{}'.format(i) | ||||
|             task = self.addWorkflowTask( | ||||
|                 'ocr_-_{}'.format(i), | ||||
|                 OCRWorkflow(job, self.lang), | ||||
|                 OCRWorkflow(job, self.model), | ||||
|                 dependencies=deps | ||||
|             ) | ||||
|             ocr_tasks.append(task) | ||||
| @@ -527,55 +563,80 @@ class MainWorkflow(WorkflowRunner): | ||||
|             create_txt_tasks.append(task) | ||||
|  | ||||
|         self.waitForTasks() | ||||
|         output_files = [] | ||||
|         outputs = [] | ||||
|         for job in self.jobs: | ||||
|             # Remove temporary directory | ||||
|             os.rmdir(job.tmp_dir) | ||||
|             # Track output files | ||||
|             relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir)  # noqa | ||||
|             output_files.append( | ||||
|             relative_output_dir = os.path.relpath( | ||||
|                 job.output_dir, | ||||
|                 start=self.output_dir | ||||
|             ) | ||||
|             outputs.append( | ||||
|                 { | ||||
|                     'description': 'Post correction package (.png and .hocr).', | ||||
|                     'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)),  # noqa | ||||
|                     'file': os.path.join( | ||||
|                         relative_output_dir, | ||||
|                         '{}.poco.zip'.format(job.name) | ||||
|                     ), | ||||
|                     'mimetype': 'application/zip' | ||||
|                 } | ||||
|             ) | ||||
|             output_files.append( | ||||
|             outputs.append( | ||||
|                 { | ||||
|                     'description': 'PDF file with text layer.', | ||||
|                     'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)),  # noqa | ||||
|                     'file': os.path.join( | ||||
|                         relative_output_dir, | ||||
|                         '{}.pdf'.format(job.name) | ||||
|                     ), | ||||
|                     'mimetype': 'application/pdf' | ||||
|                 } | ||||
|             ) | ||||
|             output_files.append( | ||||
|             outputs.append( | ||||
|                 { | ||||
|                     'description': 'Plain text file.', | ||||
|                     'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)),  # noqa | ||||
|                     'file': os.path.join( | ||||
|                         relative_output_dir, | ||||
|                         '{}.txt'.format(job.name) | ||||
|                     ), | ||||
|                     'mimetype': 'text/plain' | ||||
|                 } | ||||
|             ) | ||||
|             output_files.append( | ||||
|             outputs.append( | ||||
|                 { | ||||
|                     'description': 'TEI compliant XML file.', | ||||
|                     'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)),  # noqa | ||||
|                     'file': os.path.join( | ||||
|                         relative_output_dir, | ||||
|                         '{}.tei.xml'.format(job.name) | ||||
|                     ), | ||||
|                     'mimetype': 'application/tei+xml' | ||||
|                 } | ||||
|             ) | ||||
|         with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f:  # noqa | ||||
|             json.dump(output_files, f, indent=4) | ||||
|         with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f: | ||||
|             json.dump(outputs, f, indent=4) | ||||
|  | ||||
|  | ||||
| def parse_args(): | ||||
|     parser = ArgumentParser(description='Pipeline for PDF file OCR processing') | ||||
|     parser = ArgumentParser( | ||||
|         description='Pipeline for PDF file OCR processing' | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-i', '--input-dir', help='Input directory', required=True) | ||||
|         '-i', '--input-dir', | ||||
|         help='Input directory', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-o', '--output-dir', help='Output directory', required=True) | ||||
|         '-o', '--output-dir', | ||||
|         help='Output directory', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-l', '--language', | ||||
|         choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata') | ||||
|                  if x.endswith('.traineddata') and len(x) > 12], | ||||
|         help='Language of the input (3-character ISO 639-2 language codes)', | ||||
|         '-m', '--model', | ||||
|         choices=[ | ||||
|             x[:-12] for x in os.listdir('/usr/local/share/tessdata') | ||||
|             if x.endswith('.traineddata') and len(x) > 12 | ||||
|         ], | ||||
|         help='Name of the model to be used', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
| @@ -584,16 +645,19 @@ def parse_args(): | ||||
|         help='Add binarization as a preprocessing step' | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--log-dir', help='Logging directory (Default: --output-dir)') | ||||
|         '--log-dir', | ||||
|         help='Logging directory (Default: --output-dir)' | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--mem-mb', | ||||
|         help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))',  # noqa | ||||
|         help='Amount of system memory to be used ' | ||||
|              '(Default: min(--n-cores * 512, available system memory))', | ||||
|         type=int | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--n-cores', | ||||
|         default=min(4, multiprocessing.cpu_count()), | ||||
|         help='Number of CPU threads to be used (Default: min(4, CPU count))', | ||||
|         default=1, | ||||
|         help='Number of CPU threads to be used', | ||||
|         type=int | ||||
|     ) | ||||
|     parser.add_argument( | ||||
| @@ -620,10 +684,17 @@ def parse_args(): | ||||
| def main(): | ||||
|     args = parse_args() | ||||
|     main_workflow = MainWorkflow( | ||||
|         args.input_dir, args.language, args.output_dir, args.binarize) | ||||
|         args.input_dir, | ||||
|         args.model, | ||||
|         args.output_dir, | ||||
|         args.binarize | ||||
|     ) | ||||
|     main_workflow.collect_jobs() | ||||
|     retval = main_workflow.run( | ||||
|         dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) | ||||
|         dataDirRoot=args.log_dir, | ||||
|         memMb=args.mem_mb, | ||||
|         nCores=args.n_cores | ||||
|     ) | ||||
|     sys.exit(retval) | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -17,7 +17,7 @@ GID = str(os.getgid()) | ||||
| parser = ArgumentParser(add_help=False) | ||||
| parser.add_argument('-i', '--input-dir') | ||||
| parser.add_argument('-o', '--output-dir') | ||||
| parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+') | ||||
| parser.add_argument('-t', '--model-file', action='extend', nargs='+') | ||||
| parser.add_argument('--log-dir') | ||||
| args, remaining_args = parser.parse_known_args() | ||||
|  | ||||
| @@ -30,9 +30,9 @@ if args.output_dir is not None: | ||||
|     mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['-o', CONTAINER_OUTPUT_DIR] | ||||
| if args.models is not None: | ||||
|     for model in args.models: | ||||
|         mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa | ||||
| if args.model_file is not None: | ||||
|     for model_file in args.model_file: | ||||
|         mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}'  # noqa | ||||
|         cmd += ['-v', mapping] | ||||
| if args.log_dir is not None: | ||||
|     mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}' | ||||
|   | ||||
		Reference in New Issue
	
	Block a user