Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git (synced 2025-11-04 12:52:56 +00:00)

Commit: Update

Changed file: ocr (147)
@@ -36,6 +36,12 @@ def parse_arguments():
         requirements: imagemagick, ocropus, pdftoppm, pdfunite, \
         poppler-utils, pyflow, python2.7, python3.5, tesseract'
     )
+    parser.add_argument(
+        '-i',
+        dest='inputDirectory',
+        help='The input directory.',
+        required=True
+    )
     parser.add_argument(
         '-l',
         dest='lang',
@@ -43,18 +49,10 @@ def parse_arguments():
         required=True
     )
     parser.add_argument(
-        '--i',
-        default=os.path.normpath('/files_for_ocr'),
-        dest='inputDirectory',
-        help='The input directory.',
-        required=False
-    )
-    parser.add_argument(
-        '--o',
-        default=os.path.normpath('/files_from_ocr'),
+        '-o',
         dest='outputDirectory',
         help='The output directory.',
-        required=False
+        required=True
     )
     parser.add_argument(
         '--skip-binarisation',
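Note on the option changes above: '-i' and '-o' replace the old '--i'/'--o' options, and both are now required instead of defaulting to /files_for_ocr and /files_from_ocr. A minimal sketch of how the reworked options behave, limited to what the diff shows (the example invocation is made up):

    import argparse

    # Only the options touched by this commit; the real parser defines more.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='inputDirectory', help='The input directory.', required=True)
    parser.add_argument('-o', dest='outputDirectory', help='The output directory.', required=True)
    parser.add_argument('-l', dest='lang', required=True)

    # Hypothetical invocation; leaving out -i or -o now makes argparse exit with an error.
    args = parser.parse_args(['-i', '/files_for_ocr', '-o', '/files_from_ocr', '-l', 'deu_frak'])
    print(args.inputDirectory, args.outputDirectory, args.lang)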
@@ -93,7 +91,9 @@ class OCRWorkflow(WorkflowRunner):
 
     def workflow(self):
         '''
-        ' Creating output directories...
+        ' ##################################################
+        ' # Create output directories                      #
+        ' ##################################################
         '''
         create_output_directories_jobs = []
         for index, job in enumerate(self.jobs):
@@ -119,7 +119,9 @@ class OCRWorkflow(WorkflowRunner):
             )
 
         '''
-        ' Splitting...
+        ' ##################################################
+        ' # Split                                          #
+        ' ##################################################
         '''
         split_jobs = []
         split_job_nCores = min(
@@ -128,7 +130,7 @@ class OCRWorkflow(WorkflowRunner):
         )
         for index, job in enumerate(self.jobs):
             if job['filename'].endswith(('.tif', '.tiff')):
-                cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s"/page-%%d.tif' % (
+                cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s/page-%%d.tif"' % (
                     job['path'],
                     os.path.join(job['output_dir'], 'tmp')
                 )
@@ -148,7 +150,15 @@ class OCRWorkflow(WorkflowRunner):
 
         if not self.skipBinarisation:
             '''
-            ' Binarising...
+            ' The binarisation_jobs are created based of the output files of
+            ' the split_jobs. So wait until they are finished.
+            '''
+            self.waitForTasks()
+
+            '''
+            ' ##################################################
+            ' # Binarise                                       #
+            ' ##################################################
             '''
             binarisation_jobs = []
             '''
@@ -158,10 +168,17 @@ class OCRWorkflow(WorkflowRunner):
             '''
             binarisation_job_nCores = min(4, self.nCores)
             for index, job in enumerate(self.jobs):
-                cmd = 'ls --quoting-style=shell-escape -v "%s"/*.tif | xargs ocropus-nlbin --output "%s" --parallel "%i"' % (
+                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+                files = filter(lambda x: x.endswith('.tif'), files)
+                files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+                files = map(
+                    lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
+                    files
+                )
+                cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
                     os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp'),
-                    binarisation_job_nCores
+                    binarisation_job_nCores,
+                    ' '.join(files)
                 )
                 binarisation_jobs.append(
                     self.addTask(
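Note on the binarisation change above: the 'ls --quoting-style=shell-escape -v | xargs' pipeline is replaced by building the ocropus-nlbin argument list in Python: list the tmp directory, keep the .tif pages, sort them by the number in their name, and shell-quote each path. filter() and map() are used as if they returned lists, which is Python 2 behaviour. A rough, self-contained sketch of the same pattern:

    import os
    import re

    def numbered_tif_args(tmp_dir):
        # Collect page images, order them numerically and quote each path
        # (sketch; the diff itself uses filter()/map(), which only behave
        # like lists under Python 2).
        files = [f for f in os.listdir(tmp_dir) if f.endswith('.tif')]
        files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
        return ' '.join('"%s"' % os.path.join(tmp_dir, f) for f in files)

    # Hypothetical usage:
    # cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (tmp_dir, 4, numbered_tif_args(tmp_dir))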
@@ -173,25 +190,30 @@ class OCRWorkflow(WorkflowRunner):
                 )
 
             '''
-            ' Normalising file names from binarisation...
+            ' The post_binarisation_jobs are created based of the output files
+            ' of the binarisation_jobs. So wait until they are finished.
             '''
             self.waitForTasks()
+
+            '''
+            ' ##################################################
+            ' # Normalise file names from binarisation         #
+            ' ##################################################
+            '''
             post_binarisation_jobs = []
             for index, job in enumerate(self.jobs):
                 number = 0
                 files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
                 files = filter(lambda x: x.endswith('.bin.png'), files)
-                files = sorted(
-                    files,
-                    key=lambda x: int(re.search(r'\d+', x).group(0))
-                )
+                files.sort()
                 for file in files:
                     cmd = 'mv "%s" "%s"' % (
                         os.path.join(job['output_dir'], 'tmp', file),
-                        os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % (
-                            int(file.split('.', 1)[0]),
-                            file.split('.', 1)[1])
-                        ),
+                        os.path.join(
+                            job['output_dir'],
+                            'tmp',
+                            'page-%i.bin.png' % (int(file.split('.', 1)[0]))
+                        )
                     )
                     post_binarisation_jobs.append(
                         self.addTask(
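Note on the rename step above: the normalised target name is now always page-<n>.bin.png, rebuilt from the number before the first dot, instead of reusing whatever followed the first dot in the original name; the file list is also sorted plainly rather than by extracted number. A small sketch of the target path (the file and directory names are made up):

    import os

    output_dir = '/files_from_ocr/scan.tif'   # hypothetical job output directory
    file = '0001.bin.png'                     # typical ocropus-nlbin output name

    target = os.path.join(
        output_dir,
        'tmp',
        'page-%i.bin.png' % (int(file.split('.', 1)[0]))
    )
    print(target)   # /files_from_ocr/scan.tif/tmp/page-1.bin.png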
@@ -206,9 +228,17 @@ class OCRWorkflow(WorkflowRunner):
                     number += 1
 
         '''
-        ' Performing OCR...
+        ' The ocr_jobs are created based of the output files of either the
+        ' split_jobs or post_binarisation_jobs. So wait until they are
+        ' finished.
         '''
         self.waitForTasks()
+
+        '''
+        ' ##################################################
+        ' # Optical Character Recognition                  #
+        ' ##################################################
+        '''
         ocr_jobs = []
         '''
         ' Tesseract runs fastest with four cores. So we run it with either four
@@ -223,19 +253,20 @@ class OCRWorkflow(WorkflowRunner):
         if self.lang == "deu_frak":
             ocr_job_nCores = 1
         for index, job in enumerate(self.jobs):
-            number = 0
             files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
             if self.skipBinarisation:
                 files = filter(lambda x: x.endswith('.tif'), files)
             else:
                 files = filter(lambda x: x.endswith('.bin.png'), files)
-            files = sorted(
-                files,
-                key=lambda x: int(re.search(r'\d+', x).group(0))
+            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+            files = map(
+                lambda x: os.path.join(job['output_dir'], 'tmp', x),
+                files
             )
+            number = 0
             for file in files:
                 cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
-                    os.path.join(job['output_dir'], 'tmp', file),
+                    file,
                     os.path.join(
                         job['output_dir'],
                         'tmp',
@@ -253,7 +284,6 @@ class OCRWorkflow(WorkflowRunner):
                         ),
                         post_binarisation_jobs
                     )
-                print(ocr_job_dependencies)
                 ocr_jobs.append(
                     self.addTask(
                         command=cmd,
@@ -265,7 +295,15 @@ class OCRWorkflow(WorkflowRunner):
                 number += 1
 
         '''
-        ' Creating TEI P5 files...
+        ' The following jobs are created based of the output files of the
+        ' ocr_jobs. So wait until they are finished.
+        '''
+        self.waitForTasks()
+
+        '''
+        ' ##################################################
+        ' # Create TEI P5 files                            #
+        ' ##################################################
         '''
         hocr_to_tei_jobs = []
         for index, job in enumerate(self.jobs):
@@ -273,7 +311,7 @@ class OCRWorkflow(WorkflowRunner):
                 os.path.join(job['output_dir'], 'tmp'),
                 os.path.join(
                     job['output_dir'],
-                    job['filename'].rsplit('.', 1)[0] + '.xml'
+                    os.path.join(job['output_dir'], job['name'] + '.xml')
                 )
             )
             hocr_to_tei_jobs.append(
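Note on the new TEI output path above: job['name'] + '.xml' is already joined with job['output_dir'] in the inner os.path.join, and that result is passed to the outer os.path.join again. With an absolute output_dir the inner (absolute) path simply wins, so the outer prefix has no effect; a quick sketch of that os.path.join behaviour (paths hypothetical):

    import os

    output_dir = '/files_from_ocr/scan.tif'
    name = 'scan'

    path = os.path.join(output_dir, os.path.join(output_dir, name + '.xml'))
    print(path)   # /files_from_ocr/scan.tif/scan.xml -- the absolute inner join
                  # discards the outer output_dir component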
@@ -288,15 +326,24 @@ class OCRWorkflow(WorkflowRunner):
             )
 
         '''
-        ' Merging PDF files...
+        ' ##################################################
+        ' # Merge PDF files                                #
+        ' ##################################################
         '''
         pdf_merge_jobs = []
         for index, job in enumerate(self.jobs):
-            cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % (
-                os.path.join(job['output_dir'], 'tmp'),
+            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+            files = filter(lambda x: x.endswith('.pdf'), files)
+            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+            files = map(
+                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
+                files
+            )
+            cmd = 'pdfunite %s "%s"' % (
+                ' '.join(files),
                 os.path.join(
                     job['output_dir'],
-                    job['filename'].rsplit('.', 1)[0] + '.pdf'
+                    os.path.join(job['output_dir'], job['name'] + '.pdf')
                 )
             )
             pdf_merge_jobs.append(
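Note on the PDF merge above: it follows the same list-building pattern as the binarisation step, and pdfunite takes all page PDFs first with the merged output file as its last argument. Sketch of the resulting command string (paths hypothetical):

    import os
    import re

    def pdfunite_cmd(tmp_dir, out_pdf):
        # 'pdfunite "page-1.pdf" ... "page-n.pdf" "merged.pdf"' (sketch).
        files = [f for f in os.listdir(tmp_dir) if f.endswith('.pdf')]
        files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
        quoted = ' '.join('"%s"' % os.path.join(tmp_dir, f) for f in files)
        return 'pdfunite %s "%s"' % (quoted, out_pdf)

    # print(pdfunite_cmd('/files_from_ocr/scan.tif/tmp', '/files_from_ocr/scan.tif/scan.pdf'))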
@@ -311,15 +358,24 @@ class OCRWorkflow(WorkflowRunner):
             )
 
         '''
-        ' Merging text files...
+        ' ##################################################
+        ' # Merge text files                               #
+        ' ##################################################
         '''
         txt_merge_jobs = []
         for index, job in enumerate(self.jobs):
-            cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % (
-                os.path.join(job['output_dir'], 'tmp'),
+            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+            files = filter(lambda x: x.endswith('.txt'), files)
+            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+            files = map(
+                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
+                files
+            )
+            cmd = 'cat %s > "%s"' % (
+                ' '.join(files),
                 os.path.join(
                     job['output_dir'],
-                    job['filename'].rsplit('.', 1)[0] + '.txt'
+                    os.path.join(job['output_dir'], job['name'] + '.txt')
                 )
             )
             txt_merge_jobs.append(
@@ -334,7 +390,9 @@ class OCRWorkflow(WorkflowRunner):
             )
 
         '''
-        ' Cleanup...
+        ' ##################################################
+        ' # Cleanup                                        #
+        ' ##################################################
         '''
         cleanup_jobs = []
         if self.keepIntermediates:
@@ -407,6 +465,7 @@ def analyze_jobs(inputDirectory, outputDirectory):
             jobs.append(
                 {
                     'filename': file,
+                    'name': file.rsplit('.', 1)[0],
                     'output_dir': os.path.join(outputDirectory, file),
                     'path': os.path.join(inputDirectory, file)
                 }
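Note on the new 'name' field above: it is the file name with only the last extension stripped, and the merge and TEI steps use it for the final .pdf/.txt/.xml names instead of re-splitting job['filename'] in each place. Sketch with a made-up file name:

    import os

    file = 'scan.001.tif'                 # hypothetical input file
    inputDirectory = '/files_for_ocr'
    outputDirectory = '/files_from_ocr'

    job = {
        'filename': file,
        'name': file.rsplit('.', 1)[0],   # 'scan.001' -- only '.tif' is stripped
        'output_dir': os.path.join(outputDirectory, file),
        'path': os.path.join(inputDirectory, file)
    }
    print(job['name'], job['output_dir'])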