mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:23:14 +00:00 
			
		
		
		
	Use language models from repository. Remove workaround for the legacy German Fraktur model.
This commit is contained in:
		
							
								
								
									
										23
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								Dockerfile
									
									
									
									
									
								
							| @@ -42,21 +42,18 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et | ||||
|     wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \ | ||||
|     apt-get update && \ | ||||
|     apt-get install -y --no-install-recommends \ | ||||
|     tesseract-ocr  && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata | ||||
|     tesseract-ocr  \ | ||||
|     tesseract-ocr-deu \ | ||||
|     tesseract-ocr-eng \ | ||||
|     tesseract-ocr-enm \ | ||||
|     tesseract-ocr-fra \ | ||||
|     tesseract-ocr-frk \ | ||||
|     tesseract-ocr-frm \ | ||||
|     tesseract-ocr-ita \ | ||||
|     tesseract-ocr-por \ | ||||
|     tesseract-ocr-spa | ||||
|  | ||||
| COPY hocrtotei /usr/local/bin | ||||
| COPY ocr /usr/local/bin | ||||
|  | ||||
| RUN mkdir /input /output && \ | ||||
|     chmod a+rw /input /output | ||||
|  | ||||
| ENTRYPOINT ["ocr"] | ||||
|   | ||||
							
								
								
									
										30
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										30
									
								
								README.md
									
									
									
									
									
								
							| @@ -60,7 +60,7 @@ If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `git | ||||
|  | ||||
| `-l languagecode` | ||||
| * Tells tesseract which language will be used. | ||||
| * options = deu (German), deu_frak (German Fraktur), eng (English), enm (Middle englisch), fra (French), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish) | ||||
| * options = deu (German), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), spa (Spanish) | ||||
| * required = True | ||||
|  | ||||
| `--keep-intermediates` | ||||
| @@ -94,31 +94,3 @@ docker run \ | ||||
|         --nCores 8 \ | ||||
|         --skip-binarisation | ||||
| ``` | ||||
|  | ||||
| # Additional language models for OCR | ||||
| Additional language models can be easily installed. Just add them to the `Dockerfile`, analogously to the existing models. | ||||
|  | ||||
| The standard language models for various languages can be found under https://github.com/tesseract-ocr/tessdata. Click on one of the languages and copy the link from the download button. The URL for Afrikaans (afr) would be for example https://github.com/tesseract-ocr/tessdata/raw/4.00/afr.traineddata. | ||||
|  | ||||
| The more accurate but slower language models can be found under https://github.com/tesseract-ocr/tessdata_best. Click on one of the languages and copy the link from the download button. The URL for Afrikaans (afr) would be for example https://github.com/tesseract-ocr/tessdata_best/raw/master/afr.traineddata. | ||||
|  | ||||
| Language models for fraktur fonts can also be found in the standard tessdata repository https://github.com/tesseract-ocr/tessdata. | ||||
|  | ||||
| The `Dockerfile` section for the language models with added language support for Afrikaans would look like this: | ||||
|  | ||||
| ``` | ||||
| RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \ | ||||
|     wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \ | ||||
|     apt-get update && \ | ||||
|     apt-get install -y --no-install-recommends tesseract-ocr && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/afr.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ | ||||
|     wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata | ||||
| ``` | ||||
|   | ||||
							
								
								
									
										8
									
								
								ocr
									
									
									
									
									
								
							
							
						
						
									
										8
									
								
								ocr
									
									
									
									
									
								
							| @@ -30,7 +30,7 @@ def parse_arguments(): | ||||
|     parser.add_argument( | ||||
|         '-l', | ||||
|         choices=[ | ||||
|             'deu', 'deu_frak', 'eng', 'enm', 'fra', 'frm', 'ita', 'por', 'spa' | ||||
|             'deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa' | ||||
|         ], | ||||
|         dest='lang', | ||||
|         required=True | ||||
| @@ -240,12 +240,6 @@ class OCRWorkflow(WorkflowRunner): | ||||
|         ' the available core number. | ||||
|         ''' | ||||
|         ocr_job_n_cores = min(4, self.n_cores) | ||||
|         ''' | ||||
|         ' WORKAROUND: Tesseract only uses one core for the deu_frak language | ||||
|         ' model, so the workflow will also only reserve one in this case. | ||||
|         ''' | ||||
|         if self.lang == "deu_frak": | ||||
|             ocr_job_n_cores = 1 | ||||
|         for index, job in enumerate(self.jobs): | ||||
|             files = os.listdir(os.path.join(job['output_dir'], 'tmp')) | ||||
|             if self.skip_binarisation: | ||||
|   | ||||
		Reference in New Issue
	
	Block a user