mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-20 18:52:06 +00:00 
			
		
		
		
	Compare commits
	
		
			40 Commits
		
	
	
		
			1.0
			...
			c057d324cf
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | c057d324cf | ||
|  | f51a8c4546 | ||
|  | c640d9743f | ||
|  | e3fd679b38 | ||
|  | 8a3816121c | ||
|  | e1b78b6ba4 | ||
|  | a0760487ae | ||
|  | a798457c43 | ||
|  | e2da0fb839 | ||
|  | e78f667438 | ||
|  | 41f70da8eb | ||
|  | 6db7f70446 | ||
|  | 947658a7d8 | ||
|  | acbf61be05 | ||
|  | 104598039e | ||
|  | da29659a9b | ||
|  | 613bceb4ff | ||
|  | ca7df6d0ed | ||
|  | 07635dcdfa | ||
|  | c0069d5453 | ||
|  | e941f64ee4 | ||
|  | cb68d6de2d | ||
|  | 4b84488fe6 | ||
|  | 7d52ad9f68 | ||
|  | ac4b5c2fd8 | ||
|  | 6d90d43699 | ||
|  | 4bd0d3bb01 | ||
|  | 15061bfaaf | ||
|  | 7cc8ebd666 | ||
|  | 82285a8e6c | ||
|  | 7322a5bc7c | ||
|  | 2b63ba9e59 | ||
|  | aee9628e5e | ||
|  | ec5b4eb521 | ||
|  | b77ca5914f | ||
|  | 018939ae55 | ||
|  | 64fe706126 | ||
|  | a75b32ca1d | ||
|  | 364e3d626d | ||
|  | 36a86887b0 | 
| @@ -1,44 +1,68 @@ | |||||||
| image: docker:stable | image: docker:19.03.13 | ||||||
|  |  | ||||||
| services: | services: | ||||||
|   - docker:stable-dind |   - docker:19.03.13-dind | ||||||
|  |  | ||||||
| variables: |  | ||||||
|   DOCKER_DRIVER: overlay2 |  | ||||||
|  |  | ||||||
| stages: | stages: | ||||||
|   - build |   - build | ||||||
|   - push |   - push | ||||||
|  |  | ||||||
| before_script: | variables: | ||||||
|   - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY |   DOCKER_TLS_CERTDIR: "/certs" | ||||||
|  |   INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA | ||||||
|  |  | ||||||
| Build: | .reg_setup: | ||||||
|  |   before_script: | ||||||
|  |     - apk add --no-cache curl | ||||||
|  |     - curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg | ||||||
|  |     - echo "$REG_SHA256  /usr/local/bin/reg" | sha256sum -c - | ||||||
|  |     - chmod a+x /usr/local/bin/reg | ||||||
|  |   variables: | ||||||
|  |     REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228 | ||||||
|  |     REG_VERSION: 0.16.1 | ||||||
|  |  | ||||||
|  | build_image: | ||||||
|   script: |   script: | ||||||
|     - docker build --pull -t $CI_REGISTRY_IMAGE:tmp . |     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||||
|     - docker push $CI_REGISTRY_IMAGE:tmp |     - docker build -t $INTERMEDIATE_IMAGE_TAG . | ||||||
|  |     - docker push $INTERMEDIATE_IMAGE_TAG | ||||||
|   stage: build |   stage: build | ||||||
|   tags: |   tags: | ||||||
|   - docker |     - docker | ||||||
|  |  | ||||||
| Push latest: | push_master: | ||||||
|  |   extends: | ||||||
|  |     - .reg_setup | ||||||
|   only: |   only: | ||||||
|     - master |     - master | ||||||
|   script: |   script: | ||||||
|     - docker pull $CI_REGISTRY_IMAGE:tmp |     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||||
|     - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest |     - docker pull $INTERMEDIATE_IMAGE_TAG | ||||||
|     - docker push $CI_REGISTRY_IMAGE:latest |     - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG | ||||||
|  |     - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG | ||||||
|  |     - docker push $IMAGE_TAG | ||||||
|   stage: push |   stage: push | ||||||
|   tags: |   tags: | ||||||
|   - docker |     - docker | ||||||
|  |   variables: | ||||||
|  |     IMAGE_TAG: $CI_REGISTRY_IMAGE:latest | ||||||
|  |  | ||||||
| Push tag: | push_other: | ||||||
|  |   extends: | ||||||
|  |     - .reg_setup | ||||||
|  |   except: | ||||||
|  |     - master | ||||||
|   only: |   only: | ||||||
|  |     - branches | ||||||
|     - tags |     - tags | ||||||
|   script: |   script: | ||||||
|     - docker pull $CI_REGISTRY_IMAGE:tmp |     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||||
|     - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME |     - docker pull $INTERMEDIATE_IMAGE_TAG | ||||||
|     - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME |     - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG | ||||||
|  |     - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG | ||||||
|  |     - docker push $IMAGE_TAG | ||||||
|   stage: push |   stage: push | ||||||
|   tags: |   tags: | ||||||
|   - docker |     - docker | ||||||
|  |   variables: | ||||||
|  |     IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME | ||||||
|   | |||||||
							
								
								
									
										116
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										116
									
								
								Dockerfile
									
									
									
									
									
								
							| @@ -1,73 +1,85 @@ | |||||||
| FROM debian:9-slim | FROM debian:buster-slim | ||||||
|  |  | ||||||
|  |  | ||||||
| # Define image metadata | LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>" | ||||||
| LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ENV LANG=C.UTF-8 | ENV LANG=C.UTF-8 | ||||||
|  |  | ||||||
|  |  | ||||||
| # Install prerequisites |  | ||||||
| RUN apt-get update \ | RUN apt-get update \ | ||||||
|  && apt-get install -y --no-install-recommends \ |  && apt-get install --no-install-recommends --yes \ | ||||||
|       apt-transport-https \ |       ghostscript \ | ||||||
|       ca-certificates \ |       procps \ | ||||||
|       gnupg2 \ |       python3.7 \ | ||||||
|       imagemagick \ |       python3-pip \ | ||||||
|       poppler-utils \ |       rename \ | ||||||
|       python2.7 \ |  | ||||||
|       python3.5 \ |  | ||||||
|       wget \ |       wget \ | ||||||
|       zip \ |       zip \ | ||||||
|  && rm -rf /var/lib/apt/lists/* |  && python3 -m pip install lxml | ||||||
|  |  | ||||||
| ENV OCROPY_VERSION 1.3.3 | # Install the OCR pipeline and its dependencies # | ||||||
| ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" . | ## Install pyFlow ## | ||||||
| RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \ | ENV PYFLOW_VERSION=1.1.20 | ||||||
|  | RUN wget --no-check-certificate --quiet \ | ||||||
|  |       "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \ | ||||||
|  |  && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ | ||||||
|  |  && cd "pyflow-${PYFLOW_VERSION}" \ | ||||||
|  |  && apt-get install --no-install-recommends --yes \ | ||||||
|  |       python2.7 \ | ||||||
|  |  && python2.7 setup.py build install \ | ||||||
|  |  && cd - > /dev/null \ | ||||||
|  |  && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ## Install ocropy ## | ||||||
|  | ENV OCROPY_VERSION=1.3.3 | ||||||
|  | RUN wget --no-check-certificate --quiet \ | ||||||
|  |       "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \ | ||||||
|  |  && tar -xzf "v${OCROPY_VERSION}.tar.gz" \ | ||||||
|  && cd "ocropy-${OCROPY_VERSION}" \ |  && cd "ocropy-${OCROPY_VERSION}" \ | ||||||
|  && apt-get update \ |  && apt-get install --no-install-recommends --yes \ | ||||||
|  && apt-get install -y --no-install-recommends \ |       python2.7 \ | ||||||
|       python-pil \ |       python-pil \ | ||||||
|       python-tk \ |       python-tk \ | ||||||
|       $(cat PACKAGES) \ |       $(cat PACKAGES) \ | ||||||
|  && rm -rf /var/lib/apt/lists/* \ |  | ||||||
|  && python2.7 setup.py install \ |  && python2.7 setup.py install \ | ||||||
|  && cd .. \ |  && cd - > /dev/null \ | ||||||
|  && rm -rf \ |  && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz" | ||||||
|       "ocropy-${OCROPY_VERSION}" \ |  | ||||||
|       "v${OCROPY_VERSION}.tar.gz" |  | ||||||
|  |  | ||||||
| ENV PYFLOW_VERSION=1.1.20 |  | ||||||
| ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" . |  | ||||||
| RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ |  | ||||||
|  && cd "pyflow-${PYFLOW_VERSION}" \ |  | ||||||
|  && python2.7 setup.py build install \ |  | ||||||
|  && cd .. \ |  | ||||||
|  && rm -rf \ |  | ||||||
|       "pyflow-${PYFLOW_VERSION}" \ |  | ||||||
|       "pyflow-${PYFLOW_VERSION}.tar.gz" |  | ||||||
|  |  | ||||||
| RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \ |  | ||||||
|  && wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \ |  | ||||||
|  && apt-get update \ |  | ||||||
|  && apt-get install -y --no-install-recommends \ |  | ||||||
|       tesseract-ocr  \ |  | ||||||
|       tesseract-ocr-deu \ |  | ||||||
|       tesseract-ocr-eng \ |  | ||||||
|       tesseract-ocr-enm \ |  | ||||||
|       tesseract-ocr-fra \ |  | ||||||
|       tesseract-ocr-frk \ |  | ||||||
|       tesseract-ocr-frm \ |  | ||||||
|       tesseract-ocr-ita \ |  | ||||||
|       tesseract-ocr-por \ |  | ||||||
|       tesseract-ocr-spa \ |  | ||||||
|  && rm -rf /var/lib/apt/lists/* |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Install OCR pipeline | ## Install Tesseract OCR ## | ||||||
| COPY hocrtotei /usr/local/bin | ENV TESSERACT_VERSION=5.0.0 | ||||||
| COPY ocr /usr/local/bin | RUN wget --no-check-certificate --quiet \ | ||||||
|  |       "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \ | ||||||
|  |  && tar -xzf "${TESSERACT_VERSION}.tar.gz" \ | ||||||
|  |  && cd "tesseract-${TESSERACT_VERSION}" \ | ||||||
|  |  && apt-get install --no-install-recommends --yes \ | ||||||
|  |       autoconf \ | ||||||
|  |       automake \ | ||||||
|  |       g++ \ | ||||||
|  |       libjpeg62-turbo-dev \ | ||||||
|  |       libleptonica-dev \ | ||||||
|  |       libtiff5-dev \ | ||||||
|  |       libtool \ | ||||||
|  |       libpng-dev \ | ||||||
|  |       make \ | ||||||
|  |       pkg-config \ | ||||||
|  |       zlib1g-dev \ | ||||||
|  |  && ./autogen.sh \ | ||||||
|  |  && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \ | ||||||
|  |  && make \ | ||||||
|  |  && make install \ | ||||||
|  |  && ldconfig \ | ||||||
|  |  && cd - > /dev/null \ | ||||||
|  |  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz" | ||||||
|  |  | ||||||
|  |  | ||||||
|  | RUN rm -r /var/lib/apt/lists/* | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ## Install Pipeline ## | ||||||
|  | COPY hocr2tei hocr-combine ocr /usr/local/bin/ | ||||||
|  |  | ||||||
|  |  | ||||||
| ENTRYPOINT ["ocr"] | ENTRYPOINT ["ocr"] | ||||||
|   | |||||||
							
								
								
									
										21
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | |||||||
|  | MIT License | ||||||
|  |  | ||||||
|  | Copyright (c) 2021 Bielefeld University - CRC 1288 - INF | ||||||
|  |  | ||||||
|  | Permission is hereby granted, free of charge, to any person obtaining a copy | ||||||
|  | of this software and associated documentation files (the "Software"), to deal | ||||||
|  | in the Software without restriction, including without limitation the rights | ||||||
|  | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||||||
|  | copies of the Software, and to permit persons to whom the Software is | ||||||
|  | furnished to do so, subject to the following conditions: | ||||||
|  |  | ||||||
|  | The above copyright notice and this permission notice shall be included in all | ||||||
|  | copies or substantial portions of the Software. | ||||||
|  |  | ||||||
|  | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||||
|  | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||||
|  | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||||||
|  | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||||
|  | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||||||
|  | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||||
|  | SOFTWARE. | ||||||
							
								
								
									
										123
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										123
									
								
								README.md
									
									
									
									
									
								
							| @@ -1,96 +1,33 @@ | |||||||
| # OCR | # OCR - Optical Character Recognition | ||||||
|  |  | ||||||
| ## Build image | This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; a convenient wrapper script is provided for that purpose. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed. | ||||||
|  |  | ||||||
| 1. Clone this repository and navigate into it: | ## Software used in this pipeline implementation | ||||||
| ``` |  | ||||||
| git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr | - Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian | ||||||
| ``` |   - Software from Debian Buster's free repositories | ||||||
|  | - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3 | ||||||
| 2. Build image: | - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 | ||||||
| ``` | - Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0 | ||||||
| docker build -t sfb1288inf/ocr:latest . |  | ||||||
| ``` | ## Installation | ||||||
|  |  | ||||||
| Alternatively build from the GitLab repository without cloning: | 1. Install Docker and Python 3. | ||||||
|  | 2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git` | ||||||
| 1. Build image: | 3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr` | ||||||
| ``` | 4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`. | ||||||
| docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git | 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`. | ||||||
| ``` | 6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`. | ||||||
|  |  | ||||||
| ## Download prebuilt image | ## Use the Pipeline | ||||||
|  |  | ||||||
| The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers. | 1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language. | ||||||
|  | 2. Clear your `/<my_data_location>/output` directory. | ||||||
| 1. Download image: | 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details. | ||||||
| ``` | ```bash | ||||||
| docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest | cd /<my_data_location> | ||||||
| ``` | ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments> | ||||||
|  | # or | ||||||
| ## Run | ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments> | ||||||
|  |  | ||||||
| 1. Create input and output directories for the OCR software: |  | ||||||
| ``` |  | ||||||
| mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr |  | ||||||
| ``` |  | ||||||
|  |  | ||||||
| 2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language. |  | ||||||
|  |  | ||||||
| 3. Start the OCR process. |  | ||||||
| ``` |  | ||||||
| docker run \ |  | ||||||
|     --rm \ |  | ||||||
|     -it \ |  | ||||||
|     -u $(id -u $USER):$(id -g $USER) \ |  | ||||||
|     -v /<mydatalocation>/files_for_ocr:/input \ |  | ||||||
|     -v /<mydatalocation>/files_from_ocr:/output \ |  | ||||||
|     sfb1288inf/ocr:latest \ |  | ||||||
|         -i /input \ |  | ||||||
|         -l <languagecode> \ |  | ||||||
|         -o /output |  | ||||||
| ``` |  | ||||||
| The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part. |  | ||||||
|  |  | ||||||
| If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`. |  | ||||||
|  |  | ||||||
| 4. Check your results in the `/<mydatalocation>/files_from_ocr` directory. |  | ||||||
|  |  | ||||||
| ### OCR arguments |  | ||||||
|  |  | ||||||
| `-l languagecode` |  | ||||||
| * Tells tesseract which language will be used. |  | ||||||
| * options = deu (German), eng (English), enm (Middle englisch), fra (French), frk (German Fraktur), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish) |  | ||||||
| * required = True |  | ||||||
|  |  | ||||||
| `--keep-intermediates` |  | ||||||
| * If set, all intermediate files created during the OCR process will be |  | ||||||
| kept. |  | ||||||
| * default = False |  | ||||||
| * required = False |  | ||||||
|  |  | ||||||
| `--nCores corenumber` |  | ||||||
| * Sets the number of CPU cores being used during the OCR process. |  | ||||||
| * default = min(4, multiprocessing.cpu_count()) |  | ||||||
| * required = False |  | ||||||
|  |  | ||||||
| `--skip-binarisation` |  | ||||||
| * Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used. |  | ||||||
| * default = False |  | ||||||
|  |  | ||||||
| Example with all arguments used: |  | ||||||
| ``` |  | ||||||
| docker run \ |  | ||||||
|     --rm \ |  | ||||||
|     -it \ |  | ||||||
|     -u $(id -u $USER):$(id -g $USER) \ |  | ||||||
|     -v "$HOME"/ocr/files_for_ocr:/input \ |  | ||||||
|     -v "$HOME"/ocr/files_from_ocr:/output \ |  | ||||||
|     sfb1288inf/ocr:latest \ |  | ||||||
|         -i /input \ |  | ||||||
|         -l eng \ |  | ||||||
|         -o /output \ |  | ||||||
|         --keep_intermediates \ |  | ||||||
|         --nCores 8 \ |  | ||||||
|         --skip-binarisation |  | ||||||
| ``` | ``` | ||||||
|  | 4. Check your results in the `/<my_data_location>/output` directory. | ||||||
|   | |||||||
							
								
								
									
										35
									
								
								hocr-combine
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										35
									
								
								hocr-combine
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,35 @@ | |||||||
|  | #!/usr/bin/env python3.7 | ||||||
|  | # coding=utf-8 | ||||||
|  |  | ||||||
|  | """Combine multiple hOCR files.""" | ||||||
|  |  | ||||||
|  | from argparse import ArgumentParser | ||||||
|  | from lxml import html | ||||||
|  |  | ||||||
|  |  | ||||||
|  | parser = ArgumentParser(description='Combine multiple hOCR files.') | ||||||
|  | parser.add_argument('file', help='Input file(s)', nargs='+') | ||||||
|  | parser.add_argument('-o', '--output-file', help='Output file', required=True) | ||||||
|  | args = parser.parse_args() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | for file in args.file: | ||||||
|  |     files = [] | ||||||
|  |     if file.startswith('@'): | ||||||
|  |         with open(file[1:], 'r') as f: | ||||||
|  |             files += [x for x in f.read().split("\n") if x != ''] | ||||||
|  |     else: | ||||||
|  |         files.append(file) | ||||||
|  | if len(files) == 0: | ||||||
|  |     exit(1) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | hocr = html.parse(files[0]) | ||||||
|  | hocr_body = hocr.find('body') | ||||||
|  | for file in files[1:]: | ||||||
|  |     for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'): | ||||||
|  |         hocr_body.append(ocr_page) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | with open(args.output_file, 'wb') as f: | ||||||
|  |     hocr.write(f, encoding='UTF-8', method='html') | ||||||
							
								
								
									
										58
									
								
								hocr2tei
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										58
									
								
								hocr2tei
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,58 @@ | |||||||
|  | #!/usr/bin/env python3.7 | ||||||
|  | # coding=utf-8 | ||||||
|  |  | ||||||
|  | """Convert hOCR to TEI XML.""" | ||||||
|  |  | ||||||
|  | from argparse import ArgumentParser | ||||||
|  | from lxml import html | ||||||
|  | from xml.sax.saxutils import escape | ||||||
|  | import re | ||||||
|  |  | ||||||
|  |  | ||||||
|  | parser = ArgumentParser(description='Convert hOCR to TEI XML.') | ||||||
|  | parser.add_argument('file', help='Input file') | ||||||
|  | parser.add_argument('-o', '--output-file', help='Output file', required=True) | ||||||
|  | args = parser.parse_args() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | tei = '' | ||||||
|  | tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n' | ||||||
|  | tei += '  <teiHeader>\n' | ||||||
|  | tei += '    <fileDesc>\n' | ||||||
|  | tei += '      <titleStmt>\n' | ||||||
|  | tei += '        <title></title>\n' | ||||||
|  | tei += '      </titleStmt>\n' | ||||||
|  | tei += '      <publicationStmt>\n' | ||||||
|  | tei += '        <p></p>\n' | ||||||
|  | tei += '      </publicationStmt>\n' | ||||||
|  | tei += '      <sourceDesc>\n' | ||||||
|  | tei += '        <p></p>\n' | ||||||
|  | tei += '      </sourceDesc>\n' | ||||||
|  | tei += '    </fileDesc>\n' | ||||||
|  | tei += '  </teiHeader>\n' | ||||||
|  | tei += '  <text>\n' | ||||||
|  | tei += '    <body>\n' | ||||||
|  | hocr = html.parse(args.file) | ||||||
|  | for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): | ||||||
|  |     ocr_page_title_attrib = ocr_page.attrib.get('title') | ||||||
|  |     facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1) | ||||||
|  |     page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1) | ||||||
|  |     tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n' | ||||||
|  |     for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'): | ||||||
|  |         tei += '      <p>\n' | ||||||
|  |         for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'): | ||||||
|  |             tei += '        <lb/>' | ||||||
|  |             indent = '' | ||||||
|  |             for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'): | ||||||
|  |                 if ocrx_word.text is not None: | ||||||
|  |                     tei += indent + escape(ocrx_word.text) | ||||||
|  |                     indent = ' ' | ||||||
|  |             tei += '\n' | ||||||
|  |         tei += '      </p>\n' | ||||||
|  | tei += '    </body>\n' | ||||||
|  | tei += '  </text>\n' | ||||||
|  | tei += '</TEI>\n' | ||||||
|  |  | ||||||
|  |  | ||||||
|  | with open(args.output_file, 'w') as f: | ||||||
|  |     f.write(tei) | ||||||
							
								
								
									
										58
									
								
								hocrtotei
									
									
									
									
									
								
							
							
						
						
									
										58
									
								
								hocrtotei
									
									
									
									
									
								
							| @@ -1,58 +0,0 @@ | |||||||
| #!/usr/bin/env python3.5 |  | ||||||
| # coding=utf-8 |  | ||||||
|  |  | ||||||
| from xml.sax.saxutils import escape |  | ||||||
| import argparse |  | ||||||
| import xml.etree.ElementTree as ET |  | ||||||
|  |  | ||||||
| parser = argparse.ArgumentParser( |  | ||||||
|     description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.' |  | ||||||
| ) |  | ||||||
| parser.add_argument( |  | ||||||
|     'i', |  | ||||||
|     metavar='hOCR-sourcefile', |  | ||||||
|     nargs='+' |  | ||||||
| ) |  | ||||||
| parser.add_argument( |  | ||||||
|     'o', |  | ||||||
|     metavar='TEI-destfile', |  | ||||||
| ) |  | ||||||
| args = parser.parse_args() |  | ||||||
|  |  | ||||||
| output_file = open(args.o, 'w') |  | ||||||
|  |  | ||||||
| output_file.write( |  | ||||||
|       '<?xml version="1.0" encoding="UTF-8"?>\n' |  | ||||||
|     + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n' |  | ||||||
|     + '    <teiHeader>\n' |  | ||||||
|     + '        <fileDesc>\n' |  | ||||||
|     + '            <titleStmt/>\n' |  | ||||||
|     + '            <publicationStmt/>\n' |  | ||||||
|     + '            <sourceDesc/>\n' |  | ||||||
|     + '        </fileDesc>\n' |  | ||||||
|     + '        <encodingDesc/>\n' |  | ||||||
|     + '        <profileDesc/>\n' |  | ||||||
|     + '    </teiHeader>\n' |  | ||||||
|     + '    <text>\n' |  | ||||||
|     + '        <body>\n' |  | ||||||
| ) |  | ||||||
| for index, input_file in enumerate(args.i): |  | ||||||
|     tree = ET.parse(input_file) |  | ||||||
|     output_file.write('            <pb n="%i"/>\n' % (index + 1)) |  | ||||||
|     for para in tree.findall('.//*[@class="ocr_par"]'): |  | ||||||
|         output_file.write('            <p>\n') |  | ||||||
|         for line in para.findall('.//*[@class="ocr_line"]'): |  | ||||||
|             first_word_in_line = True |  | ||||||
|             for word in line.findall('.//*[@class="ocrx_word"]'): |  | ||||||
|                 if word.text is not None: |  | ||||||
|                     output_file.write(('                ' if first_word_in_line else ' ') + escape(word.text.strip())) |  | ||||||
|                     first_word_in_line = False |  | ||||||
|             if not first_word_in_line: |  | ||||||
|                 output_file.write('<lb/>\n') |  | ||||||
|         output_file.write('            </p>\n') |  | ||||||
| output_file.write( |  | ||||||
|       '        </body>\n' |  | ||||||
|     + '    </text>\n' |  | ||||||
|     + '</TEI>') |  | ||||||
|  |  | ||||||
| output_file.close() |  | ||||||
							
								
								
									
										57
									
								
								wrapper/ocr
									
									
									
									
									
								
							
							
						
						
									
										57
									
								
								wrapper/ocr
									
									
									
									
									
								
							| @@ -1,39 +1,44 @@ | |||||||
| #!/usr/bin/env python3 | #!/usr/bin/env python3 | ||||||
| # coding=utf-8 | # coding=utf-8 | ||||||
|  |  | ||||||
| import argparse | from argparse import ArgumentParser | ||||||
| import os | import os | ||||||
| import subprocess | import subprocess | ||||||
|  | import sys | ||||||
|  |  | ||||||
| container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest' | CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0' | ||||||
| container_input_dir = '/input' | CONTAINER_INPUT_DIR = '/input' | ||||||
| container_output_dir = '/output' | CONTAINER_OUTPUT_DIR = '/output' | ||||||
| uid = str(os.getuid()) | CONTAINER_MODELS_DIR = '/usr/local/share/tessdata' | ||||||
| gid = str(os.getgid()) | CONTAINER_LOG_DIR = '/logs' | ||||||
|  | UID = str(os.getuid()) | ||||||
|  | GID = str(os.getgid()) | ||||||
|  |  | ||||||
| parser = argparse.ArgumentParser(add_help=False) | parser = ArgumentParser(add_help=False) | ||||||
| parser.add_argument( | parser.add_argument('-i', '--input-dir') | ||||||
|     '-i', | parser.add_argument('-o', '--output-dir') | ||||||
|     dest='input_dir', | parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+') | ||||||
|     required=False | parser.add_argument('--log-dir') | ||||||
| ) |  | ||||||
| parser.add_argument( |  | ||||||
|     '-o', |  | ||||||
|     dest='output_dir', |  | ||||||
|     required=False |  | ||||||
| ) |  | ||||||
| args, remaining_args = parser.parse_known_args() | args, remaining_args = parser.parse_known_args() | ||||||
|  |  | ||||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid] | cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}'] | ||||||
| if args.input_dir is not None: | if args.input_dir is not None: | ||||||
|     host_input_dir = os.path.abspath(args.input_dir) |     mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}' | ||||||
|     cmd += ['-v', host_input_dir + ':' + container_input_dir] |     cmd += ['-v', mapping] | ||||||
|     remaining_args += ['-i', container_input_dir] |     remaining_args += ['-i', CONTAINER_INPUT_DIR] | ||||||
| if args.output_dir is not None: | if args.output_dir is not None: | ||||||
|     host_output_dir = os.path.abspath(args.output_dir) |     mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' | ||||||
|     cmd += ['-v', host_output_dir + ':' + container_output_dir] |     cmd += ['-v', mapping] | ||||||
|     remaining_args += ['-o', container_output_dir] |     remaining_args += ['-o', CONTAINER_OUTPUT_DIR] | ||||||
| cmd.append(container_image) | if args.models is not None: | ||||||
|  |     for model in args.models: | ||||||
|  |         mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa | ||||||
|  |         cmd += ['-v', mapping] | ||||||
|  | if args.log_dir is not None: | ||||||
|  |     mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}' | ||||||
|  |     cmd += ['-v', mapping] | ||||||
|  |     remaining_args += ['--log-dir', CONTAINER_LOG_DIR] | ||||||
|  | cmd.append(CONTAINER_IMAGE) | ||||||
| cmd += remaining_args | cmd += remaining_args | ||||||
|  |  | ||||||
| subprocess.run(cmd) | sys.exit(subprocess.run(cmd).returncode) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user