mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-20 18:52:06 +00:00 
			
		
		
		
	Compare commits
	
		
			40 Commits
		
	
	
		
			1.0
			...
			c057d324cf
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | c057d324cf | ||
|  | f51a8c4546 | ||
|  | c640d9743f | ||
|  | e3fd679b38 | ||
|  | 8a3816121c | ||
|  | e1b78b6ba4 | ||
|  | a0760487ae | ||
|  | a798457c43 | ||
|  | e2da0fb839 | ||
|  | e78f667438 | ||
|  | 41f70da8eb | ||
|  | 6db7f70446 | ||
|  | 947658a7d8 | ||
|  | acbf61be05 | ||
|  | 104598039e | ||
|  | da29659a9b | ||
|  | 613bceb4ff | ||
|  | ca7df6d0ed | ||
|  | 07635dcdfa | ||
|  | c0069d5453 | ||
|  | e941f64ee4 | ||
|  | cb68d6de2d | ||
|  | 4b84488fe6 | ||
|  | 7d52ad9f68 | ||
|  | ac4b5c2fd8 | ||
|  | 6d90d43699 | ||
|  | 4bd0d3bb01 | ||
|  | 15061bfaaf | ||
|  | 7cc8ebd666 | ||
|  | 82285a8e6c | ||
|  | 7322a5bc7c | ||
|  | 2b63ba9e59 | ||
|  | aee9628e5e | ||
|  | ec5b4eb521 | ||
|  | b77ca5914f | ||
|  | 018939ae55 | ||
|  | 64fe706126 | ||
|  | a75b32ca1d | ||
|  | 364e3d626d | ||
|  | 36a86887b0 | 
| @@ -1,44 +1,68 @@ | ||||
| image: docker:stable | ||||
| image: docker:19.03.13 | ||||
|  | ||||
| services: | ||||
|   - docker:stable-dind | ||||
|  | ||||
| variables: | ||||
|   DOCKER_DRIVER: overlay2 | ||||
|   - docker:19.03.13-dind | ||||
|  | ||||
| stages: | ||||
|   - build | ||||
|   - push | ||||
|  | ||||
| before_script: | ||||
|   - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||
| variables: | ||||
|   DOCKER_TLS_CERTDIR: "/certs" | ||||
|   INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA | ||||
|  | ||||
| Build: | ||||
| .reg_setup: | ||||
|   before_script: | ||||
|     - apk add --no-cache curl | ||||
|     - curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg | ||||
|     - echo "$REG_SHA256  /usr/local/bin/reg" | sha256sum -c - | ||||
|     - chmod a+x /usr/local/bin/reg | ||||
|   variables: | ||||
|     REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228 | ||||
|     REG_VERSION: 0.16.1 | ||||
|  | ||||
| build_image: | ||||
|   script: | ||||
|     - docker build --pull -t $CI_REGISTRY_IMAGE:tmp . | ||||
|     - docker push $CI_REGISTRY_IMAGE:tmp | ||||
|     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||
|     - docker build -t $INTERMEDIATE_IMAGE_TAG . | ||||
|     - docker push $INTERMEDIATE_IMAGE_TAG | ||||
|   stage: build | ||||
|   tags: | ||||
|     - docker | ||||
|  | ||||
| Push latest: | ||||
| push_master: | ||||
|   extends: | ||||
|     - .reg_setup | ||||
|   only: | ||||
|     - master | ||||
|   script: | ||||
|     - docker pull $CI_REGISTRY_IMAGE:tmp | ||||
|     - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest | ||||
|     - docker push $CI_REGISTRY_IMAGE:latest | ||||
|     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||
|     - docker pull $INTERMEDIATE_IMAGE_TAG | ||||
|     - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG | ||||
|     - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG | ||||
|     - docker push $IMAGE_TAG | ||||
|   stage: push | ||||
|   tags: | ||||
|     - docker | ||||
|   variables: | ||||
|     IMAGE_TAG: $CI_REGISTRY_IMAGE:latest | ||||
|  | ||||
| Push tag: | ||||
| push_other: | ||||
|   extends: | ||||
|     - .reg_setup | ||||
|   except: | ||||
|     - master | ||||
|   only: | ||||
|     - branches | ||||
|     - tags | ||||
|   script: | ||||
|     - docker pull $CI_REGISTRY_IMAGE:tmp | ||||
|     - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME | ||||
|     - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME | ||||
|     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY | ||||
|     - docker pull $INTERMEDIATE_IMAGE_TAG | ||||
|     - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG | ||||
|     - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG | ||||
|     - docker push $IMAGE_TAG | ||||
|   stage: push | ||||
|   tags: | ||||
|     - docker | ||||
|   variables: | ||||
|     IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME | ||||
|   | ||||
							
								
								
									
										116
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										116
									
								
								Dockerfile
									
									
									
									
									
								
							| @@ -1,73 +1,85 @@ | ||||
| FROM debian:9-slim | ||||
| FROM debian:buster-slim | ||||
|  | ||||
|  | ||||
| # Define image metadata | ||||
| LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de" | ||||
| LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>" | ||||
|  | ||||
|  | ||||
| ENV LANG=C.UTF-8 | ||||
|  | ||||
|  | ||||
| # Install prerequisites | ||||
| RUN apt-get update \ | ||||
|  && apt-get install -y --no-install-recommends \ | ||||
|       apt-transport-https \ | ||||
|       ca-certificates \ | ||||
|       gnupg2 \ | ||||
|       imagemagick \ | ||||
|       poppler-utils \ | ||||
|       python2.7 \ | ||||
|       python3.5 \ | ||||
|  && apt-get install --no-install-recommends --yes \ | ||||
|       ghostscript \ | ||||
|       procps \ | ||||
|       python3.7 \ | ||||
|       python3-pip \ | ||||
|       rename \ | ||||
|       wget \ | ||||
|       zip \ | ||||
|  && rm -rf /var/lib/apt/lists/* | ||||
|  && python3 -m pip install lxml | ||||
|  | ||||
| ENV OCROPY_VERSION 1.3.3 | ||||
| ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" . | ||||
| RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \ | ||||
| # Install the OCR pipeline and its dependencies # | ||||
| ## Install pyFlow ## | ||||
| ENV PYFLOW_VERSION=1.1.20 | ||||
| RUN wget --no-check-certificate --quiet \ | ||||
|       "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \ | ||||
|  && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ | ||||
|  && cd "pyflow-${PYFLOW_VERSION}" \ | ||||
|  && apt-get install --no-install-recommends --yes \ | ||||
|       python2.7 \ | ||||
|  && python2.7 setup.py build install \ | ||||
|  && cd - > /dev/null \ | ||||
|  && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" | ||||
|  | ||||
|  | ||||
| ## Install ocropy ## | ||||
| ENV OCROPY_VERSION=1.3.3 | ||||
| RUN wget --no-check-certificate --quiet \ | ||||
|       "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \ | ||||
|  && tar -xzf "v${OCROPY_VERSION}.tar.gz" \ | ||||
|  && cd "ocropy-${OCROPY_VERSION}" \ | ||||
|  && apt-get update \ | ||||
|  && apt-get install -y --no-install-recommends \ | ||||
|  && apt-get install --no-install-recommends --yes \ | ||||
|       python2.7 \ | ||||
|       python-pil \ | ||||
|       python-tk \ | ||||
|       $(cat PACKAGES) \ | ||||
|  && rm -rf /var/lib/apt/lists/* \ | ||||
|  && python2.7 setup.py install \ | ||||
|  && cd .. \ | ||||
|  && rm -rf \ | ||||
|       "ocropy-${OCROPY_VERSION}" \ | ||||
|       "v${OCROPY_VERSION}.tar.gz" | ||||
|  | ||||
| ENV PYFLOW_VERSION=1.1.20 | ||||
| ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" . | ||||
| RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ | ||||
|  && cd "pyflow-${PYFLOW_VERSION}" \ | ||||
|  && python2.7 setup.py build install \ | ||||
|  && cd .. \ | ||||
|  && rm -rf \ | ||||
|       "pyflow-${PYFLOW_VERSION}" \ | ||||
|       "pyflow-${PYFLOW_VERSION}.tar.gz" | ||||
|  | ||||
| RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \ | ||||
|  && wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \ | ||||
|  && apt-get update \ | ||||
|  && apt-get install -y --no-install-recommends \ | ||||
|       tesseract-ocr  \ | ||||
|       tesseract-ocr-deu \ | ||||
|       tesseract-ocr-eng \ | ||||
|       tesseract-ocr-enm \ | ||||
|       tesseract-ocr-fra \ | ||||
|       tesseract-ocr-frk \ | ||||
|       tesseract-ocr-frm \ | ||||
|       tesseract-ocr-ita \ | ||||
|       tesseract-ocr-por \ | ||||
|       tesseract-ocr-spa \ | ||||
|  && rm -rf /var/lib/apt/lists/* | ||||
|  && cd - > /dev/null \ | ||||
|  && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz" | ||||
|  | ||||
|  | ||||
| # Install OCR pipeline | ||||
| COPY hocrtotei /usr/local/bin | ||||
| COPY ocr /usr/local/bin | ||||
| ## Install Tesseract OCR ## | ||||
| ENV TESSERACT_VERSION=5.0.0 | ||||
| RUN wget --no-check-certificate --quiet \ | ||||
|       "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \ | ||||
|  && tar -xzf "${TESSERACT_VERSION}.tar.gz" \ | ||||
|  && cd "tesseract-${TESSERACT_VERSION}" \ | ||||
|  && apt-get install --no-install-recommends --yes \ | ||||
|       autoconf \ | ||||
|       automake \ | ||||
|       g++ \ | ||||
|       libjpeg62-turbo-dev \ | ||||
|       libleptonica-dev \ | ||||
|       libtiff5-dev \ | ||||
|       libtool \ | ||||
|       libpng-dev \ | ||||
|       make \ | ||||
|       pkg-config \ | ||||
|       zlib1g-dev \ | ||||
|  && ./autogen.sh \ | ||||
|  && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \ | ||||
|  && make \ | ||||
|  && make install \ | ||||
|  && ldconfig \ | ||||
|  && cd - > /dev/null \ | ||||
|  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz" | ||||
|  | ||||
|  | ||||
| RUN rm -r /var/lib/apt/lists/* | ||||
|  | ||||
|  | ||||
| ## Install Pipeline ## | ||||
| COPY hocr2tei hocr-combine ocr /usr/local/bin/ | ||||
|  | ||||
|  | ||||
| ENTRYPOINT ["ocr"] | ||||
|   | ||||
							
								
								
									
										21
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| MIT License | ||||
|  | ||||
| Copyright (c) 2021 Bielefeld University - CRC 1288 - INF | ||||
|  | ||||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||||
| of this software and associated documentation files (the "Software"), to deal | ||||
| in the Software without restriction, including without limitation the rights | ||||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||||
| copies of the Software, and to permit persons to whom the Software is | ||||
| furnished to do so, subject to the following conditions: | ||||
|  | ||||
| The above copyright notice and this permission notice shall be included in all | ||||
| copies or substantial portions of the Software. | ||||
|  | ||||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
| SOFTWARE. | ||||
							
								
								
									
										123
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										123
									
								
								README.md
									
									
									
									
									
								
							| @@ -1,96 +1,33 @@ | ||||
| # OCR | ||||
| # OCR - Optical Character Recognition | ||||
|  | ||||
| ## Build image | ||||
| This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; a convenient wrapper script is provided for that purpose. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed. | ||||
|  | ||||
| 1. Clone this repository and navigate into it: | ||||
| ``` | ||||
| git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr | ||||
| ``` | ||||
|  | ||||
| 2. Build image: | ||||
| ``` | ||||
| docker build -t sfb1288inf/ocr:latest . | ||||
| ``` | ||||
|  | ||||
| Alternatively build from the GitLab repository without cloning: | ||||
|  | ||||
| 1. Build image: | ||||
| ``` | ||||
| docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git | ||||
| ``` | ||||
|  | ||||
| ## Download prebuilt image | ||||
|  | ||||
| The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers. | ||||
|  | ||||
| 1. Download image: | ||||
| ``` | ||||
| docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest | ||||
| ``` | ||||
|  | ||||
| ## Run | ||||
|  | ||||
| 1. Create input and output directories for the OCR software: | ||||
| ``` | ||||
| mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr | ||||
| ``` | ||||
|  | ||||
| 2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language. | ||||
|  | ||||
| 3. Start the OCR process. | ||||
| ``` | ||||
| docker run \ | ||||
|     --rm \ | ||||
|     -it \ | ||||
|     -u $(id -u $USER):$(id -g $USER) \ | ||||
|     -v /<mydatalocation>/files_for_ocr:/input \ | ||||
|     -v /<mydatalocation>/files_from_ocr:/output \ | ||||
|     sfb1288inf/ocr:latest \ | ||||
|         -i /input \ | ||||
|         -l <languagecode> \ | ||||
|         -o /output | ||||
| ``` | ||||
| The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part. | ||||
|  | ||||
| If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`. | ||||
|  | ||||
| 4. Check your results in the `/<mydatalocation>/files_from_ocr` directory. | ||||
|  | ||||
| ### OCR arguments | ||||
|  | ||||
| `-l languagecode` | ||||
| * Tells tesseract which language will be used. | ||||
| * options = deu (German), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), spa (Spanish) | ||||
| * required = True | ||||
|  | ||||
| `--keep-intermediates` | ||||
| * If set, all intermediate files created during the OCR process will be | ||||
| kept. | ||||
| * default = False | ||||
| * required = False | ||||
|  | ||||
| `--nCores corenumber` | ||||
| * Sets the number of CPU cores being used during the OCR process. | ||||
| * default = min(4, multiprocessing.cpu_count()) | ||||
| * required = False | ||||
|  | ||||
| `--skip-binarisation` | ||||
| * Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used. | ||||
| * default = False | ||||
|  | ||||
| Example with all arguments used: | ||||
| ``` | ||||
| docker run \ | ||||
|     --rm \ | ||||
|     -it \ | ||||
|     -u $(id -u $USER):$(id -g $USER) \ | ||||
|     -v "$HOME"/ocr/files_for_ocr:/input \ | ||||
|     -v "$HOME"/ocr/files_from_ocr:/output \ | ||||
|     sfb1288inf/ocr:latest \ | ||||
|         -i /input \ | ||||
|         -l eng \ | ||||
|         -o /output \ | ||||
|         --keep_intermediates \ | ||||
|         --nCores 8 \ | ||||
|         --skip-binarisation | ||||
| ## Software used in this pipeline implementation | ||||
|  | ||||
| - Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian | ||||
|   - Software from Debian Buster's free repositories | ||||
| - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3 | ||||
| - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 | ||||
| - Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0 | ||||
|  | ||||
| ## Installation | ||||
|  | ||||
| 1. Install Docker and Python 3. | ||||
| 2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git` | ||||
| 3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr` | ||||
| 4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`. | ||||
| 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`. | ||||
| 6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`. | ||||
|  | ||||
| ## Use the Pipeline | ||||
|  | ||||
| 1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language. | ||||
| 2. Clear your `/<my_data_location>/output` directory. | ||||
| 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details. | ||||
| ```bash | ||||
| cd /<my_data_location> | ||||
| ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments> | ||||
| # or | ||||
| ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments> | ||||
| ``` | ||||
| 4. Check your results in the `/<my_data_location>/output` directory. | ||||
|   | ||||
							
								
								
									
										35
									
								
								hocr-combine
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										35
									
								
								hocr-combine
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,35 @@ | ||||
#!/usr/bin/env python3.7
# coding=utf-8

"""Combine multiple hOCR files."""

from argparse import ArgumentParser
import sys


def collect_input_files(file_args):
    """Expand the positional arguments into a flat list of hOCR paths.

    An argument that starts with ``@`` names a text file containing one path
    per line (empty lines are ignored); every other argument is taken as a
    path itself. The order of the arguments is preserved.
    """
    # The accumulator must live outside the loop; re-creating it per argument
    # would drop everything except the last argument.
    files = []
    for file_arg in file_args:
        if file_arg.startswith('@'):
            with open(file_arg[1:], 'r') as f:
                files += [x for x in f.read().split('\n') if x != '']
        else:
            files.append(file_arg)
    return files


def combine(files, output_file):
    """Append the ``ocr_page`` divs of ``files[1:]`` to the body of ``files[0]``.

    The combined document is written to ``output_file`` as UTF-8 HTML.
    """
    # Imported lazily so that argument handling (e.g. ``--help``) does not
    # require lxml to be importable.
    from lxml import html

    hocr = html.parse(files[0])
    hocr_body = hocr.find('body')
    for file in files[1:]:
        # Relative path: an absolute '//div' path on an ElementTree triggers
        # an lxml FutureWarning.
        for ocr_page in html.parse(file).findall('.//div[@class="ocr_page"]'):
            hocr_body.append(ocr_page)

    with open(output_file, 'wb') as f:
        hocr.write(f, encoding='UTF-8', method='html')


def main():
    """Parse the command line and combine the given hOCR files."""
    parser = ArgumentParser(description='Combine multiple hOCR files.')
    parser.add_argument('file', help='Input file(s)', nargs='+')
    parser.add_argument('-o', '--output-file', help='Output file',
                        required=True)
    args = parser.parse_args()

    files = collect_input_files(args.file)
    if not files:
        # Exit status 1, as before, but say why on stderr.
        sys.exit('hocr-combine: no input files given')
    combine(files, args.output_file)


if __name__ == '__main__':
    main()
							
								
								
									
										58
									
								
								hocr2tei
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										58
									
								
								hocr2tei
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,58 @@ | ||||
#!/usr/bin/env python3.7
# coding=utf-8

"""Convert hOCR to TEI XML."""

from argparse import ArgumentParser
from xml.sax.saxutils import escape
import re


TEI_HEADER = (
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    '  <teiHeader>\n'
    '    <fileDesc>\n'
    '      <titleStmt>\n'
    '        <title></title>\n'
    '      </titleStmt>\n'
    '      <publicationStmt>\n'
    '        <p></p>\n'
    '      </publicationStmt>\n'
    '      <sourceDesc>\n'
    '        <p></p>\n'
    '      </sourceDesc>\n'
    '    </fileDesc>\n'
    '  </teiHeader>\n'
    '  <text>\n'
    '    <body>\n'
)
TEI_FOOTER = (
    '    </body>\n'
    '  </text>\n'
    '</TEI>\n'
)
IMAGE_PATTERN = re.compile(r'image \"(.*?)\"')
PAGE_NUMBER_PATTERN = re.compile(r'ppageno (\d+)')


def page_break(title):
    """Return the ``<pb/>`` line for an ``ocr_page`` title attribute.

    ``facs`` is taken from the ``image "..."`` property and ``n`` from the
    ``ppageno`` property of the title. A property that is absent (or a title
    that is ``None``) simply omits the corresponding attribute.
    """
    title = title or ''
    attributes = []
    image = IMAGE_PATTERN.search(title)
    if image is not None:
        # The file name ends up inside a double-quoted XML attribute, so
        # markup characters in it must be escaped.
        facsimile = escape(image.group(1), {'"': '&quot;'})
        attributes.append(f'facs="{facsimile}"')
    page_number = PAGE_NUMBER_PATTERN.search(title)
    if page_number is not None:
        attributes.append(f'n="{page_number.group(1)}"')
    return '      <pb' + ''.join(f' {a}' for a in attributes) + '/>\n'


def hocr_to_tei(hocr):
    """Return the TEI XML text for a parsed hOCR document.

    ``hocr`` is any element or tree offering ``findall``. Each ``ocr_page``
    div becomes a ``<pb/>``, each ``ocr_par`` paragraph a ``<p>`` and each
    ``ocr_line`` an ``<lb/>`` followed by the escaped, space separated text
    of its ``ocrx_word`` spans.
    """
    parts = [TEI_HEADER]
    for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
        parts.append(page_break(ocr_page.attrib.get('title')))
        for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
            parts.append('      <p>\n')
            for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
                words = [
                    escape(ocrx_word.text)
                    for ocrx_word
                    in ocr_line.findall('.//span[@class="ocrx_word"]')
                    if ocrx_word.text is not None
                ]
                parts.append('        <lb/>' + ' '.join(words) + '\n')
            parts.append('      </p>\n')
    parts.append(TEI_FOOTER)
    return ''.join(parts)


def main():
    """Parse the command line and convert the given hOCR file."""
    parser = ArgumentParser(description='Convert hOCR to TEI XML.')
    parser.add_argument('file', help='Input file')
    parser.add_argument('-o', '--output-file', help='Output file',
                        required=True)
    args = parser.parse_args()

    # Imported lazily so that argument handling (e.g. ``--help``) does not
    # require lxml to be importable.
    from lxml import html

    tei = hocr_to_tei(html.parse(args.file))
    # Explicit encoding: OCR output is routinely non-ASCII and must not
    # depend on the locale of the calling environment.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        f.write(tei)


if __name__ == '__main__':
    main()
							
								
								
									
										58
									
								
								hocrtotei
									
									
									
									
									
								
							
							
						
						
									
										58
									
								
								hocrtotei
									
									
									
									
									
								
							| @@ -1,58 +0,0 @@ | ||||
#!/usr/bin/env python3.5
# coding=utf-8

"""Merge several hOCR files into one TEI XML file."""

from xml.sax.saxutils import escape
import argparse
import xml.etree.ElementTree as ET


TEI_HEADER = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
    '    <teiHeader>\n'
    '        <fileDesc>\n'
    '            <titleStmt/>\n'
    '            <publicationStmt/>\n'
    '            <sourceDesc/>\n'
    '        </fileDesc>\n'
    '        <encodingDesc/>\n'
    '        <profileDesc/>\n'
    '    </teiHeader>\n'
    '    <text>\n'
    '        <body>\n'
)
TEI_FOOTER = (
    '        </body>\n'
    '    </text>\n'
    '</TEI>'
)


def write_tei(input_files, output_file):
    """Write the TEI document for ``input_files`` to ``output_file``.

    ``input_files`` are hOCR sources (paths or file objects accepted by
    ``ET.parse``); each one yields a ``<pb n="..."/>`` numbered from 1 in the
    given order. ``output_file`` is a writable text file object. Every
    ``ocr_par`` becomes a ``<p>``; every ``ocr_line`` with at least one
    non-empty ``ocrx_word`` becomes one line of escaped, space separated
    words terminated by ``<lb/>``.
    """
    output_file.write(TEI_HEADER)
    for page_number, input_file in enumerate(input_files, 1):
        tree = ET.parse(input_file)
        output_file.write('            <pb n="%i"/>\n' % page_number)
        for para in tree.findall('.//*[@class="ocr_par"]'):
            output_file.write('            <p>\n')
            for line in para.findall('.//*[@class="ocr_line"]'):
                words = [
                    escape(word.text.strip())
                    for word in line.findall('.//*[@class="ocrx_word"]')
                    if word.text is not None
                ]
                # Lines without any word text produce no output at all.
                if words:
                    output_file.write(
                        '                ' + ' '.join(words) + '<lb/>\n'
                    )
            output_file.write('            </p>\n')
    output_file.write(TEI_FOOTER)


def main():
    """Parse the command line and write the TEI result file."""
    parser = argparse.ArgumentParser(
        description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
    )
    parser.add_argument(
        'i',
        metavar='hOCR-sourcefile',
        nargs='+'
    )
    parser.add_argument(
        'o',
        metavar='TEI-destfile',
    )
    args = parser.parse_args()

    # The XML declaration promises UTF-8, so write UTF-8 regardless of the
    # locale; the context manager closes the file even if parsing fails.
    with open(args.o, 'w', encoding='utf-8') as output_file:
        write_tei(args.i, output_file)


if __name__ == '__main__':
    main()
							
								
								
									
										57
									
								
								wrapper/ocr
									
									
									
									
									
								
							
							
						
						
									
										57
									
								
								wrapper/ocr
									
									
									
									
									
								
							| @@ -1,39 +1,44 @@ | ||||
| #!/usr/bin/env python3 | ||||
| # coding=utf-8 | ||||
|  | ||||
| import argparse | ||||
| from argparse import ArgumentParser | ||||
| import os | ||||
| import subprocess | ||||
| import sys | ||||
|  | ||||
| container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest' | ||||
| container_input_dir = '/input' | ||||
| container_output_dir = '/output' | ||||
| uid = str(os.getuid()) | ||||
| gid = str(os.getgid()) | ||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0' | ||||
| CONTAINER_INPUT_DIR = '/input' | ||||
| CONTAINER_OUTPUT_DIR = '/output' | ||||
| CONTAINER_MODELS_DIR = '/usr/local/share/tessdata' | ||||
| CONTAINER_LOG_DIR = '/logs' | ||||
| UID = str(os.getuid()) | ||||
| GID = str(os.getgid()) | ||||
|  | ||||
| parser = argparse.ArgumentParser(add_help=False) | ||||
| parser.add_argument( | ||||
|     '-i', | ||||
|     dest='input_dir', | ||||
|     required=False | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-o', | ||||
|     dest='output_dir', | ||||
|     required=False | ||||
| ) | ||||
| parser = ArgumentParser(add_help=False) | ||||
| parser.add_argument('-i', '--input-dir') | ||||
| parser.add_argument('-o', '--output-dir') | ||||
| parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+') | ||||
| parser.add_argument('--log-dir') | ||||
| args, remaining_args = parser.parse_known_args() | ||||
|  | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid] | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}'] | ||||
| if args.input_dir is not None: | ||||
|     host_input_dir = os.path.abspath(args.input_dir) | ||||
|     cmd += ['-v', host_input_dir + ':' + container_input_dir] | ||||
|     remaining_args += ['-i', container_input_dir] | ||||
|     mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}' | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['-i', CONTAINER_INPUT_DIR] | ||||
| if args.output_dir is not None: | ||||
|     host_output_dir = os.path.abspath(args.output_dir) | ||||
|     cmd += ['-v', host_output_dir + ':' + container_output_dir] | ||||
|     remaining_args += ['-o', container_output_dir] | ||||
| cmd.append(container_image) | ||||
|     mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['-o', CONTAINER_OUTPUT_DIR] | ||||
| if args.models is not None: | ||||
|     for model in args.models: | ||||
|         mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa | ||||
|         cmd += ['-v', mapping] | ||||
if args.log_dir is not None:
    # Mount the host log directory into the container. The string must be an
    # f-string; without the prefix Docker is handed the literal text
    # '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}' as the volume.
    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
| cmd.append(CONTAINER_IMAGE) | ||||
| cmd += remaining_args | ||||
|  | ||||
| subprocess.run(cmd) | ||||
| sys.exit(subprocess.run(cmd).returncode) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user