Compare commits

...

2 Commits

Author SHA1 Message Date
Patrick Jentsch
aeab9b7802 Fix enumeration in readme 2022-01-18 13:46:52 +01:00
Patrick Jentsch
00c4b17018 Codestyle update 2022-01-18 13:45:17 +01:00
2 changed files with 10 additions and 10 deletions

View File

@@ -14,10 +14,10 @@ This software implements a heavily parallelized pipeline to recognize text in PD
1. Install Docker and Python 3. 1. Install Docker and Python 3.
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git` 2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
2. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr` 3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
2. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`. 4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
3. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`. 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
4. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`. 6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
## Use the Pipeline ## Use the Pipeline

12
ocr
View File

@@ -385,7 +385,6 @@ class MainWorkflow(WorkflowRunner):
self.input_dir = input_dir self.input_dir = input_dir
self.lang = lang self.lang = lang
self.output_dir = output_dir self.output_dir = output_dir
self.output_files = []
self.binarize = binarize self.binarize = binarize
self.jobs = [] self.jobs = []
@@ -528,33 +527,34 @@ class MainWorkflow(WorkflowRunner):
create_txt_tasks.append(task) create_txt_tasks.append(task)
self.waitForTasks() self.waitForTasks()
output_files = []
for job in self.jobs: for job in self.jobs:
# Remove temporary directory # Remove temporary directory
os.rmdir(job.tmp_dir) os.rmdir(job.tmp_dir)
# Track output files # Track output files
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
self.output_files.append( output_files.append(
{ {
'description': 'Post correction package (.png and .hocr).', 'description': 'Post correction package (.png and .hocr).',
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa 'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
'mimetype': 'application/zip' 'mimetype': 'application/zip'
} }
) )
self.output_files.append( output_files.append(
{ {
'description': 'PDF file with text layer.', 'description': 'PDF file with text layer.',
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa 'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
'mimetype': 'application/pdf' 'mimetype': 'application/pdf'
} }
) )
self.output_files.append( output_files.append(
{ {
'description': 'Plain text file.', 'description': 'Plain text file.',
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa 'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
'mimetype': 'text/plain' 'mimetype': 'text/plain'
} }
) )
self.output_files.append( output_files.append(
{ {
'description': 'TEI compliant XML file.', 'description': 'TEI compliant XML file.',
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa 'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
@@ -562,7 +562,7 @@ class MainWorkflow(WorkflowRunner):
} }
) )
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
json.dump(self.output_files, f, indent=4) json.dump(output_files, f, indent=4)
def parse_args(): def parse_args():