mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-18 19:30:34 +00:00
Compare commits
2 Commits
c057d324cf
...
aeab9b7802
Author | SHA1 | Date | |
---|---|---|---|
|
aeab9b7802 | ||
|
00c4b17018 |
@@ -14,10 +14,10 @@ This software implements a heavily parallelized pipeline to recognize text in PD
|
|||||||
|
|
||||||
1. Install Docker and Python 3.
|
1. Install Docker and Python 3.
|
||||||
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
|
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
|
||||||
2. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
|
3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
|
||||||
2. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
|
4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
|
||||||
3. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
|
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
|
||||||
4. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
|
6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
|
||||||
|
|
||||||
## Use the Pipeline
|
## Use the Pipeline
|
||||||
|
|
||||||
|
12
ocr
12
ocr
@@ -385,7 +385,6 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
self.input_dir = input_dir
|
self.input_dir = input_dir
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
self.output_dir = output_dir
|
self.output_dir = output_dir
|
||||||
self.output_files = []
|
|
||||||
self.binarize = binarize
|
self.binarize = binarize
|
||||||
self.jobs = []
|
self.jobs = []
|
||||||
|
|
||||||
@@ -528,33 +527,34 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
create_txt_tasks.append(task)
|
create_txt_tasks.append(task)
|
||||||
|
|
||||||
self.waitForTasks()
|
self.waitForTasks()
|
||||||
|
output_files = []
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
# Remove temporary directory
|
# Remove temporary directory
|
||||||
os.rmdir(job.tmp_dir)
|
os.rmdir(job.tmp_dir)
|
||||||
# Track output files
|
# Track output files
|
||||||
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
|
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
|
||||||
self.output_files.append(
|
output_files.append(
|
||||||
{
|
{
|
||||||
'description': 'Post correction package (.png and .hocr).',
|
'description': 'Post correction package (.png and .hocr).',
|
||||||
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
|
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
|
||||||
'mimetype': 'application/zip'
|
'mimetype': 'application/zip'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.output_files.append(
|
output_files.append(
|
||||||
{
|
{
|
||||||
'description': 'PDF file with text layer.',
|
'description': 'PDF file with text layer.',
|
||||||
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
|
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
|
||||||
'mimetype': 'application/pdf'
|
'mimetype': 'application/pdf'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.output_files.append(
|
output_files.append(
|
||||||
{
|
{
|
||||||
'description': 'Plain text file.',
|
'description': 'Plain text file.',
|
||||||
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
|
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
|
||||||
'mimetype': 'text/plain'
|
'mimetype': 'text/plain'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.output_files.append(
|
output_files.append(
|
||||||
{
|
{
|
||||||
'description': 'TEI compliant XML file.',
|
'description': 'TEI compliant XML file.',
|
||||||
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
|
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
|
||||||
@@ -562,7 +562,7 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
|
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
|
||||||
json.dump(self.output_files, f, indent=4)
|
json.dump(output_files, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user