31 Commits

Author SHA1 Message Date
48fe7c0702 Slight changes to services description 2024-06-20 12:53:48 +02:00
5a2723b617 updates and restructuring 2024-03-26 15:29:26 +01:00
4425d50140 more manual updates 2024-03-19 17:33:37 +01:00
39113a6f17 manual sections 01, 02, 06 2024-03-14 17:07:53 +01:00
a53f1d216b manual v.a. dashboard, services 2024-03-14 09:35:22 +01:00
ffd7a3ad91 Manual Ergänzungen Intro /Getting Started 2024-03-05 15:41:17 +01:00
5dce269736 Version number + original slogan font 2023-12-18 12:49:30 +01:00
13369296d3 rename docker-entrypoint.sh to docker-nopaque-entrypoint.sh 2023-12-15 13:56:03 +01:00
4f6e1c121f Add nopaque version config variable 2023-12-15 08:47:59 +01:00
438a257fe3 Update CI script 2023-12-15 08:47:46 +01:00
2e88d7d035 Merge branch 'query-builder' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into query-builder 2023-12-15 08:37:02 +01:00
b338c33d42 Bump cwb version 2023-12-15 08:36:50 +01:00
d6cebddd92 Updated query builder gifs and instructions 2023-12-12 14:56:08 +01:00
07fda0e95a Merge branch 'query-builder' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into query-builder 2023-12-07 22:35:41 +01:00
3927d9e4cd Edits in structural attributes section and others 2023-12-07 22:34:00 +01:00
8f5d5ffdec Merge branch 'query-builder' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into query-builder 2023-12-07 12:46:48 +01:00
f02d1619e2 Try to implement anchor tags 2023-12-07 12:46:37 +01:00
892f1f799e Merge branch 'query-builder' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into query-builder 2023-12-05 15:00:49 +01:00
f5e98ae655 Add badges to README 2023-12-05 15:00:21 +01:00
f790106e0e Merge branch 'query-builder' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into query-builder 2023-12-05 14:54:05 +01:00
c57acc73d2 Manual changes 2023-12-05 14:42:38 +01:00
678a0767b7 Change Manual icon 2023-11-30 11:21:39 +01:00
17a9338d9f Fix job deletion from job page 2023-11-29 16:11:14 +01:00
a7cbce1eda Fix wrong spacy-nlp-pipeline version number 2023-11-29 10:45:35 +01:00
fa28c875e1 Merge branch 'query-builder' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into query-builder 2023-11-28 12:40:05 +01:00
0927edcceb Bug Fixes 2023-11-28 12:39:54 +01:00
9c22370eea Implement force download parameter in model insert_defaults methods 2023-11-28 12:10:55 +01:00
bdcc80a66f Add new tesseract-ocr-pipeline version. Remove redundant spacy-nlp-pipeline version. 2023-11-28 10:34:30 +01:00
9be5ce6014 link logo to homepage 2023-11-23 13:32:54 +01:00
00e4c3ade3 Add logo to sidenav 2023-11-23 13:26:19 +01:00
79a16cae83 Add links to my profile page 2023-11-23 13:16:21 +01:00
39 changed files with 695 additions and 275 deletions

View File

@ -8,6 +8,6 @@
!.flaskenv !.flaskenv
!boot.sh !boot.sh
!config.py !config.py
!docker-entrypoint.sh !docker-nopaque-entrypoint.sh
!nopaque.py !nopaque.py
!requirements.txt !requirements.txt

View File

@ -1,3 +1,37 @@
include:
- template: Security/Container-Scanning.gitlab-ci.yml
##############################################################################
# Pipeline stages in order of execution #
##############################################################################
stages:
- build
- publish
- sca
##############################################################################
# Pipeline behavior #
##############################################################################
workflow:
rules:
# Run the pipeline on commits to the default branch
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
variables:
# Set the Docker image tag to `latest`
DOCKER_IMAGE: $CI_REGISTRY_IMAGE:latest
when: always
# Run the pipeline on tag creation
- if: $CI_COMMIT_TAG
variables:
# Set the Docker image tag to the Git tag name
DOCKER_IMAGE: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
when: always
# Don't run the pipeline on all other occasions
- when: never
##############################################################################
# Default values for pipeline jobs #
##############################################################################
default: default:
image: docker:24.0.6 image: docker:24.0.6
services: services:
@ -5,38 +39,46 @@ default:
tags: tags:
- docker - docker
##############################################################################
# CI/CD variables for all jobs in the pipeline #
##############################################################################
variables: variables:
DOCKER_TLS_CERTDIR: /certs DOCKER_TLS_CERTDIR: /certs
DOCKER_BUILD_PATH: .
DOCKERFILE: Dockerfile
build_image: ##############################################################################
# Pipeline jobs #
##############################################################################
build:
stage: build stage: build
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: on_success
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
- if: $CI_COMMIT_TAG
when: "on_success"
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
- when: never
before_script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
script: script:
- docker build -t $IMAGE_TAG . - docker build --tag $DOCKER_IMAGE --file $DOCKERFILE $DOCKER_BUILD_PATH
- docker push $IMAGE_TAG - docker save $DOCKER_IMAGE > docker_image.tar
artifacts:
paths:
- docker_image.tar
include: publish:
- template: Security/Container-Scanning.gitlab-ci.yml stage: publish
before_script:
- docker login --username gitlab-ci-token --password $CI_JOB_TOKEN $CI_REGISTRY
script:
- docker load --input docker_image.tar
- docker push $DOCKER_IMAGE
after_script:
- docker logout $CI_REGISTRY
container_scanning: container_scanning:
stage: sca
rules: rules:
# Run the job on commits to the default branch
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: on_success when: always
variables: # Run the job on tag creation
CS_IMAGE: $CI_REGISTRY_IMAGE:latest
- if: $CI_COMMIT_TAG - if: $CI_COMMIT_TAG
when: on_success when: always
variables: # Don't run the job on all other occasions
CS_IMAGE: ${CI_REGISTRY_IMAGE}:${CI_COMMIT_REF_NAME}
- when: never - when: never
variables:
CS_IMAGE: $DOCKER_IMAGE

View File

@ -17,9 +17,6 @@ RUN apt-get update \
&& rm --recursive /var/lib/apt/lists/* && rm --recursive /var/lib/apt/lists/*
COPY docker-entrypoint.sh /usr/local/bin/
RUN useradd --create-home --no-log-init nopaque \ RUN useradd --create-home --no-log-init nopaque \
&& groupadd docker \ && groupadd docker \
&& usermod --append --groups docker nopaque && usermod --append --groups docker nopaque
@ -47,7 +44,10 @@ RUN python3 -m pip install --requirement requirements.txt \
USER root USER root
COPY docker-nopaque-entrypoint.sh /usr/local/bin/
EXPOSE 5000 EXPOSE 5000
ENTRYPOINT ["docker-entrypoint.sh"] ENTRYPOINT ["docker-nopaque-entrypoint.sh"]

View File

@ -1,5 +1,8 @@
# nopaque # nopaque
![release badge](https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque/-/badges/release.svg)
![pipeline badge](https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque/badges/master/pipeline.svg?ignore_skipped=true)
nopaque bundles various tools and services that provide humanities scholars with DH methods and thus can support their various individual research processes. Using nopaque, researchers can subject digitized sources to Optical Character Recognition (OCR). The resulting text files can then be used as a data basis for Natural Language Processing (NLP). The texts are automatically subjected to various linguistic annotations. The data processed via NLP can then be summarized in the web application as corpora and analyzed by means of an information retrieval system through complex search queries. The range of functions of the web application will be successively extended according to the needs of the researchers. nopaque bundles various tools and services that provide humanities scholars with DH methods and thus can support their various individual research processes. Using nopaque, researchers can subject digitized sources to Optical Character Recognition (OCR). The resulting text files can then be used as a data basis for Natural Language Processing (NLP). The texts are automatically subjected to various linguistic annotations. The data processed via NLP can then be summarized in the web application as corpora and analyzed by means of an information retrieval system through complex search queries. The range of functions of the web application will be successively extended according to the needs of the researchers.
## Prerequisites and requirements ## Prerequisites and requirements

View File

@ -120,7 +120,6 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'German' - title: 'German'
description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.' description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.'
url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz'
@ -132,7 +131,6 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'Greek' - title: 'Greek'
description: 'Greek pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner, attribute_ruler.' description: 'Greek pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner, attribute_ruler.'
url: 'https://github.com/explosion/spacy-models/releases/download/el_core_news_md-3.4.0/el_core_news_md-3.4.0.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/el_core_news_md-3.4.0/el_core_news_md-3.4.0.tar.gz'
@ -144,7 +142,6 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'English' - title: 'English'
description: 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.' description: 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.'
url: 'https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz'
@ -156,7 +153,6 @@
version: '3.4.1' version: '3.4.1'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'Spanish' - title: 'Spanish'
description: 'Spanish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' description: 'Spanish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.'
url: 'https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.4.0/es_core_news_md-3.4.0.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.4.0/es_core_news_md-3.4.0.tar.gz'
@ -168,7 +164,6 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'French' - title: 'French'
description: 'French pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' description: 'French pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.'
url: 'https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.4.0/fr_core_news_md-3.4.0.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.4.0/fr_core_news_md-3.4.0.tar.gz'
@ -180,7 +175,6 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'Italian' - title: 'Italian'
description: 'Italian pipeline optimized for CPU. Components: tok2vec, morphologizer, tagger, parser, lemmatizer (trainable_lemmatizer), senter, ner' description: 'Italian pipeline optimized for CPU. Components: tok2vec, morphologizer, tagger, parser, lemmatizer (trainable_lemmatizer), senter, ner'
url: 'https://github.com/explosion/spacy-models/releases/download/it_core_news_md-3.4.0/it_core_news_md-3.4.0.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/it_core_news_md-3.4.0/it_core_news_md-3.4.0.tar.gz'
@ -192,7 +186,6 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'Polish' - title: 'Polish'
description: 'Polish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, lemmatizer (trainable_lemmatizer), tagger, senter, ner.' description: 'Polish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, lemmatizer (trainable_lemmatizer), tagger, senter, ner.'
url: 'https://github.com/explosion/spacy-models/releases/download/pl_core_news_md-3.4.0/pl_core_news_md-3.4.0.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/pl_core_news_md-3.4.0/pl_core_news_md-3.4.0.tar.gz'
@ -204,7 +197,6 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'Russian' - title: 'Russian'
description: 'Russian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' description: 'Russian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.'
url: 'https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.4.0/ru_core_news_md-3.4.0.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.4.0/ru_core_news_md-3.4.0.tar.gz'
@ -216,7 +208,6 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'Chinese' - title: 'Chinese'
description: 'Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.' description: 'Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.'
url: 'https://github.com/explosion/spacy-models/releases/download/zh_core_web_md-3.4.0/zh_core_web_md-3.4.0.tar.gz' url: 'https://github.com/explosion/spacy-models/releases/download/zh_core_web_md-3.4.0/zh_core_web_md-3.4.0.tar.gz'
@ -228,4 +219,3 @@
version: '3.4.0' version: '3.4.0'
compatible_service_versions: compatible_service_versions:
- '0.1.1' - '0.1.1'
- '0.1.2'

View File

@ -9,6 +9,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Amharic' # - title: 'Amharic'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata'
@ -20,6 +21,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Arabic' - title: 'Arabic'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata'
@ -31,6 +33,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Assamese' # - title: 'Assamese'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata'
@ -42,6 +45,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Azerbaijani' # - title: 'Azerbaijani'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata'
@ -53,6 +57,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Azerbaijani - Cyrillic' # - title: 'Azerbaijani - Cyrillic'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata'
@ -64,6 +69,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Belarusian' # - title: 'Belarusian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata'
@ -75,6 +81,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Bengali' # - title: 'Bengali'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata'
@ -86,6 +93,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Tibetan' # - title: 'Tibetan'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata'
@ -97,6 +105,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Bosnian' # - title: 'Bosnian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata'
@ -108,6 +117,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Bulgarian' # - title: 'Bulgarian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata'
@ -119,6 +129,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Catalan; Valencian' # - title: 'Catalan; Valencian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata'
@ -130,6 +141,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Cebuano' # - title: 'Cebuano'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata'
@ -141,6 +153,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Czech' # - title: 'Czech'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata'
@ -152,6 +165,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Chinese - Simplified' # - title: 'Chinese - Simplified'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata'
@ -163,6 +177,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Chinese - Traditional' - title: 'Chinese - Traditional'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata'
@ -174,6 +189,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Cherokee' # - title: 'Cherokee'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata'
@ -185,6 +201,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Welsh' # - title: 'Welsh'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata'
@ -196,6 +213,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Danish' - title: 'Danish'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata'
@ -207,6 +225,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'German' - title: 'German'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata'
@ -218,6 +237,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Dzongkha' # - title: 'Dzongkha'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata'
@ -229,6 +249,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Greek, Modern (1453-)' - title: 'Greek, Modern (1453-)'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata'
@ -240,6 +261,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'English' - title: 'English'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata'
@ -251,6 +273,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'English, Middle (1100-1500)' - title: 'English, Middle (1100-1500)'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata'
@ -262,6 +285,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Esperanto' # - title: 'Esperanto'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata'
@ -273,6 +297,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Estonian' # - title: 'Estonian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata'
@ -284,6 +309,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Basque' # - title: 'Basque'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata'
@ -295,6 +321,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Persian' # - title: 'Persian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata'
@ -306,6 +333,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Finnish' # - title: 'Finnish'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata'
@ -317,6 +345,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'French' - title: 'French'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata'
@ -328,6 +357,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'German Fraktur' - title: 'German Fraktur'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata'
@ -339,6 +369,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'French, Middle (ca. 1400-1600)' - title: 'French, Middle (ca. 1400-1600)'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata'
@ -350,6 +381,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Irish' # - title: 'Irish'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata'
@ -361,6 +393,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Galician' # - title: 'Galician'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata'
@ -372,6 +405,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Greek, Ancient (-1453)' - title: 'Greek, Ancient (-1453)'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata'
@ -383,6 +417,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Gujarati' # - title: 'Gujarati'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata'
@ -394,6 +429,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Haitian; Haitian Creole' # - title: 'Haitian; Haitian Creole'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata'
@ -405,6 +441,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Hebrew' # - title: 'Hebrew'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata'
@ -416,6 +453,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Hindi' # - title: 'Hindi'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata'
@ -427,6 +465,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Croatian' # - title: 'Croatian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata'
@ -438,6 +477,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Hungarian' # - title: 'Hungarian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata'
@ -449,6 +489,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Inuktitut' # - title: 'Inuktitut'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata'
@ -460,6 +501,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Indonesian' # - title: 'Indonesian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata'
@ -471,6 +513,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Icelandic' # - title: 'Icelandic'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata'
@ -482,6 +525,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Italian' - title: 'Italian'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata'
@ -493,6 +537,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'Italian - Old' - title: 'Italian - Old'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata'
@ -504,6 +549,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Javanese' # - title: 'Javanese'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata'
@ -515,6 +561,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Japanese' # - title: 'Japanese'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata'
@ -526,6 +573,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Kannada' # - title: 'Kannada'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata'
@ -537,6 +585,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Georgian' # - title: 'Georgian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata'
@ -548,6 +597,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Georgian - Old' # - title: 'Georgian - Old'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata'
@ -559,6 +609,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Kazakh' # - title: 'Kazakh'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata'
@ -570,6 +621,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Central Khmer' # - title: 'Central Khmer'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata'
@ -581,6 +633,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Kirghiz; Kyrgyz' # - title: 'Kirghiz; Kyrgyz'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata'
@ -592,6 +645,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Korean' # - title: 'Korean'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata'
@ -603,6 +657,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Kurdish' # - title: 'Kurdish'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata'
@ -614,6 +669,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Lao' # - title: 'Lao'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata'
@ -625,6 +681,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Latin' # - title: 'Latin'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata'
@ -636,6 +693,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Latvian' # - title: 'Latvian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata'
@ -647,6 +705,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Lithuanian' # - title: 'Lithuanian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata'
@ -658,6 +717,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Malayalam' # - title: 'Malayalam'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata'
@ -669,6 +729,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Marathi' # - title: 'Marathi'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata'
@ -680,6 +741,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Macedonian' # - title: 'Macedonian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata'
@ -691,6 +753,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Maltese' # - title: 'Maltese'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata'
@ -702,6 +765,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Malay' # - title: 'Malay'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata'
@ -713,6 +777,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Burmese' # - title: 'Burmese'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata'
@ -724,6 +789,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Nepali' # - title: 'Nepali'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata'
@ -735,6 +801,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Dutch; Flemish' # - title: 'Dutch; Flemish'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata'
@ -746,6 +813,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Norwegian' # - title: 'Norwegian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata'
@ -757,6 +825,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Oriya' # - title: 'Oriya'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata'
@ -768,6 +837,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Panjabi; Punjabi' # - title: 'Panjabi; Punjabi'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata'
@ -779,6 +849,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Polish' # - title: 'Polish'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata'
@ -790,6 +861,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Portuguese' - title: 'Portuguese'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata'
@ -801,6 +873,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Pushto; Pashto' # - title: 'Pushto; Pashto'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata'
@ -812,6 +885,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Romanian; Moldavian; Moldovan' # - title: 'Romanian; Moldavian; Moldovan'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata'
@ -823,6 +897,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Russian' - title: 'Russian'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata'
@ -834,6 +909,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Sanskrit' # - title: 'Sanskrit'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata'
@ -845,6 +921,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Sinhala; Sinhalese' # - title: 'Sinhala; Sinhalese'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata'
@ -856,6 +933,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Slovak' # - title: 'Slovak'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata'
@ -867,6 +945,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Slovenian' # - title: 'Slovenian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata'
@ -878,6 +957,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
- title: 'Spanish; Castilian' - title: 'Spanish; Castilian'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata'
@ -889,6 +969,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
- title: 'Spanish; Castilian - Old' - title: 'Spanish; Castilian - Old'
description: '' description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata'
@ -900,6 +981,7 @@
compatible_service_versions: compatible_service_versions:
- '0.1.0' - '0.1.0'
- '0.1.1' - '0.1.1'
- '0.1.2'
# - title: 'Albanian' # - title: 'Albanian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata'
@ -911,6 +993,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Serbian' # - title: 'Serbian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata'
@ -922,6 +1005,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Serbian - Latin' # - title: 'Serbian - Latin'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata'
@ -933,6 +1017,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Swahili' # - title: 'Swahili'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata'
@ -944,6 +1029,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Swedish' # - title: 'Swedish'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata'
@ -955,6 +1041,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Syriac' # - title: 'Syriac'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata'
@ -966,6 +1053,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Tamil' # - title: 'Tamil'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata'
@ -977,6 +1065,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Telugu' # - title: 'Telugu'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata'
@ -988,6 +1077,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Tajik' # - title: 'Tajik'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata'
@ -999,6 +1089,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Tagalog' # - title: 'Tagalog'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata'
@ -1010,6 +1101,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Thai' # - title: 'Thai'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata'
@ -1021,6 +1113,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Tigrinya' # - title: 'Tigrinya'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata'
@ -1032,6 +1125,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Turkish' # - title: 'Turkish'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata'
@ -1043,6 +1137,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Uighur; Uyghur' # - title: 'Uighur; Uyghur'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata'
@ -1054,6 +1149,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Ukrainian' # - title: 'Ukrainian'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata'
@ -1065,6 +1161,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Urdu' # - title: 'Urdu'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata'
@ -1076,6 +1173,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Uzbek' # - title: 'Uzbek'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata'
@ -1087,6 +1185,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Uzbek - Cyrillic' # - title: 'Uzbek - Cyrillic'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata'
@ -1098,6 +1197,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Vietnamese' # - title: 'Vietnamese'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata'
@ -1109,6 +1209,7 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'
# - title: 'Yiddish' # - title: 'Yiddish'
# description: '' # description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata'
@ -1120,3 +1221,4 @@
# compatible_service_versions: # compatible_service_versions:
# - '0.1.0' # - '0.1.0'
# - '0.1.1' # - '0.1.1'
# - '0.1.2'

View File

@ -45,7 +45,7 @@ def _create_build_corpus_service(corpus):
''' ## Constraints ## ''' ''' ## Constraints ## '''
constraints = ['node.role==worker'] constraints = ['node.role==worker']
''' ## Image ## ''' ''' ## Image ## '''
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1853' image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1879'
''' ## Labels ## ''' ''' ## Labels ## '''
labels = { labels = {
'origin': current_app.config['SERVER_NAME'], 'origin': current_app.config['SERVER_NAME'],
@ -139,7 +139,7 @@ def _create_cqpserver_container(corpus):
''' ## Entrypoint ## ''' ''' ## Entrypoint ## '''
entrypoint = ['bash', '-c'] entrypoint = ['bash', '-c']
''' ## Image ## ''' ''' ## Image ## '''
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1853' image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1879'
''' ## Name ## ''' ''' ## Name ## '''
name = f'cqpserver_{corpus.id}' name = f'cqpserver_{corpus.id}'
''' ## Network ## ''' ''' ## Network ## '''

View File

@ -953,7 +953,7 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
return self.user.hashid return self.user.hashid
@staticmethod @staticmethod
def insert_defaults(): def insert_defaults(force_download=False):
nopaque_user = User.query.filter_by(username='nopaque').first() nopaque_user = User.query.filter_by(username='nopaque').first()
defaults_file = os.path.join( defaults_file = os.path.join(
os.path.dirname(os.path.abspath(__file__)), os.path.dirname(os.path.abspath(__file__)),
@ -966,6 +966,7 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
if model is not None: if model is not None:
model.compatible_service_versions = m['compatible_service_versions'] model.compatible_service_versions = m['compatible_service_versions']
model.description = m['description'] model.description = m['description']
model.filename = f'{model.id}.traineddata'
model.publisher = m['publisher'] model.publisher = m['publisher']
model.publisher_url = m['publisher_url'] model.publisher_url = m['publisher_url']
model.publishing_url = m['publishing_url'] model.publishing_url = m['publishing_url']
@ -973,7 +974,7 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
model.is_public = True model.is_public = True
model.title = m['title'] model.title = m['title']
model.version = m['version'] model.version = m['version']
continue else:
model = TesseractOCRPipelineModel( model = TesseractOCRPipelineModel(
compatible_service_versions=m['compatible_service_versions'], compatible_service_versions=m['compatible_service_versions'],
description=m['description'], description=m['description'],
@ -990,6 +991,7 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
db.session.flush(objects=[model]) db.session.flush(objects=[model])
db.session.refresh(model) db.session.refresh(model)
model.filename = f'{model.id}.traineddata' model.filename = f'{model.id}.traineddata'
if not os.path.exists(model.path) or force_download:
r = requests.get(m['url'], stream=True) r = requests.get(m['url'], stream=True)
pbar = tqdm( pbar = tqdm(
desc=f'{model.title} ({model.filename})', desc=f'{model.title} ({model.filename})',
@ -1080,7 +1082,7 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
return self.user.hashid return self.user.hashid
@staticmethod @staticmethod
def insert_defaults(): def insert_defaults(force_download=False):
nopaque_user = User.query.filter_by(username='nopaque').first() nopaque_user = User.query.filter_by(username='nopaque').first()
defaults_file = os.path.join( defaults_file = os.path.join(
os.path.dirname(os.path.abspath(__file__)), os.path.dirname(os.path.abspath(__file__)),
@ -1093,6 +1095,7 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
if model is not None: if model is not None:
model.compatible_service_versions = m['compatible_service_versions'] model.compatible_service_versions = m['compatible_service_versions']
model.description = m['description'] model.description = m['description']
model.filename = m['url'].split('/')[-1]
model.publisher = m['publisher'] model.publisher = m['publisher']
model.publisher_url = m['publisher_url'] model.publisher_url = m['publisher_url']
model.publishing_url = m['publishing_url'] model.publishing_url = m['publishing_url']
@ -1101,10 +1104,11 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
model.title = m['title'] model.title = m['title']
model.version = m['version'] model.version = m['version']
model.pipeline_name = m['pipeline_name'] model.pipeline_name = m['pipeline_name']
continue else:
model = SpaCyNLPPipelineModel( model = SpaCyNLPPipelineModel(
compatible_service_versions=m['compatible_service_versions'], compatible_service_versions=m['compatible_service_versions'],
description=m['description'], description=m['description'],
filename=m['url'].split('/')[-1],
publisher=m['publisher'], publisher=m['publisher'],
publisher_url=m['publisher_url'], publisher_url=m['publisher_url'],
publishing_url=m['publishing_url'], publishing_url=m['publishing_url'],
@ -1118,7 +1122,7 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
db.session.add(model) db.session.add(model)
db.session.flush(objects=[model]) db.session.flush(objects=[model])
db.session.refresh(model) db.session.refresh(model)
model.filename = m['url'].split('/')[-1] if not os.path.exists(model.path) or force_download:
r = requests.get(m['url'], stream=True) r = requests.get(m['url'], stream=True)
pbar = tqdm( pbar = tqdm(
desc=f'{model.title} ({model.filename})', desc=f'{model.title} ({model.filename})',

View File

@ -10,7 +10,7 @@ file-setup-pipeline:
tesseract-ocr-pipeline: tesseract-ocr-pipeline:
name: 'Tesseract OCR Pipeline' name: 'Tesseract OCR Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF' publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.1' latest_version: '0.1.2'
versions: versions:
0.1.0: 0.1.0:
methods: methods:
@ -23,6 +23,12 @@ tesseract-ocr-pipeline:
- 'ocropus_nlbin_threshold' - 'ocropus_nlbin_threshold'
publishing_year: 2022 publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1' url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1'
0.1.2:
methods:
- 'binarization'
- 'ocropus_nlbin_threshold'
publishing_year: 2023
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2'
transkribus-htr-pipeline: transkribus-htr-pipeline:
name: 'Transkribus HTR Pipeline' name: 'Transkribus HTR Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF' publisher: 'Bielefeld University - CRC 1288 - INF'
@ -41,7 +47,7 @@ transkribus-htr-pipeline:
spacy-nlp-pipeline: spacy-nlp-pipeline:
name: 'SpaCy NLP Pipeline' name: 'SpaCy NLP Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF' publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.2' latest_version: '0.1.1'
versions: versions:
0.1.0: 0.1.0:
methods: methods:
@ -53,8 +59,3 @@ spacy-nlp-pipeline:
- 'encoding_detection' - 'encoding_detection'
publishing_year: 2022 publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.1' url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.1'
0.1.2:
methods:
- 'encoding_detection'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.2'

Binary file not shown.

Before

Width:  |  Height:  |  Size: 222 KiB

After

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 378 KiB

After

Width:  |  Height:  |  Size: 402 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 720 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 854 KiB

After

Width:  |  Height:  |  Size: 589 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 436 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 189 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 511 KiB

After

Width:  |  Height:  |  Size: 381 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1009 KiB

After

Width:  |  Height:  |  Size: 759 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 903 KiB

After

Width:  |  Height:  |  Size: 750 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 413 KiB

After

Width:  |  Height:  |  Size: 524 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

View File

@ -168,6 +168,14 @@ nopaque.App = class App {
let manualModalTocElement = document.querySelector('#manual-modal-toc'); let manualModalTocElement = document.querySelector('#manual-modal-toc');
let manualModalToc = M.Tabs.getInstance(manualModalTocElement); let manualModalToc = M.Tabs.getInstance(manualModalTocElement);
manualModalToc.select(modalTriggerElement.dataset.manualModalChapter); manualModalToc.select(modalTriggerElement.dataset.manualModalChapter);
// TODO: Make this work.
// if ('manualModalChapterAnchor' in modalTriggerElement.dataset) {
// let manualModalChapterAnchor = document.querySelector(`#${modalTriggerElement.dataset.manualModalChapterAnchor}`);
// let xCoord = manualModalChapterAnchor.getBoundingClientRect().left;
// let yCoord = manualModalChapterAnchor.getBoundingClientRect().top;
// let modalContentElement = modalElement.querySelector('.modal-content');
// modalContentElement.scroll(xCoord, yCoord);
// }
} }
} }
} }

View File

@ -7,13 +7,21 @@ nopaque.corpus_analysis.query_builder.QueryBuilder = class QueryBuilder {
this.addEventListenersToNAndMInputSubmit(); this.addEventListenersToNAndMInputSubmit();
this.elements.deleteQueryButton.addEventListener('click', () => {this.resetQueryInputField()}); this.elements.deleteQueryButton.addEventListener('click', () => {this.resetQueryInputField()});
this.expertModeQueryBuilderSwitchHandler(); this.expertModeQueryBuilderSwitchHandler();
this.extensions = { this.extensions = {
structuralAttributeBuilderFunctions: new nopaque.corpus_analysis.query_builder.StructuralAttributeBuilderFunctions(this), structuralAttributeBuilderFunctions: new nopaque.corpus_analysis.query_builder.StructuralAttributeBuilderFunctions(this),
tokenAttributeBuilderFunctions: new nopaque.corpus_analysis.query_builder.TokenAttributeBuilderFunctions(this), tokenAttributeBuilderFunctions: new nopaque.corpus_analysis.query_builder.TokenAttributeBuilderFunctions(this),
}; };
this.dropdown = M.Dropdown.init(
document.querySelector('.dropdown-trigger[data-toggle-area="token-incidence-modifiers"]'),
{
onCloseStart: () => {
this.unselectChipElement(this.elements.queryInputField.querySelector('.chip.teal'));
}
}
)
} }
addEventListenersToQueryElementTarget() { addEventListenersToQueryElementTarget() {
@ -108,7 +116,7 @@ nopaque.corpus_analysis.query_builder.QueryBuilder = class QueryBuilder {
prettyQueryText = nopaque.Utils.escape(prettyQueryText); prettyQueryText = nopaque.Utils.escape(prettyQueryText);
let queryChipElement = nopaque.Utils.HTMLToElement( let queryChipElement = nopaque.Utils.HTMLToElement(
` `
<span class="chip query-component" data-type="${dataType}" data-query="${queryText}" draggable="true" data-closing-tag="${isClosingTag}"> <span class="chip query-component" data-type="${dataType}" data-query="${queryText}" draggable="true">
${prettyQueryText}${isEditable ? '<i class="material-icons chip-action-button" data-chip-action="edit" style="padding-left:5px; font-size:18px; cursor:pointer;">edit</i>': ''} ${prettyQueryText}${isEditable ? '<i class="material-icons chip-action-button" data-chip-action="edit" style="padding-left:5px; font-size:18px; cursor:pointer;">edit</i>': ''}
${isClosingTag ? '' : '<i class="material-icons close chip-action-button" data-chip-action="delete">close</i>'} ${isClosingTag ? '' : '<i class="material-icons close chip-action-button" data-chip-action="delete">close</i>'}
</span> </span>
@ -152,8 +160,6 @@ nopaque.corpus_analysis.query_builder.QueryBuilder = class QueryBuilder {
this.deleteChipElement(queryChipElement); this.deleteChipElement(queryChipElement);
} else if (event.target.dataset.chipAction === 'edit') { } else if (event.target.dataset.chipAction === 'edit') {
this.editChipElement(queryChipElement); this.editChipElement(queryChipElement);
} else if (event.target.dataset.chipAction === 'lock') {
this.lockClosingChipElement(queryChipElement);
} }
}); });
}); });
@ -292,24 +298,36 @@ nopaque.corpus_analysis.query_builder.QueryBuilder = class QueryBuilder {
} }
selectChipElement(attr) { selectChipElement(attr) {
document.querySelectorAll('.chip.teal').forEach(element => { if (attr.classList.contains('teal')) {
if (element !== attr) { return;
element.classList.remove('teal', 'lighten-2');
this.toggleClass(['token-incidence-modifiers'], 'disabled', 'add');
} }
});
this.toggleClass(['token-incidence-modifiers'], 'disabled', 'toggle'); this.toggleClass(['token-incidence-modifiers'], 'disabled', 'toggle');
attr.classList.toggle('teal'); attr.classList.toggle('teal');
attr.classList.toggle('lighten-5'); attr.classList.toggle('lighten-5');
M.Dropdown.getInstance(document.querySelector('.dropdown-trigger[data-toggle-area="token-incidence-modifiers"]')).open();
} }
tokenIncidenceModifierHandler(incidenceModifier, incidenceModifierPretty) { unselectChipElement(attr) {
let nModalInstance = M.Modal.getInstance(document.querySelector('#corpus-analysis-concordance-exactly-n-token-modal'));
let nmModalInstance = M.Modal.getInstance(document.querySelector('#corpus-analysis-concordance-between-nm-token-modal'));
if (nModalInstance.isOpen || nmModalInstance.isOpen) {
return;
}
attr.classList.remove('teal', 'lighten-5');
this.toggleClass(['token-incidence-modifiers'], 'disabled', 'add');
}
tokenIncidenceModifierHandler(incidenceModifier, incidenceModifierPretty, nOrNM = false) {
// Adds a token incidence modifier to the query input field. // Adds a token incidence modifier to the query input field.
let selectedChip = this.elements.queryInputField.querySelector('.chip.teal'); let selectedChip = this.elements.queryInputField.querySelector('.chip.teal');
let selectedChipIndex = Array.from(this.elements.queryChipElements).indexOf(selectedChip); let selectedChipIndex = Array.from(this.elements.queryChipElements).indexOf(selectedChip);
if (nOrNM) {
this.unselectChipElement(selectedChip);
}
this.submitQueryChipElement('token-incidence-modifier', incidenceModifierPretty, incidenceModifier, selectedChipIndex); this.submitQueryChipElement('token-incidence-modifier', incidenceModifierPretty, incidenceModifier, selectedChipIndex);
this.selectChipElement(selectedChip);
} }
tokenNMSubmitHandler(modalId) { tokenNMSubmitHandler(modalId) {
@ -327,7 +345,7 @@ nopaque.corpus_analysis.query_builder.QueryBuilder = class QueryBuilder {
let instance = M.Modal.getInstance(modal); let instance = M.Modal.getInstance(modal);
instance.close(); instance.close();
this.tokenIncidenceModifierHandler(input, pretty_input); this.tokenIncidenceModifierHandler(input, pretty_input, true);
} }
expertModeQueryBuilderSwitchHandler() { expertModeQueryBuilderSwitchHandler() {
@ -368,15 +386,13 @@ nopaque.corpus_analysis.query_builder.QueryBuilder = class QueryBuilder {
this.resetQueryInputField(); this.resetQueryInputField();
let expertModeInputFieldValue = document.querySelector('#corpus-analysis-concordance-form-query').value; let expertModeInputFieldValue = document.querySelector('#corpus-analysis-concordance-form-query').value;
let chipElements = this.parseTextToChip(expertModeInputFieldValue); let chipElements = this.parseTextToChip(expertModeInputFieldValue);
let closingTagElements = ['end-sentence', 'end-entity'];
let editableElements = ['start-entity', 'token']; let editableElements = ['start-entity', 'token'];
for (let chipElement of chipElements) { for (let chipElement of chipElements) {
let isClosingTag = closingTagElements.includes(chipElement['type']);
let isEditable = editableElements.includes(chipElement['type']); let isEditable = editableElements.includes(chipElement['type']);
if (chipElement['query'] === '[]'){ if (chipElement['query'] === '[]'){
isEditable = false; isEditable = false;
} }
this.submitQueryChipElement(chipElement['type'], chipElement['pretty'], chipElement['query'], null, isClosingTag, isEditable); this.submitQueryChipElement(chipElement['type'], chipElement['pretty'], chipElement['query'], null, false, isEditable);
} }
} }

View File

@ -52,14 +52,14 @@ nopaque.corpus_analysis.query_builder.TokenAttributeBuilderFunctions = class Tok
let input = this.tokenInputCheck(this.elements.tokenBuilderContent); let input = this.tokenInputCheck(this.elements.tokenBuilderContent);
switch (elem) { switch (elem) {
case 'option-group': case 'option-group':
input.value += '(option1|option2)'; this.cursorPositionInputfieldHandler(input, '(option1|option2)');
let firstIndex = input.value.indexOf('option1'); let firstIndex = input.value.indexOf('option1');
let lastIndex = firstIndex + 'option1'.length; let lastIndex = firstIndex + 'option1'.length;
input.focus();
input.setSelectionRange(firstIndex, lastIndex); input.setSelectionRange(firstIndex, lastIndex);
break; break;
case 'wildcard-char': case 'wildcard-char':
input.value += '.'; this.cursorPositionInputfieldHandler(input, '.');
input.focus();
break; break;
case 'and': case 'and':
this.conditionHandler('and'); this.conditionHandler('and');
@ -73,9 +73,19 @@ nopaque.corpus_analysis.query_builder.TokenAttributeBuilderFunctions = class Tok
this.optionToggleHandler(); this.optionToggleHandler();
} }
cursorPositionInputfieldHandler(input, addedInput) {
let cursorPosition = input.selectionStart;
let textBeforeCursor = input.value.substring(0, cursorPosition);
let textAfterCursor = input.value.substring(cursorPosition);
let newInputValue = textBeforeCursor + addedInput + textAfterCursor;
input.value = newInputValue;
let newCursorPosition = cursorPosition + addedInput.length;
input.setSelectionRange(newCursorPosition, newCursorPosition);
}
characterIncidenceModifierHandler(elem) { characterIncidenceModifierHandler(elem) {
let input = this.tokenInputCheck(this.elements.tokenBuilderContent); let input = this.tokenInputCheck(this.elements.tokenBuilderContent);
input.value += elem.dataset.token; this.cursorPositionInputfieldHandler(input, elem.dataset.token);
} }
characterNMSubmitHandler(modalId) { characterNMSubmitHandler(modalId) {
@ -83,12 +93,12 @@ nopaque.corpus_analysis.query_builder.TokenAttributeBuilderFunctions = class Tok
let input_n = modal.querySelector('.n-m-input[data-value-type="n"]').value; let input_n = modal.querySelector('.n-m-input[data-value-type="n"]').value;
let input_m = modal.querySelector('.n-m-input[data-value-type="m"]') || undefined; let input_m = modal.querySelector('.n-m-input[data-value-type="m"]') || undefined;
input_m = input_m !== undefined ? ',' + input_m.value : ''; input_m = input_m !== undefined ? ',' + input_m.value : '';
let input = `${input_n}${input_m}`; let addedInput = `${input_n}${input_m}`;
let instance = M.Modal.getInstance(modal); let instance = M.Modal.getInstance(modal);
instance.close(); instance.close();
let tokenInput = this.tokenInputCheck(this.elements.tokenBuilderContent); let input = this.tokenInputCheck(this.elements.tokenBuilderContent);
tokenInput.value += '{' + input + '}'; this.cursorPositionInputfieldHandler(input, `{${addedInput}}`);
} }
conditionHandler(conditionText) { conditionHandler(conditionText) {

View File

@ -1,9 +1,34 @@
<h3 class="manual-chapter-title">Introduction</h3> <h3 class="manual-chapter-title">Introduction</h3>
<h4>Introduction</h4>
<p> <p>
nopaque is a web-based digital working environment. It implements a Nopaque is a web application that offers different services and tools to support
workflow based on the research process in the humanities and supports its researchers working with image and text-based data. These services are logically
users in processing their data in order to subsequently apply digital connected and build upon each other. They include:
analysis methods to them. All processes are implemented in a specially
provided cloud environment with established open source software. This
always ensures that no personal data of the users is disclosed.
</p> </p>
<ol style="list-style-type:disc; margin-left:2em; padding-bottom:0;">
<li><b>File setup</b>, which converts and merges different data (e.g., books, letters)
for further processing.</li>
<li><b>Image-to-text conversion tools:</b></li>
<ol style="list-style-type:circle; margin-left:1em; padding-bottom:0;"><li><b>Optical Character Recognition</b> converts photos and
scans into text data, making them machine-readable.</li>
<li><b>Transkribus HTR (Handwritten Text Recognition) Pipeline</b> (currently deactivated)*
also converts images into text data, making them machine-readable.</li>
</ol>
<li><b>Natural Language Processing</b> extracts information from your text via
computational linguistic data processing (tokenization, lemmatization, part-of-speech
tagging and named-entity recognition).</li>
<li><b>Corpus analysis</b> makes use of CQP Query Language to search through text
corpora with the aid of metadata and Natural Language Processing tags.</li>
</ol>
Nopaque also features a <b>Social Area</b>, where researchers can create a personal profile, connect with other users and share corpora if desired.
These services can be accessed from the sidebar in nopaque.
All processes are implemented in a specially provided cloud environment with established open-source software.
This always ensures that no personal data of the users is disclosed.
<p>
*Note: the Transkribus HTR Pipeline is currently
deactivated; we are working on an alternative solution. You can try using Tesseract OCR,
though the results will likely be poor.
</p>

View File

@ -0,0 +1,104 @@
<h3 class="manual-chapter-title">Getting Started</h3>
<h4>Getting Started</h4>
<p>
In this section, we will take you through all the steps you need to start analyzing your data with nopaque.
</p>
<div style="border: 1px solid; padding-left: 20px; margin-right: 400px; margin-bottom: 40px;">
<h5>Content</h5>
<ol style="list-style-type:disc">
<li><a href="#registration-and-login">Registration and login</a></li>
<li><a href="#preparing-files">Preparing files for analysis</a></li>
<li><a href="#converting-a-pdf-into-text">Converting a PDF into text data</a></li>
<li><a href="#extracting-linguistic-data">Extracting linguistic data from text</a></li>
<li><a href="#creating-a-corpus">Creating a corpus</a></li>
<li><a href="#analyzing-a-corpus">Analyzing a corpus</a></li>
</ol>
</div>
<p></p>
<h5 id="registration-and-login">Registration and login</h5>
<p>Before you can begin using nopaque, you will need to create a personal user account.
Open the menu (three dots) at the top right of the screen and choose “Register.” Enter
the required details listed on the registration page (username, password, email address).
After verifying your account via the link sent to your email, you can log in.</p>
<h5 id="preparing-files">Preparing files for analysis</h5>
<p>A few steps need to be taken before images, scans, or other text data are ready for
analysis in nopaque. The SpaCy NLP Pipeline service can only extract linguistic data
from texts in plain text (.txt) format. If your text is already in this format, you
can skip the next steps and go directly to <b>Extracting linguistic data from text</b>.
Otherwise, the next steps assume that you are starting off with image data.</p>
<p>
First, all data needs to be converted into PDF format. Using the <b>File Setup</b> service,
you can bundle images together, even if they are in different formats, and convert them all into
one PDF file. Note that the File Setup service will sort the images based on their file
name in ascending order. It is thus recommended to name them accordingly, for example:
page-01.png, page-02.jpg, page-03.tiff.
</p>
<p>
Add a title and description to your job and select the File Setup version* you want to use.
After uploading the images and completing the File Setup job, the list of files added
can be seen under “Inputs.” Further below, under “Results,” you can find and download
the PDF output.</p>
<h5 id="converting-a-pdf-into-text">Converting a PDF into text data</h5>
<p>Select an image-to-text conversion tool depending on whether your PDF is primarily
composed of handwritten text or printed text. For printed text, select the <b>Tesseract OCR
Pipeline</b>. For handwritten text, select the <b>Transkribus HTR Pipeline</b>. Select the desired
language model or upload your own. Select the version* of Tesseract OCR you want to use
and click on submit to start the conversion. When the job is finished, various output
files can be seen and downloaded further below, under “Results.” You may want to review
the text output for errors and coherence. (Note: the Transkribus HTR Pipeline is currently
deactivated; we are working on an alternative solution. You can try using Tesseract OCR,
though the results will likely be poor.)
</p>
<h5 id="extracting-linguistic-data">Extracting linguistic data from text</h5>
<p>The <b>SpaCy NLP Pipeline</b> service extracts linguistic information from plain text files
(in .txt format). Select the corresponding .txt file, the language model, and the
version* you want to use. When the job is finished, find and download the files in
<b>.json</b> and <b>.vrt</b> format under “Results.”</p>
<h5 id="creating-a-corpus">Creating a corpus</h5>
<p>Now, using the files in .vrt format, you can create a corpus. This can be done
in the <a href="{{ url_for('main.dashboard') }}">Dashboard</a> or
<a href="{{ url_for('services.corpus_analysis') }}">Corpus Analysis</a> sections under “My Corpora.” Click on “Create corpus”
and add a title and description for your corpus. After submitting, you will automatically
be taken to the corpus overview page (which can be called up again via the corpus lists)
of your new, still empty corpus. </p>
<p>
Further down in the “Corpus files” section, you can add texts in .vrt format
(results of the NLP service) to your new corpus. To do this, use the "Add Corpus File"
button and fill in the form that appears. Here, you can add
metadata to each text. After adding all texts to the corpus, it must
be prepared for analysis. This process can be initiated by clicking on the
"Build" button under "Actions".
On the corpus overview page, you can see information about the current status of
the corpus in the upper right corner. After the build process, the status "built" should be shown here.
Now, your corpus is ready for analysis.</p>
<h5 id="analyzing-a-corpus">Analyzing a corpus</h5>
<p>Navigate to the corpus you would like to analyze and click on the Analyze button.
This will take you to an analysis overview page for your corpus. Here, you can find a
visualization of general linguistic information of your corpus, including tokens,
sentences, unique words, unique lemmas, unique parts of speech and unique simple parts
of speech. You will also find a pie chart of the proportional textual makeup of your
corpus and can view the linguistic information for each individual text file. A more
detailed visualization of token frequencies with a search option is also on this page.</p>
<p>From the corpus analysis overview page, you can navigate to other analysis modules:
the <b>Query Builder</b> (under <b>Concordance</b>) and the <b>Reader</b>. With the Reader, you can read
your corpus texts tokenized with the associated linguistic information. The tokens can
be shown as lemmas, parts of speech, words, and can be displayed in different ways:
visually as plain text with the option of highlighted entities or as chips.</p>
<p>The <b>Concordance</b> module allows for more specific, query-oriented text analyses.
Here, you can filter out text parameters and structural attributes in different
combinations. This is explained in more detail in the Query Builder section of the
manual.</p>
<br>
<br>
*For all services, it is recommended to use the latest version unless you need a model
only available in an earlier version or are looking to reproduce data that was originally generated
using an older version.

View File

@ -1,18 +0,0 @@
<h3 class="manual-chapter-title">Registration and Log in</h3>
<div class="row">
<div class="col s12 m4">
<img alt="Registration and Log in" class="materialboxed responsive-img" src="{{ url_for('static', filename='images/manual/registration-and-log-in.png') }}">
</div>
<div class="col s12 m8">
<p>
Before you can start using the web platform, you need to create a user
account. This requires only a few details: just a user name, an e-mail
address and a password are needed. In order to register yourself, fill out
the form on the <a href="{{ url_for('auth.register') }}">registration page</a>. After successful registration, the
created account must be verified. To do this, follow the instructions
given in the automatically sent e-mail. Afterwards, you can log in as
usual with your username/email address and password in the log-in form
located next to the registration button.
</p>
</div>
</div>

View File

@ -1,15 +1,22 @@
<h3 class="manual-chapter-title">Dashboard</h3> <h3 class="manual-chapter-title">Dashboard</h3>
<h4>About the dashboard</h4>
<br>
<div class="row"> <div class="row">
<div class="col s12 m4"> <div class="col s12 m4">
<img alt="Dashboard" class="materialboxed responsive-img" src="{{ url_for('static', filename='images/manual/dashboard.png') }}"> <img alt="Dashboard" class="materialboxed responsive-img" src="{{ url_for('static', filename='images/manual/dashboard.png') }}">
</div> </div>
<div class="col s12 m8"> <div class="col s12 m8">
<p> <p>
The <a href="{{ url_for('main.dashboard') }}">dashboard</a> provides a central overview of all resources assigned to the The <a href="{{ url_for('main.dashboard') }}">dashboard</a> provides a central
user. These are <a href="{{ url_for('main.dashboard', _anchor='corpora') }}">corpora</a> and created <a href="{{ url_for('main.dashboard', _anchor='jobs') }}">jobs</a>. Corpora are freely composable overview of all user-specific resources.
annotated text collections and jobs are the initiated file processing These are <a href="{{ url_for('main.dashboard', _anchor='corpora') }}">corpora</a>,
procedures. Both the job and the corpus listings can be searched using created <a href="{{ url_for('main.dashboard', _anchor='jobs') }}">jobs</a>, and
the search field displayed above them. model <a href="{{ url_for('main.dashboard', _anchor='contributions') }}">contributions</a>.
A <b>corpus</b> is a freely composable annotated text collection.
A <b>job</b> is an initiated file processing procedure.
A <b>model</b> is a mathematical system for pattern recognition based on data examples that have been processed by AI. One can search for jobs as
well as corpus listings using the search field displayed above them on the dashboard.
Uploaded models can be found and edited by clicking on the corresponding service under <b>My Contributions</b>.
</p> </p>
</div> </div>
<div class="col s12">&nbsp;</div> <div class="col s12">&nbsp;</div>
@ -20,10 +27,10 @@
<p> <p>
A corpus is a collection of texts that can be analyzed using the A corpus is a collection of texts that can be analyzed using the
Corpus Analysis service. All texts must be in the verticalized text Corpus Analysis service. All texts must be in the verticalized text
file format, which can be obtained via the Natrual Language file format, which can be obtained via the Natural Language
Processing service. It contains, in addition to the actual text, Processing service. It contains, in addition to the text,
further annotations that are searchable in combination with optional further annotations that are searchable in combination with optional
addable metadata during your analysis. metadata that can be added during your analysis.
</p> </p>
</div> </div>
</div> </div>

View File

@ -1,52 +1,107 @@
<h3 class="manual-chapter-title">Services</h3> <h3 class="manual-chapter-title">Services</h3>
<h4>Services</h4>
<p>
In this section, we will describe the different services nopaque has to offer.
</p>
<div class="row"> <div class="row">
<div class="col s12 m4"> <div class="col s12 m4">
<img alt="Services" class="materialboxed responsive-img" src="{{ url_for('static', filename='images/manual/services.png') }}"> <img alt="Services" class="materialboxed responsive-img" src="{{ url_for('static', filename='images/manual/services.png') }}">
</div> </div>
<div class="col s12 m8"> <div class="col s12 m8">
<p> <p>
nopaque was designed from the ground up to be modular. This modularity Nopaque was designed to be modular. Its modules are implemented in
means that the offered workflow provides variable entry and exit points, self-contained <b>services</b>, each of which represents a step in the
so that different starting points and goals can be flexibly addressed. workflow. The typical workflow involves using services one after another,
Each of these modules are implemented in a self-contained service, each of consecutively.
which represents a step in the workflow. The services are coordinated in The typical workflow order can be taken from the listing of the
such a way that they can be used consecutively. The order can either be services in the left sidebar or from the nopaque manual (accessible via the pink
taken from the listing of the services in the left sidebar or from the button in the upper right corner).
roadmap (accessible via the pink compass in the upper right corner). All The services can also be applied at different starting and ending points,
services are versioned, so the data generated with nopaque is always which allows you to conduct your work flexibly.
All services are versioned, so the data generated with nopaque is always
reproducible. reproducible.
<p>For all services, it is recommended to use the latest version (selected
in the drop-down menu on the service page) unless you need a model
only available in an earlier version or are looking to reproduce data that was originally generated
using an older version.</p>
</p> </p>
</div> </div>
</div> </div>
<h4 class="manual-chapter-title">File Setup</h4>
<h4>File Setup</h4>
<p> <p>
The <a href="{{ url_for('services.file_setup_pipeline') }}">File Setup Service</a> bundles image data, such as scans and photos, The <a href="{{ url_for('services.file_setup_pipeline') }}">File Setup Service</a> bundles image data, such as scans and photos,
together in a handy PDF file. To use this service, use the job form to together in a handy PDF file. To use this service, use the job form to
select the images to be bundled, choose the desired service version, and select the images to be bundled, choose the desired service version, and
specify a title and description. Please note that the service sorts the specify a title and description.
images into the resulting PDF file based on the file names. So naming the Note that the File Setup service will sort the images based on their file name in
images correctly is of great importance. It has proven to be a good practice ascending order. It is thus important and highly recommended to name
to name the files according to the following scheme: them accordingly, for example:
page-01.png, page-02.jpg, page-03.tiff, etc. In general, you can assume page-01.png, page-02.jpg, page-03.tiff. Generally, you can assume
that the images will be sorted in the order in which the file explorer of that the images will be sorted in the order in which the file explorer of
your operating system lists them when you view the files in a folder your operating system lists them when you view the files in a folder
sorted in ascending order by file name. sorted in ascending order by file name.
</p> </p>
<h4>Optical Character Recognition (OCR)</h4> <h4>Optical Character Recognition (OCR)</h4>
<p>Comming soon...</p> <p>
The <a href="{{ url_for('services.tesseract_ocr_pipeline') }}">Tesseract OCR Pipeline</a>
converts image data - like photos and scans - into text data, making them machine-readable.
This step enables you to proceed with the computational analysis of your documents.
To use this service, use the job form to select the file you want to convert into text data.
Then, choose the language model and service version you would like to use. Enter a title and description for your file and then
submit your job. Once the job is finished, the results can be found and downloaded further below on the page, under
the section labeled "Results."
</p>
<h4>Handwritten Text Recognition (HTR)</h4> <h4>Handwritten Text Recognition (HTR)</h4>
<p>Comming soon...</p> <p>The Transkribus HTR Pipeline is currently
deactivated. We are working on an alternative solution. In the meantime, you can
try using Tesseract OCR, though the results will likely be poor.</p>
<h4>Natural Language Processing (NLP)</h4> <h4>Natural Language Processing (NLP)</h4>
<p>Comming soon...</p> <p>The <a href="{{ url_for('services.spacy_nlp_pipeline') }}">SpaCy NLP Pipeline</a> extracts
information from plain text files (.txt format) via computational linguistic data processing
(tokenization, lemmatization, part-of-speech tagging and named-entity recognition).
To use this service, select the .txt file that you want to extract this information from.
Then select the language model and the version you want to use. Once the job is finished, you can find and download the files in
<b>.json</b> and <b>.vrt</b> format under the section labeled “Results.”</p>
<h4>Corpus Analysis</h4> <h4>Corpus Analysis</h4>
<p> <p>
With the corpus analysis service, it is possible to create a text corpus With the <a href="{{ url_for('services.corpus_analysis') }}">Corpus Analysis</a>
and then explore it in an analysis session. The analysis session is realized service, it is possible to create a text corpus
and then explore it with analytical tools. The analysis session is realized
on the server side by the Open Corpus Workbench software, which enables on the server side by the Open Corpus Workbench software, which enables
efficient and complex searches with the help of the CQP Query Language. efficient and complex searches with the help of the CQP Query Language.</p>
<p>
To use this service, navigate to the corpus you would like to analyze and click on the Analyze button.
This will take you to an analysis overview page for your corpus. Here, you can find
a visualization of general linguistic information of your corpus, including tokens,
sentences, unique words, unique lemmas, unique parts of speech and unique simple
parts of speech. You will also find a pie chart of the proportional textual makeup
of your corpus and can view the linguistic information for each individual text file.
A more detailed visualization of token frequencies with a search option is also on
this page.
</p>
<p>
From the corpus analysis overview page, you can navigate to other analysis modules:
the Query Builder (under Concordance) and the Reader.
</p>
<p>
With the <b>Reader</b>, you can read your corpus texts tokenized with the associated linguistic information. The tokens
can be shown as lemmas, parts of speech, words, and can be displayed in different
ways: visually as plain text with the option of highlighted entities or as chips.
</p>
<p>
The Concordance module allows for more specific, query-oriented text analyses.
Here, you can filter out text parameters and structural attributes in different
combinations. This is explained in more detail in the <b>Query Builder</b> section of the
manual.
</p>
</p> </p>

View File

@ -7,7 +7,7 @@
<div class="col s12 m8"> <div class="col s12 m8">
<p> <p>
To <a href="{{ url_for('corpora.create_corpus') }}">create a corpus</a>, you To <a href="{{ url_for('corpora.create_corpus') }}">create a corpus</a>, you
can use the "New Corpus" button, which can be found on both, the Corpus can use the "New Corpus" button, which can be found on both the Corpus
Analysis Service page and the Dashboard below the corpus list. Fill in the input Analysis Service page and the Dashboard below the corpus list. Fill in the input
mask to Create a corpus. After you have completed the input mask, you will mask to Create a corpus. After you have completed the input mask, you will
be automatically taken to the corpus overview page (which can be called up be automatically taken to the corpus overview page (which can be called up
@ -43,5 +43,5 @@
the way of how a token is displayed, by using the text style switch. The the way of how a token is displayed, by using the text style switch. The
concordance module offers some more options regarding the context size of concordance module offers some more options regarding the context size of
search results. If the context does not provide enough information you can search results. If the context does not provide enough information you can
hop into the reader module by using the lupe icon next to a match. hop into the reader module by using the magnifier icon next to a match.
</p> </p>

View File

@ -1,5 +1,22 @@
<h3 class="manual-chapter-title">CQP Query Language</h3> <h3 class="manual-chapter-title">CQP Query Language</h3>
<p>Within the Corpus Query Language, a distinction is made between two types of annotations: positional attributes and structural attributes. Positional attributes refer to a token, e.g. the word "book" is assigned the part-of-speech tag "NN", the lemma "book" and the simplified part-of-speech tag "NOUN" within the token structure. Structural attributes refer to text structure-giving elements such as sentence and entity markup. For example, the markup of a sentence is represented in the background as follows:</p> <h4 id="cqp-query-language">CQP Query Language</h4>
<p>In this section, we will provide some functional explanations of the properties of the Corpus Query Language. This includes
the types of linguistic attributes one can work with and how to use them in your query.</p>
<div style="border: 1px solid; padding-left: 20px; margin-right: 400px; margin-bottom: 40px;">
<h5>Content</h5>
<ol style="list-style-type:disc">
<li><a href="#overview-annotations">Overview of annotation types</a></li>
<li><a href="#positional-attributes">Positional attributes</a></li>
<li><a href="#searching-positional-attributes">How to search for positional attributes</a></li>
<li><a href="#structural-attributes">Structural attributes</a></li>
<li><a href="#searching-structural-attributes">How to search for structural attributes</a></li>
</ol>
</div>
<h4 id="overview-annotations">Overview of annotation types</h4>
<p>Within the Corpus Query Language, a distinction is made between two types of annotations: <b>positional attributes</b> and <b>structural attributes</b>. Positional attributes refer to a token, e.g. the word "book" is assigned the part-of-speech tag "NN", the lemma "book" and the simplified part-of-speech tag "NOUN" within the token structure. Structural attributes refer to text structure-giving elements such as sentence and entity markup. For example, the markup of a sentence is represented in the background as follows:</p>
<pre> <pre>
<code> <code>
<span class="green-text">&lt;s&gt; structural attribute</span> <span class="green-text">&lt;s&gt; structural attribute</span>
@ -13,7 +30,7 @@
</code> </code>
</pre> </pre>
<h4>Positional attributes</h4> <h4 id="positional-attributes">Positional attributes</h4>
<p>Before you can start searching for positional attributes (also called tokens), it is necessary to know what properties they contain.</p> <p>Before you can start searching for positional attributes (also called tokens), it is necessary to know what properties they contain.</p>
<ol> <ol>
<li><span class="blue-text"><b>word</b></span>: The string as it is also found in the original text</li> <li><span class="blue-text"><b>word</b></span>: The string as it is also found in the original text</li>
@ -33,7 +50,7 @@
</li> </li>
</ol> </ol>
<h5>Searching for positional attributes</h5> <h5 id="searching-positional-attributes">How to search for positional attributes</h5>
<div> <div>
<p> <p>
<b>Token with no condition on any property (also called <span class="blue-text">wildcard token</span>)</b><br> <b>Token with no condition on any property (also called <span class="blue-text">wildcard token</span>)</b><br>
@ -118,7 +135,7 @@
<pre style="margin-top: 0;" ><code> ^ ^ the braces indicate the start and end of an option group</code></pre> <pre style="margin-top: 0;" ><code> ^ ^ the braces indicate the start and end of an option group</code></pre>
</div> </div>
<h4>Structural attributes</h4> <h4 id="structural-attributes">Structural attributes</h4>
<p>nopaque provides several structural attributes for query. A distinction is made between attributes with and without value.</p> <p>nopaque provides several structural attributes for query. A distinction is made between attributes with and without value.</p>
<ol> <ol>
<li><span class="green-text"><b>s</b></span>: Annotates a sentence</li> <li><span class="green-text"><b>s</b></span>: Annotates a sentence</li>
@ -153,7 +170,7 @@
</li> </li>
</ol> </ol>
<h5>Searching for structural attributes</h5> <h5 id="searching-structural-attributes">How to search for structural attributes</h5>
<pre><code>&lt;ent&gt; [] &lt;/ent&gt;; A one token long entity of any type</code></pre> <pre><code>&lt;ent&gt; [] &lt;/ent&gt;; A one token long entity of any type</code></pre>
<pre><code>&lt;ent_type="PERSON"&gt; [] &lt;/ent_type&gt;; A one token long entity of type PERSON</code></pre> <pre><code>&lt;ent_type="PERSON"&gt; [] &lt;/ent_type&gt;; A one token long entity of type PERSON</code></pre>
<pre><code>&lt;ent_type="PERSON"&gt; []* &lt;/ent_type&gt;; Entity of any length of type PERSON</code></pre> <pre><code>&lt;ent_type="PERSON"&gt; []* &lt;/ent_type&gt;; Entity of any length of type PERSON</code></pre>

View File

@ -1,26 +1,46 @@
<h3 class="manual-chapter-title">Query Builder Tutorial</h3> <h3 class="manual-chapter-title">Query Builder Tutorial</h3>
<h4>Query Builder</h4>
<p>The query builder helps you to make a query in the form of the Corpus Query <p>In this section, we will provide you with more detailed instructions on how to use the Query Builder -
Language (CQL) to your text. You can use the CQL to filter out various types of nopaque's main user-friendly tool for finding and analyzing different linguistic elements of your texts.</p>
text parameters, for example, a specific word, a lemma, or you can set part-of-speech
tags (pos) that indicate the type of word you are looking for (a noun, an
adjective, etc.). In addition, you can also search for structural attributes,
or specify your query for a token (word, lemma, pos) via entity typing. And of
course everything can be combined. You can find examples for different queries
under the tab "Examples".</p>
<p></p>
<br>
<div style="border: 1px solid; padding-left: 20px; margin-right: 400px; margin-bottom: 40px;"> <div style="border: 1px solid; padding-left: 20px; margin-right: 400px; margin-bottom: 40px;">
<h5>Content</h5> <h5>Content</h5>
<ol style="list-style-type:disc"> <ol style="list-style-type:disc">
<li><a href="#add-new-token-tutorial">Add new token to your query</a></li> <li><a href="#general-overview">General Overview</a></li>
<li><a href="#edit-options-tutorial">Options to edit your query</a></li> <li><a href="#add-new-token-tutorial">Add a new token to your query</a></li>
<li><a href="#add-structural-attribute-tutorial">Add structural Attributes to your query</a></li> <li><a href="#edit-options-tutorial">Options for editing your query</a></li>
<li><a href="#add-structural-attribute-tutorial">Add structural attributes to your query</a></li>
<li><a href="#general-options-query-builder">General options</a></li> <li><a href="#general-options-query-builder">General options</a></li>
</ol> </ol>
</div> </div>
<h4 id="general-overview">General Overview</h4>
<p>The Query Builder can be accessed via <a href="{{ url_for('main.dashboard') }}">My Corpora</a> or <a href="{{ url_for('services.corpus_analysis') }}">Corpus Analysis</a> in the sidebar options.
Click on the corpus you wish to analyze. You will be sent to its corpus overview page.
Here, click on <b>Analyze</b> to reach the analysis page.
The analysis page features different options for analyzing your corpus, including
visualizations and a <b>Reader</b> module. In this case, we want to open the query builder.
To do so, click on the <b>Concordance</b> button on the top of the page.</p>
<p>The query builder uses the <b>Corpus Query Language (CQL)</b> to help you make a query for analyzing your texts.
In this way, it is possible to filter out various types of text parameters, for
example, a specific word, a lemma, or you can set part-of-speech
tags (pos) that indicate the type of word you are looking for (a noun, an
adjective, etc.). In addition, you can also search for structural attributes,
or specify your query for a token (word, lemma, pos) via entity typing. And of
course, the different text parameters can be combined.</p>
<p>Tokens and structural attributes can be added by clicking on the <b>"+"</b> button
(what we call the "input marker") in the input field or the labeled buttons below it. Elements
added are shown as chips. These can be reorganized using drag and drop. The input
marker can also be moved in this way. Its position shows where new elements will be added. <br>
A "translation" of your query into Corpus Query Language (CQL) will be displayed underneath the query field.</p>
<p>For more information, see our <b>manual section for the Corpus Query Language.</b>
<br>
Advanced users can make direct use of CQL by switching to <b>expert mode</b> via the toggle button.
</p>
<p>The entire input field can be cleared using the red trash icon on the right.</p>
<br>
{# Add Token Tutorial #} {# Add Token Tutorial #}
<div> <div>
<hr> <hr>
@ -29,7 +49,7 @@ under the tab "Examples".</p>
<h4 id="add-new-token-tutorial">Add new token to your Query</h4> <h4 id="add-new-token-tutorial">Add new token to your Query</h4>
<p>If you are only looking for a specific token, you can click on the left <p>If you are only looking for a specific token, you can click on the left
button and select the type of token you are looking for from the drop-down menu. button and select the type of token you are looking for from the drop-down menu.
By default "Word" is selected. </p> "Word" is selected by default. </p>
<h5>Word and Lemma</h5> <h5>Word and Lemma</h5>
<p>If you want to search for a specific word or lemma and the respective <p>If you want to search for a specific word or lemma and the respective
@ -46,13 +66,13 @@ under the tab "Examples".</p>
"simple_pos" to search for different parts-of-speech. You can find an overview "simple_pos" to search for different parts-of-speech. You can find an overview
of all tags under the "Tagsets" tab.</p> of all tags under the "Tagsets" tab.</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/pos.gif') }}" alt="part-of-speech-tag explanation" width="100%;" style="margin-bottom:20px;"> <img src="{{ url_for('static', filename='images/manual/query_builder/pos.gif') }}" alt="part-of-speech-tag explanation" width="100%;" style="margin-bottom:20px;">
<br>
<h5>Empty Token</h5> <h5>Empty Token</h5>
<p>Here you can search for an empty token. This selection should never stand <p>Here you can search for a token with unspecified attributes (also called wildcard token). This
alone and should always be extended with an incidence modifier or stand in a selection should never stand alone and should always be extended with an incidence modifier or stand in a
larger query, because otherwise all possible tokens would be searched for and larger query, because otherwise all possible tokens would be searched for and
the program would crash.</p> the program would crash.</p>
<p></p>
<br> <br>
</div> </div>
@ -61,8 +81,8 @@ under the tab "Examples".</p>
<hr> <hr>
<p></p> <p></p>
<br> <br>
<h4 id="edit-options-tutorial">Options to edit your token</h4> <h4 id="edit-options-tutorial">Options for editing your query</h4>
<p>You have the possibility to extend or specify your searched token with <p>You have the possibility to extend or specify the token you are searching for with
certain factors. For this the query builder offers some fixed options. You can certain factors. For this the query builder offers some fixed options. You can
find more information about the options in the Corpus Query Language Tutorial.</p> find more information about the options in the Corpus Query Language Tutorial.</p>
<br> <br>
@ -76,7 +96,6 @@ under the tab "Examples".</p>
variants are not limited, so you can manually enter more options in the same variants are not limited, so you can manually enter more options in the same
format. "Option1" and "option2" must be replaced accordingly. </p> format. "Option1" and "option2" must be replaced accordingly. </p>
<img src="{{ url_for('static', filename='images/manual/query_builder/option_group.gif') }}" alt="option group explanation" width="100%;" style="margin-bottom:20px;"> <img src="{{ url_for('static', filename='images/manual/query_builder/option_group.gif') }}" alt="option group explanation" width="100%;" style="margin-bottom:20px;">
<p></p>
<br> <br>
<h5>Incidence Modifiers</h5> <h5>Incidence Modifiers</h5>
@ -85,7 +104,7 @@ under the tab "Examples".</p>
not at all or once: <br> not at all or once: <br>
[word = "is"] [word="it"] [word="your"] [word="little"]? [word = "dog"] <br> [word = "is"] [word="it"] [word="your"] [word="little"]? [word = "dog"] <br>
Here the word "little" should occur either once or not at all. With Here the word "little" should occur either once or not at all. With
[word="dogs?"] the search is for "dog" or "dogs". </p> [word="dogs?"] the search is for "dog" or "dogs".</p>
<br> <br>
<h5>Ignore Case</h5> <h5>Ignore Case</h5>
@ -101,7 +120,10 @@ under the tab "Examples".</p>
this case. For this you can simply string them together: <br> this case. For this you can simply string them together: <br>
[word="I"] [word="will" & simple_pos="VERB"] [word="go"].</p> [word="I"] [word="will" & simple_pos="VERB"] [word="go"].</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/or_and.gif') }}" alt="OR/AND explanation" width="100%;" style="margin-bottom:20px;"> <img src="{{ url_for('static', filename='images/manual/query_builder/or_and.gif') }}" alt="OR/AND explanation" width="100%;" style="margin-bottom:20px;">
<p></p> <p>Tokens that have already been added can also be modified by clicking on the corresponding
pen icon. Click on the "ignore case" box, for example, and the query builder will
not differentiate between upper- and lower-case letters for that token.
New conditions added apply to the most recent token information.</p>
<br> <br>
</div> </div>
@ -120,26 +142,33 @@ under the tab "Examples".</p>
This search can of course be specified if you search for particular tokens or This search can of course be specified if you search for particular tokens or
entities between the sentence tags (&lt;s&gt;&lt;/s&gt;). For example, you can search for entities between the sentence tags (&lt;s&gt;&lt;/s&gt;). For example, you can search for
sentences that contain only a noun, verb, and adjective. <br> sentences that contain only a noun, verb, and adjective. <br>
After clicking on Sentence you will see a <div class="chip" style="background-color:#FD9720;">Sentence Start</div>. Click on Sentence to add the sentence chips: <div class="chip" style="background-color:#FD9720;">Sentence Start</div>
When you are done with your query or the content and <div class="chip" style="background-color:#FD9720;">Sentence End</div>.
between the Sentence tags, you have to click the Sentence button one more time These mark where the sentence starts and ends. Use drag-and-drop to place them accordingly. When
to close it. The corresponding button is called the Sentence attribute is added, the input marker will automatically be
<div class="chip" style="background-color:#FD9720;">Sentence End</div>.<br> moved between the sentence chips. Use drag-and-drop as needed to continue your query
at a different position.
<br> <br>
<h5>Entities</h5> <h5>Entities</h5>
<p>With entities, i.e. units of meaning, you search for text sections that <p>With entities, i.e. units of meaning, you can search for text sections that
follow a certain code. For example, persons, dates, certain events. You can contain more specific information, for example, persons, dates, or events. The
select the codes using the drop-down menus. You can find an explanation of codes for these categories can be selected using the drop-down menus. You can find an explanation of
the respective abbreviations under the tab "Tagsets". <br> these abbreviations under the tab "Tagsets". <br>
You can also search for unspecified entities by selecting "Add entity of any type".</p> You can also search for unspecified entities by selecting "Add entity of any type".</p>
To close the entity query you started, you have to click the entity button one more time. This will make the <div class="chip" style="background-color:#A6E22D;">Entity End</div> element appear in your query. Click on the Entity button to add the entity chips <div class="chip" style="background-color:#A6E22D;">Entity Type=</div> and <div class="chip" style="background-color:#A6E22D;">Entity End</div>.
<p>The entity type can be changed by clicking on the pen symbol on the chip. When
the Entity attribute is added, the input marker will automatically be
moved between the entity chips. Use drag-and-drop as needed to continue your query
at a different position.</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/entity.gif') }}" alt="entity explanation" width="100%;" style="margin-bottom:20px;"> <img src="{{ url_for('static', filename='images/manual/query_builder/entity.gif') }}" alt="entity explanation" width="100%;" style="margin-bottom:20px;">
<p></p> <p></p>
<br> <br>
<h5>Meta Data</h5> <h5>Meta Data (currently unavailable)</h5>
<p>With the meta data you can annotate your text and add specific conditions. <p>The meta data function is being worked on and cannot currently be used!
<br>
With the meta data you can annotate your text and add specific conditions.
You can select a category on the left and enter your desired value on the right. You can select a category on the left and enter your desired value on the right.
The selected metadata will apply to your entire request and will be added at the end.</p> The selected metadata will apply to your entire request and will be added at the end.</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/meta_data.gif') }}" alt="meta data explanation" width="100%;" style="margin-bottom:20px;"> <img src="{{ url_for('static', filename='images/manual/query_builder/meta_data.gif') }}" alt="meta data explanation" width="100%;" style="margin-bottom:20px;">
@ -155,14 +184,39 @@ under the tab "Examples".</p>
<br> <br>
<h4 id="general-options-query-builder">General Options of the query builder</h4> <h4 id="general-options-query-builder">General Options of the query builder</h4>
<p>You have several options to edit your query after adding it to the preview.</p> <p>You have several options to edit your query after adding it to the preview.</p>
<br>
<h5>Editing the elements</h5>
<p>You can edit your query chips by clicking on the pen icon.</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/editing_chips.gif') }}" alt="editing explanation" width="100%;" style="margin-bottom:20px;">
<br>
<h5>Deleting the elements</h5> <h5>Deleting the elements</h5>
<p>You can delete the added elements from the query by clicking the X behind the respective content.</p> <p>You can delete the added elements from the query by clicking the X behind the respective content.</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/delete.gif') }}" alt="delete explanation" width="100%;" style="margin-bottom:20px;"> <img src="{{ url_for('static', filename='images/manual/query_builder/delete.gif') }}" alt="delete explanation" width="100%;" style="margin-bottom:20px;">
<br>
<h5>Move the elements of your query</h5> <h5>Move the elements of your query</h5>
<p>You can drag and drop elements to customize your query.</p> <p>You can drag and drop elements to customize your query.</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/drag_and_drop.gif') }}" alt="Drag&Drop explanation" width="100%;" style="margin-bottom:20px;"> <img src="{{ url_for('static', filename='images/manual/query_builder/drag_and_drop.gif') }}" alt="Drag&Drop explanation" width="100%;" style="margin-bottom:20px;">
<br>
<h5>Setting an incidence modifier</h5>
<p>With the incidence modifier option, you can specify the number of
times a token should appear in your query. This is particularly relevant for empty
tokens (tokens with unspecified attributes). Click on a token (blue chip) and
select the desired option from the list to add an incidence modifier. To
close the list without adding anything, click on the token again.</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/incidence_modifier.gif') }}" alt="incidence modifier explanation" width="100%;" style="margin-bottom:20px;">
<br>
<h5>Switching between Query Builder and Expert mode</h5>
<p>To work with the plain Corpus Query Language instead of using the Query Builder, click on the "expert mode"
switch. Your query can be entered into the input field. All elements previously added will be carried over
into expert mode. Click on the switch again to switch back to the Query Builder if desired. All recognized elements
will be parsed into chips; those not recognized will be deleted from the query.</p>
<img src="{{ url_for('static', filename='images/manual/query_builder/expert_mode.gif') }}" alt="expert mode explanation" width="100%;" style="margin-bottom:20px;">
</div> </div>

View File

@ -3,21 +3,22 @@
<h2>Manual</h2> <h2>Manual</h2>
<ul class="tabs" id="manual-modal-toc"> <ul class="tabs" id="manual-modal-toc">
<li class="tab"><a href="#manual-modal-introduction">Introduction</a></li> <li class="tab"><a href="#manual-modal-introduction">Introduction</a></li>
<li class="tab"><a href="#manual-modal-registration-and-log-in">Registration and Log in</a></li> <li class="tab"><a href="#manual-modal-getting-started">Getting Started</a></li>
<li class="tab"><a href="#manual-modal-dashboard">Dashboard</a></li> <li class="tab"><a href="#manual-modal-dashboard">Dashboard</a></li>
<li class="tab"><a href="#manual-modal-services">Services</a></li> <li class="tab"><a href="#manual-modal-services">Services</a></li>
<li class="tab"><a href="#manual-modal-a-closer-look-at-the-corpus-analysis">A closer look at the Corpus Analysis</a></li> <!-- <li class="tab"><a href="#manual-modal-a-closer-look-at-the-corpus-analysis">A closer look at the Corpus Analysis</a></li> -->
<li class="tab"><a href="#manual-modal-cqp-query-language">CQP Query Language</a></li>
<li class="tab"><a href="#manual-modal-query-builder">Query Builder</a></li> <li class="tab"><a href="#manual-modal-query-builder">Query Builder</a></li>
<li class="tab"><a href="#manual-modal-cqp-query-language">CQP Query Language</a></li>
<li class="tab"><a href="#manual-modal-tagsets">Tagsets</a></li> <li class="tab"><a href="#manual-modal-tagsets">Tagsets</a></li>
</ul> </ul>
<div id="manual-modal-introduction"> <div id="manual-modal-introduction">
<br> <br>
{% include "_base/_modals/_manual/01_introduction.html.j2" %} {% include "_base/_modals/_manual/01_introduction.html.j2" %}
</div> </div>
<div id="manual-modal-registration-and-log-in"> <div id="manual-modal-getting-started">
<br> <br>
{% include "_base/_modals/_manual/02_registration_and_log_in.html.j2" %} {% include "_base/_modals/_manual/02_getting_started.html.j2" %}
</div> </div>
<div id="manual-modal-dashboard"> <div id="manual-modal-dashboard">
<br> <br>
@ -27,10 +28,10 @@
<br> <br>
{% include "_base/_modals/_manual/06_services.html.j2" %} {% include "_base/_modals/_manual/06_services.html.j2" %}
</div> </div>
<div id="manual-modal-a-closer-look-at-the-corpus-analysis"> <!-- <div id="manual-modal-a-closer-look-at-the-corpus-analysis">
<br> <br>
{% include "_base/_modals/_manual/07_a_closer_look_at_the_corpus_analysis.html.j2" %} {% include "_base/_modals/_manual/07_a_closer_look_at_the_corpus_analysis.html.j2" %}
</div> </div> -->
<div id="manual-modal-cqp-query-language"> <div id="manual-modal-cqp-query-language">
<br> <br>
{% include "_base/_modals/_manual/08_cqp_query_language.html.j2" %} {% include "_base/_modals/_manual/08_cqp_query_language.html.j2" %}

View File

@ -27,7 +27,10 @@
<div class="col s12 m3"> <div class="col s12 m3">
<span>© 2020 Bielefeld University</span> <span>© 2020 Bielefeld University</span>
</div> </div>
<div class="col s12 m9 right-align"> <div class="col s12 m2">
<span class="right"><b>Version {{ config.NOPAQUE_VERSION }}</b></span>
</div>
<div class="col s12 m7 right-align">
<a class="btn-small primary-variant-color waves-effect waves-light" href="{{ url_for('main.faq') }}"><i class="left material-icons">info_outline</i>Frequently Asked Questions</a> <a class="btn-small primary-variant-color waves-effect waves-light" href="{{ url_for('main.faq') }}"><i class="left material-icons">info_outline</i>Frequently Asked Questions</a>
<a class="btn-small primary-variant-color waves-effect waves-light" href="mailto:{{ config.NOPAQUE_SERVICE_DESK }}"><i class="left material-icons">mail</i>Report an issue</a> <a class="btn-small primary-variant-color waves-effect waves-light" href="mailto:{{ config.NOPAQUE_SERVICE_DESK }}"><i class="left material-icons">mail</i>Report an issue</a>
<a class="btn-small primary-variant-color waves-effect waves-light" href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque" target="_blank"><i class="left material-icons">code</i>GitLab</a> <a class="btn-small primary-variant-color waves-effect waves-light" href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque" target="_blank"><i class="left material-icons">code</i>GitLab</a>

View File

@ -5,7 +5,6 @@
<a href="#" data-target="sidenav" class="sidenav-trigger"><i class="material-icons">menu</i></a> <a href="#" data-target="sidenav" class="sidenav-trigger"><i class="material-icons">menu</i></a>
{% endif %} {% endif %}
<a href="{{ url_for('main.index') }}" class="brand-logo" style="height: 100%; overflow: hidden;"> <a href="{{ url_for('main.index') }}" class="brand-logo" style="height: 100%; overflow: hidden;">
<img class="hide-on-small-only" src="{{ url_for('static', filename='images/nopaque_-_logo_name_slogan.svg') }}" style="height: 128px; margin-top: -32px; margin-left: -32px;">
<img class="hide-on-med-and-up" src="{{ url_for('static', filename='images/nopaque_-_logo.svg') }}" style="height: 128px; margin-top: -32px; margin-left: -32px;"> <img class="hide-on-med-and-up" src="{{ url_for('static', filename='images/nopaque_-_logo.svg') }}" style="height: 128px; margin-top: -32px; margin-left: -32px;">
</a> </a>
<ul class="right hide-on-med-and-down"> <ul class="right hide-on-med-and-down">
@ -31,14 +30,14 @@
{% endif %} {% endif %}
{%- endfor -%} {%- endfor -%}
</ul> </ul>
<a class="btn-floating btn-large halfway-fab modal-trigger pink tooltipped waves-effect waves-light" data-tooltip="Manual" href="#manual-modal"><i class="material-icons">help</i></a> <a class="btn-floating btn-large halfway-fab modal-trigger pink tooltipped waves-effect waves-light" data-tooltip="Manual" href="#manual-modal"><i class="material-icons">school</i></a>
</div> </div>
</nav> </nav>
</div> </div>
<ul class="dropdown-content" id="nav-more-dropdown"> <ul class="dropdown-content" id="nav-more-dropdown">
{# <li><a href="{{ url_for('main.user_manual') }}"><i class="material-icons left">help</i>Manual</a></li> #}
{% if current_user.is_authenticated %} {% if current_user.is_authenticated %}
<li><a href="{{ url_for('users.user', user_id=current_user.id) }}"><i class="material-icons left">person</i>My Profile</a></li>
<li><a href="{{ url_for('settings.settings') }}"><i class="material-icons left">settings</i>Settings</a></li> <li><a href="{{ url_for('settings.settings') }}"><i class="material-icons left">settings</i>Settings</a></li>
<li class="divider" tabindex="-1"></li> <li class="divider" tabindex="-1"></li>
<li><a href="{{ url_for('auth.logout') }}">Log out</a></li> <li><a href="{{ url_for('auth.logout') }}">Log out</a></li>

View File

@ -1,19 +1,18 @@
<ul class="sidenav sidenav-fixed" id="sidenav"> <ul class="sidenav sidenav-fixed" id="sidenav">
<li> <li class="primary-color hide-on-small-only">
<div class="user-view"> <div style="overflow: hidden; height: 64px; width: 250px;">
<div class="background primary-color"></div> <a href="{{ url_for('main.index') }}">
<img class="hide-on-small-only" src="{{ url_for('static', filename='images/nopaque_-_logo_name_slogan.svg') }}" style="height: 128px; margin-top: -32px;">
</a>
</div> </div>
</li> </li>
{# <li class="primary-color"> <li class="primary-variant-color center-align hide-on-small-only" style="padding-top: 8px; height:48px;">
<div style="overflow: hidden;height: 64px; width: 250px;"> <img src="{{ url_for('static', filename='images/nopaque_slogan_transparent.png') }}" style="width:85%">
<img class="hide-on-small-only" src="{{ url_for('static', filename='images/nopaque_-_logo_name_slogan.svg') }}" style="height: 128px; margin-top: -32px; margin-left: ;"> </li>
</div> <li class="hide-on-med-and-up"><a class="waves-effect" href="{{ url_for('main.index') }}"><i class="material-icons left">home</i>nopaque</a></li>
</li> #}
<li><a href="{{ url_for('main.index') }}"><i class="material-icons left">home</i>nopaque</a></li>
<li> <li>
<a class="waves-effect" href="{{ url_for('main.news') }}"><i class="material-icons left">email</i>News</a> <a class="waves-effect" href="{{ url_for('main.news') }}"><i class="material-icons left">email</i>News</a>
</li> </li>
{# <li><a href="{{ url_for('main.user_manual') }}"><i class="material-icons">help</i>Manual</a></li> #}
<li> <li>
<a class="waves-effect" href="{{ url_for('main.dashboard') }}"><i class="material-icons">dashboard</i>Dashboard</a> <a class="waves-effect" href="{{ url_for('main.dashboard') }}"><i class="material-icons">dashboard</i>Dashboard</a>
<ul> <ul>
@ -51,8 +50,9 @@
<li> <li>
<a class="waves-effect" href="{{ url_for('main.social_area') }}"><i class="material-icons">rocket_launch</i>Social Area</a> <a class="waves-effect" href="{{ url_for('main.social_area') }}"><i class="material-icons">rocket_launch</i>Social Area</a>
<ul> <ul>
<li><a href="{{ url_for('users.user', user_id=current_user.id) }}" style="padding-left: 47px;"><i class="material-icons left">person</i>My Profile</a></li>
<li> <li>
<a class="waves-effect" href="{{ url_for('main.social_area', _anchor='public-users') }}" style="padding-left: 47px;"><i class="material-icons">person</i>Public Users</a> <a class="waves-effect" href="{{ url_for('main.social_area', _anchor='public-users') }}" style="padding-left: 47px;"><i class="material-icons">group</i>Public Users</a>
</li> </li>
<li> <li>
<a class="waves-effect" href="{{ url_for('main.social_area', _anchor='public-corpora') }}" style="padding-left: 47px;"><i class="nopaque-icons">I</i>Public Corpora</a> <a class="waves-effect" href="{{ url_for('main.social_area', _anchor='public-corpora') }}" style="padding-left: 47px;"><i class="nopaque-icons">I</i>Public Corpora</a>

View File

@ -40,6 +40,7 @@
<a class="btn-small waves-effect waves-light tooltipped modal-trigger" href="#corpus-analysis-concordance-positional-attr-modal" data-position="bottom" data-tooltip="Search for any token, for example a word, a lemma or a part-of-speech tag">Add new token to your query</a> <a class="btn-small waves-effect waves-light tooltipped modal-trigger" href="#corpus-analysis-concordance-positional-attr-modal" data-position="bottom" data-tooltip="Search for any token, for example a word, a lemma or a part-of-speech tag">Add new token to your query</a>
<a class="btn-small waves-effect waves-light tooltipped modal-trigger" href="#corpus-analysis-concordance-structural-attr-modal" data-position="bottom" data-tooltip="Structure your query with structural attributes, for example sentences, entities or annotate the text">Add structural attributes to your query</a> <a class="btn-small waves-effect waves-light tooltipped modal-trigger" href="#corpus-analysis-concordance-structural-attr-modal" data-position="bottom" data-tooltip="Structure your query with structural attributes, for example sentences, entities or annotate the text">Add structural attributes to your query</a>
<a class="btn-small waves-effect waves-light tooltipped dropdown-trigger disabled" data-target="corpus-analysis-concordance-token-incidence-modifiers-dropdown" data-toggle-area="token-incidence-modifiers" data-position="top" data-tooltip="Incidence Modifiers are special characters or patterns, <br>which determine how often a character represented previously should occur.">incidence modifiers</a> <a class="btn-small waves-effect waves-light tooltipped dropdown-trigger disabled" data-target="corpus-analysis-concordance-token-incidence-modifiers-dropdown" data-toggle-area="token-incidence-modifiers" data-position="top" data-tooltip="Incidence Modifiers are special characters or patterns, <br>which determine how often a character represented previously should occur.">incidence modifiers</a>
<a class="modal-trigger" data-manual-modal-chapter="manual-modal-query-builder" href="#manual-modal"><i class="material-icons left" style="color:black">help_outline</i></a>
</div> </div>
</div> </div>
<div class="row"> <div class="row">
@ -66,7 +67,7 @@
<div id="corpus-analysis-concordance-structural-attr-modal" class="modal"> <div id="corpus-analysis-concordance-structural-attr-modal" class="modal">
<div class="modal-content"> <div class="modal-content">
<div class="attr-modal-header"> <div class="attr-modal-header">
<h5>Which structural attribute do you want to add to your query?<a class="modal-trigger" data-manual-modal-chapter="manual-modal-query-builder" href="#manual-modal"><i class="material-icons left" id="corpus-analysis-concordance-add-structural-attribute-tutorial-info-icon">help_outline</i></a></h5> <h5>Which structural attribute do you want to add to your query?<a class="modal-trigger" data-manual-modal-chapter="manual-modal-query-builder" data-manual-modal-chapter-anchor="add-structural-attribute-tutorial" href="#manual-modal"><i class="material-icons left" id="corpus-analysis-concordance-add-structural-attribute-tutorial-info-icon">help_outline</i></a></h5>
</div> </div>
<p></p> <p></p>
<br> <br>
@ -127,7 +128,7 @@
<div class="row attr-modal-header"> <div class="row attr-modal-header">
<p></p> <p></p>
<div class="col s12"> <div class="col s12">
<h5>Which kind of token are you looking for? <a class="modal-trigger" data-manual-modal-chapter="manual-modal-query-builder" href="#manual-modal"><i class="material-icons left" id="corpus-analysis-concordance-token-tutorial-info-icon">help_outline</i></a></h5> <h5>Which kind of token are you looking for? <a class="modal-trigger" data-manual-modal-chapter="manual-modal-query-builder" data-manual-modal-chapter-anchor="add-new-token-tutorial" href="#manual-modal"><i class="material-icons left" id="corpus-analysis-concordance-token-tutorial-info-icon">help_outline</i></a></h5>
</div> </div>
<div class="input-field col s3" style="margin-left:42px;"> <div class="input-field col s3" style="margin-left:42px;">
<select id="corpus-analysis-concordance-positional-attr-selection"> <select id="corpus-analysis-concordance-positional-attr-selection">
@@ -336,7 +337,7 @@
</div> </div>
<div id="corpus-analysis-concordance-token-edit-options" data-toggle-area="input-field-options"> <div id="corpus-analysis-concordance-token-edit-options" data-toggle-area="input-field-options">
<div class="row"> <div class="row">
<h6>Options to edit your token: <a class="modal-trigger" data-manual-modal-chapter="manual-modal-query-builder" href="#manual-modal"><i class="material-icons left" id="corpus-analysis-concordance-edit-options-tutorial-info-icon">help_outline</i></a></h6> <h6>Options to edit your token: <a class="modal-trigger" data-manual-modal-chapter="manual-modal-query-builder" href="#manual-modal" data-manual-modal-chapter-anchor="edit-options-tutorial"><i class="material-icons left" id="corpus-analysis-concordance-edit-options-tutorial-info-icon">help_outline</i></a></h6>
</div> </div>
<p></p> <p></p>
<div class="row"> <div class="row">
@@ -409,9 +410,3 @@
</div> </div>
</div> </div>
{% endmacro %} {% endmacro %}
{# {% macro scripts(id_prefix) %}
<script>
const concordanceQueryBuilder = new ConcordanceQueryBuilder();
</script>
{% endmacro %} #}

View File

@@ -153,16 +153,16 @@
let deleteJobRequestElement = document.querySelector('#delete-job-request'); let deleteJobRequestElement = document.querySelector('#delete-job-request');
let restartJobRequestElement = document.querySelector('#restart-job-request'); let restartJobRequestElement = document.querySelector('#restart-job-request');
deleteJobRequestElement.addEventListener('click', (event) => { deleteJobRequestElement.addEventListener('click', (event) => {
requests.jobs.entity.delete({{ job.hashid|tojson }}); nopaque.requests.jobs.entity.delete({{ job.hashid|tojson }});
}); });
restartJobRequestElement.addEventListener('click', (event) => { restartJobRequestElement.addEventListener('click', (event) => {
requests.jobs.entity.restart({{ job.hashid|tojson }}); nopaque.requests.jobs.entity.restart({{ job.hashid|tojson }});
}); });
if ({{ current_user.is_administrator()|tojson }}) { if ({{ current_user.is_administrator()|tojson }}) {
let jobLogButtonElement = document.querySelector('#job-log-button'); let jobLogButtonElement = document.querySelector('#job-log-button');
jobLogButtonElement.addEventListener('click', (event) => { jobLogButtonElement.addEventListener('click', (event) => {
requests.jobs.entity.log({{ job.hashid|tojson }}) nopaque.requests.jobs.entity.log({{ job.hashid|tojson }})
.then( .then(
(response) => { (response) => {
response.json() response.json()

View File

@@ -115,6 +115,8 @@ class Config:
NOPAQUE_READCOOP_USERNAME = os.environ.get('NOPAQUE_READCOOP_USERNAME') NOPAQUE_READCOOP_USERNAME = os.environ.get('NOPAQUE_READCOOP_USERNAME')
NOPAQUE_READCOOP_PASSWORD = os.environ.get('NOPAQUE_READCOOP_PASSWORD') NOPAQUE_READCOOP_PASSWORD = os.environ.get('NOPAQUE_READCOOP_PASSWORD')
NOPAQUE_VERSION='1.0.0'
@staticmethod @staticmethod
def init_app(app: Flask): def init_app(app: Flask):
# Set up logging according to the corresponding (NOPAQUE_LOG_*) # Set up logging according to the corresponding (NOPAQUE_LOG_*)