mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-07-01 10:20:34 +00:00

Compare commits: 1.0.0 ... access-pip (17 commits)
Commits (SHA1):
2c709e65d0, 71c0ddf515, 5c395d1e06, 82d6f6003f, 9da74c1c6f, ec23bd94ee,
55a62053b0, a1e5bd61e0, cf8c164d60, 05ab204e5a, 9f188afd16, dc77ac7b76,
84276af322, d9d4067536, ba65cf5911, 69a1edc51e, 32ad8c7359
Dockerfile (27 changes)
@@ -4,11 +4,13 @@ FROM python:3.10.13-slim-bookworm

 LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"

+# Set environment variables
 ENV LANG="C.UTF-8"
 ENV PYTHONDONTWRITEBYTECODE="1"
 ENV PYTHONUNBUFFERED="1"

+# Install system dependencies
 RUN apt-get update \
  && apt-get install --no-install-recommends --yes \
       build-essential \
@@ -17,37 +19,42 @@ RUN apt-get update \
  && rm --recursive /var/lib/apt/lists/*

 # Create a non-root user
 RUN useradd --create-home --no-log-init nopaque \
  && groupadd docker \
  && usermod --append --groups docker nopaque

 USER nopaque
 WORKDIR /home/nopaque

 # Create a Python virtual environment
 ENV NOPAQUE_PYTHON3_VENV_PATH="/home/nopaque/.venv"
 RUN python3 -m venv "${NOPAQUE_PYTHON3_VENV_PATH}"
 ENV PATH="${NOPAQUE_PYTHON3_VENV_PATH}/bin:${PATH}"

+# Install Python dependencies
+COPY --chown=nopaque:nopaque requirements.txt requirements.txt
+RUN python3 -m pip install --requirement requirements.txt \
+ && rm requirements.txt

 # Install the application
+COPY docker-nopaque-entrypoint.sh /usr/local/bin/

 COPY --chown=nopaque:nopaque app app
 COPY --chown=nopaque:nopaque migrations migrations
 COPY --chown=nopaque:nopaque tests tests
 COPY --chown=nopaque:nopaque .flaskenv boot.sh config.py nopaque.py requirements.txt ./
-RUN python3 -m pip install --requirement requirements.txt \
- && mkdir logs
-USER root
-COPY docker-nopaque-entrypoint.sh /usr/local/bin/
+RUN mkdir logs

 EXPOSE 5000

+USER root

 ENTRYPOINT ["docker-nopaque-entrypoint.sh"]
app/__init__.py

@@ -57,6 +57,9 @@ def create_app(config: Config = Config) -> Flask:
     scheduler.init_app(app)
     socketio.init_app(app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI'])  # noqa

+    from .models.event_listeners import register_event_listeners
+    register_event_listeners()
+
     from .admin import bp as admin_blueprint
     default_breadcrumb_root(admin_blueprint, '.admin')
     app.register_blueprint(admin_blueprint, url_prefix='/admin')
@@ -16,8 +16,8 @@ class CreateSpaCyNLPPipelineModelForm(ContributionBaseForm):
     )

     def validate_spacy_model_file(self, field):
-        if not field.data.filename.lower().endswith('.tar.gz'):
-            raise ValidationError('.tar.gz files only!')
+        if not field.data.filename.lower().endswith(('.tar.gz', '.whl')):
+            raise ValidationError('.tar.gz or .whl files only!')

     def __init__(self, *args, **kwargs):
         if 'prefix' not in kwargs:
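Note on the new check: str.endswith accepts a tuple of suffixes, so a single call covers both archive types (the committed line's extra parentheses around '.whl' are redundant but harmless). A quick illustration:

# str.endswith with a tuple returns True if any suffix matches.
assert 'de_core_news_md-3.4.0.tar.gz'.endswith(('.tar.gz', '.whl'))
assert 'de_core_news_md-3.4.0-py3-none-any.whl'.endswith(('.tar.gz', '.whl'))
assert not 'model.zip'.endswith(('.tar.gz', '.whl'))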
@@ -2,80 +2,69 @@ from flask import current_app
 from app import db
 from app.models import User, Corpus, CorpusFile
 from datetime import datetime
+from pathlib import Path
+from typing import Dict, List
 import json
-import os
 import shutil


 class SandpaperConverter:
-    def __init__(self, json_db_file, data_dir):
+    def __init__(self, json_db_file: Path, data_dir: Path):
         self.json_db_file = json_db_file
         self.data_dir = data_dir

     def run(self):
-        with open(self.json_db_file, 'r') as f:
-            json_db = json.loads(f.read())
+        with self.json_db_file.open('r') as f:
+            json_db: List[Dict] = json.load(f)

         for json_user in json_db:
             if not json_user['confirmed']:
                 current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
                 continue
-            user_dir = os.path.join(self.data_dir, str(json_user['id']))
+            user_dir = self.data_dir / f'{json_user["id"]}'
             self.convert_user(json_user, user_dir)
         db.session.commit()

-    def convert_user(self, json_user, user_dir):
+    def convert_user(self, json_user: Dict, user_dir: Path):
         current_app.logger.info(f'Create User {json_user["username"]}...')
-        user = User(
-            confirmed=json_user['confirmed'],
-            email=json_user['email'],
-            last_seen=datetime.fromtimestamp(json_user['last_seen']),
-            member_since=datetime.fromtimestamp(json_user['member_since']),
-            password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
-            username=json_user['username']
-        )
-        db.session.add(user)
-        db.session.flush(objects=[user])
-        db.session.refresh(user)
         try:
-            user.makedirs()
-        except OSError as e:
-            current_app.logger.error(e)
-            db.session.rollback()
+            user = User.create(
+                confirmed=json_user['confirmed'],
+                email=json_user['email'],
+                last_seen=datetime.fromtimestamp(json_user['last_seen']),
+                member_since=datetime.fromtimestamp(json_user['member_since']),
+                password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
+                username=json_user['username']
+            )
+        except OSError:
             raise Exception('Internal Server Error')
         for json_corpus in json_user['corpora'].values():
             if not json_corpus['files'].values():
                 current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
                 continue
-            corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
+            corpus_dir = user_dir / 'corpora' / f'{json_corpus["id"]}'
             self.convert_corpus(json_corpus, user, corpus_dir)
         current_app.logger.info('Done')

-    def convert_corpus(self, json_corpus, user, corpus_dir):
+    def convert_corpus(self, json_corpus: Dict, user: User, corpus_dir: Path):
         current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
-        corpus = Corpus(
-            user=user,
-            creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
-            description=json_corpus['description'],
-            title=json_corpus['title']
-        )
-        db.session.add(corpus)
-        db.session.flush(objects=[corpus])
-        db.session.refresh(corpus)
         try:
-            corpus.makedirs()
-        except OSError as e:
-            current_app.logger.error(e)
-            db.session.rollback()
+            corpus = Corpus.create(
+                user=user,
+                creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
+                description=json_corpus['description'],
+                title=json_corpus['title']
+            )
+        except OSError:
             raise Exception('Internal Server Error')
         for json_corpus_file in json_corpus['files'].values():
             self.convert_corpus_file(json_corpus_file, corpus, corpus_dir)
         current_app.logger.info('Done')

-    def convert_corpus_file(self, json_corpus_file, corpus, corpus_dir):
+    def convert_corpus_file(self, json_corpus_file: Dict, corpus: Corpus, corpus_dir: Path):
         current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
         corpus_file = CorpusFile(
             corpus=corpus,
@@ -99,13 +88,13 @@ class SandpaperConverter:
         db.session.refresh(corpus_file)
         try:
             shutil.copy2(
-                os.path.join(corpus_dir, json_corpus_file['filename']),
+                corpus_dir / json_corpus_file['filename'],
                 corpus_file.path
             )
         except:
             current_app.logger.warning(
                 'Can not convert corpus file: '
-                f'{os.path.join(corpus_dir, json_corpus_file["filename"])}'
+                f'{corpus_dir / json_corpus_file["filename"]}'
                 ' -> '
                 f'{corpus_file.path}'
             )
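The hunk above shows the pattern repeated throughout this changeset: os.path calls give way to pathlib.Path operators on the models' Path-typed path properties. A minimal sketch of the equivalences relied on (paths illustrative, POSIX separators assumed):

from pathlib import Path
import os.path

data_dir = Path('/data')  # illustrative root
# os.path.join(data_dir, 'corpora', str(42))  ->  data_dir / 'corpora' / f'{42}'
corpus_dir = data_dir / 'corpora' / f'{42}'
assert str(corpus_dir) == os.path.join('/data', 'corpora', '42')
# open(p) -> p.open(), os.path.exists(p) -> p.exists(), os.mkdir(p) -> p.mkdir(),
# os.path.dirname/basename -> p.parent / p.name
assert corpus_dir.parent == data_dir / 'corpora' and corpus_dir.name == '42'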
@@ -1,7 +1,7 @@
-from app.models import Corpus, CorpusStatus
-import os
+from flask import current_app
 import shutil
 from app import db
+from app.models import Corpus, CorpusStatus
 from . import bp

@@ -18,10 +18,17 @@ def reset():
     ]
     for corpus in [x for x in Corpus.query.all() if x.status in status]:
         print(f'Resetting corpus {corpus}')
-        shutil.rmtree(os.path.join(corpus.path, 'cwb'), ignore_errors=True)
-        os.mkdir(os.path.join(corpus.path, 'cwb'))
-        os.mkdir(os.path.join(corpus.path, 'cwb', 'data'))
-        os.mkdir(os.path.join(corpus.path, 'cwb', 'registry'))
+        corpus_cwb_dir = corpus.path / 'cwb'
+        corpus_cwb_data_dir = corpus_cwb_dir / 'data'
+        corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
+        try:
+            shutil.rmtree(corpus.path / 'cwb', ignore_errors=True)
+            corpus_cwb_dir.mkdir()
+            corpus_cwb_data_dir.mkdir()
+            corpus_cwb_registry_dir.mkdir()
+        except OSError as e:
+            current_app.logger.error(e)
+            raise
         corpus.status = CorpusStatus.UNPREPARED
         corpus.num_analysis_sessions = 0
     db.session.commit()
@@ -12,7 +12,6 @@ from typing import Dict, List
 import gzip
 import json
 import math
-import os
 from app import db
 from app.models import Corpus
 from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
@@ -42,9 +41,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
     db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
     db_corpus: Corpus = Corpus.query.get(db_corpus_id)

-    static_data_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
-    if os.path.exists(static_data_file_path):
-        with open(static_data_file_path, 'rb') as f:
+    static_data_file_path = db_corpus.path / 'cwb' / 'static.json.gz'
+    if static_data_file_path.exists():
+        with static_data_file_path.open('rb') as f:
             return f.read()

     cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
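ext_corpus_static_data returns the cached file's gzip-compressed bytes as-is (note the 'rb' mode), leaving decompression to the client. A hedged sketch of how such a cache file could be written and read back; the path and payload are illustrative:

import gzip
import json
from pathlib import Path

static_data_file_path = Path('/tmp/static.json.gz')  # illustrative location
# Write: serialize to JSON and gzip-compress in one step.
with gzip.open(static_data_file_path, 'wt', encoding='utf-8') as f:
    json.dump({'corpus': 'demo', 'num_tokens': 123}, f)
# The endpoint above returns the raw compressed bytes:
with static_data_file_path.open('rb') as f:
    raw: bytes = f.read()
# A client would decompress and parse:
static_data = json.loads(gzip.decompress(raw))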
@ -7,7 +7,6 @@ from flask import (
|
||||
url_for
|
||||
)
|
||||
from flask_breadcrumbs import register_breadcrumb
|
||||
import os
|
||||
from app import db
|
||||
from app.models import Corpus, CorpusFile, CorpusStatus
|
||||
from ..decorators import corpus_follower_permission_required
|
||||
@ -92,8 +91,8 @@ def corpus_file(corpus_id, corpus_file_id):
|
||||
def download_corpus_file(corpus_id, corpus_file_id):
|
||||
corpus_file = CorpusFile.query.filter_by(corpus_id=corpus_id, id=corpus_file_id).first_or_404()
|
||||
return send_from_directory(
|
||||
os.path.dirname(corpus_file.path),
|
||||
os.path.basename(corpus_file.path),
|
||||
corpus_file.path.parent,
|
||||
corpus_file.path.name,
|
||||
as_attachment=True,
|
||||
attachment_filename=corpus_file.filename,
|
||||
mimetype=corpus_file.mimetype
|
||||
|
app/ext/flask_sqlalchemy/__init__.py (new file, 2 lines)

from .container_column import ContainerColumn
from .int_enum_column import IntEnumColumn
app/ext/flask_sqlalchemy/container_column.py (new file, 21 lines)

import json
from app import db


class ContainerColumn(db.TypeDecorator):
    impl = db.String

    def __init__(self, container_type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.container_type = container_type

    def process_bind_param(self, value, dialect):
        if isinstance(value, self.container_type):
            return json.dumps(value)
        elif isinstance(value, str) and isinstance(json.loads(value), self.container_type):
            return value
        else:
            # Raise on unsupported input; returning the exception object
            # would silently store its repr.
            raise TypeError()

    def process_result_value(self, value, dialect):
        return json.loads(value)
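ContainerColumn persists a Python container as JSON text in a String column and decodes it again on load. A minimal sketch of the round-trip, calling the methods directly; a real model would declare db.Column(ContainerColumn(dict, 255)), as Job.service_args does later in this changeset:

opts = ContainerColumn(dict, 255)  # a String(255) column holding JSON-encoded dicts
# A dict binds to its JSON text; a JSON string of the right shape passes through:
assert opts.process_bind_param({'lang': 'de'}, None) == '{"lang": "de"}'
assert opts.process_bind_param('{"lang": "de"}', None) == '{"lang": "de"}'
# Loading decodes the stored JSON back into a dict:
assert opts.process_result_value('{"lang": "de"}', None) == {'lang': 'de'}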
app/ext/flask_sqlalchemy/int_enum_column.py (new file, 22 lines)

from app import db


class IntEnumColumn(db.TypeDecorator):
    impl = db.Integer

    def __init__(self, enum_type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.enum_type = enum_type

    def process_bind_param(self, value, dialect):
        if isinstance(value, self.enum_type) and isinstance(value.value, int):
            return value.value
        elif isinstance(value, int):
            return self.enum_type(value).value
        elif isinstance(value, str):
            return self.enum_type[value].value
        else:
            # Raise on unsupported input; returning the exception object
            # would silently store its repr.
            raise TypeError()

    def process_result_value(self, value, dialect):
        return self.enum_type(value)
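IntEnumColumn accepts an enum member, its integer value, or its member name on write, always storing the integer, and rehydrates the enum on read. A sketch with an illustrative enum:

from enum import IntEnum

class Color(IntEnum):  # illustrative enum, not part of the application
    RED = 1
    GREEN = 2

col = IntEnumColumn(Color)
# All three spellings bind to the same stored integer:
assert col.process_bind_param(Color.GREEN, None) == 2  # enum member
assert col.process_bind_param(2, None) == 2            # integer value
assert col.process_bind_param('GREEN', None) == 2      # member name
# Loading converts the stored integer back into the enum:
assert col.process_result_value(2, None) is Color.GREEN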
@@ -1,7 +1,6 @@
 from flask import abort, current_app
 from flask_login import current_user
 from threading import Thread
-import os
 from app import db
 from app.decorators import admin_required, content_negotiation
 from app.models import Job, JobStatus
@@ -39,7 +38,7 @@ def job_log(job_id):
     if job.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:
         response = {'errors': {'message': 'Job status is not completed or failed'}}
         return response, 409
-    with open(os.path.join(job.path, 'pipeline_data', 'logs', 'pyflow_log.txt')) as log_file:
+    with open(job.path / 'pipeline_data' / 'logs' / 'pyflow_log.txt') as log_file:
         log = log_file.read()
     response_data = {
         'jobLog': log
@@ -7,7 +7,6 @@ from flask import (
 )
 from flask_breadcrumbs import register_breadcrumb
 from flask_login import current_user
-import os
 from app.models import Job, JobInput, JobResult
 from . import bp
 from .utils import job_dynamic_list_constructor as job_dlc
@@ -38,8 +37,8 @@ def download_job_input(job_id, job_input_id):
     if not (job_input.job.user == current_user or current_user.is_administrator()):
         abort(403)
     return send_from_directory(
-        os.path.dirname(job_input.path),
-        os.path.basename(job_input.path),
+        job_input.path.parent,
+        job_input.path.name,
         as_attachment=True,
         attachment_filename=job_input.filename,
         mimetype=job_input.mimetype
@@ -52,8 +51,8 @@ def download_job_result(job_id, job_result_id):
     if not (job_result.job.user == current_user or current_user.is_administrator()):
         abort(403)
     return send_from_directory(
-        os.path.dirname(job_result.path),
-        os.path.basename(job_result.path),
+        job_result.path.parent,
+        job_result.path.name,
         as_attachment=True,
         attachment_filename=job_result.filename,
         mimetype=job_result.mimetype
@@ -1,6 +1,7 @@
 from flask import current_app
 from flask_migrate import upgrade
-import os
+from pathlib import Path
+from typing import List
 from app.models import (
     CorpusFollowerRole,
     Role,
@@ -17,16 +18,15 @@ def deploy():
     # Make default directories
     print('Make default directories')
     base_dir = current_app.config['NOPAQUE_DATA_DIR']
-    default_dirs = [
-        os.path.join(base_dir, 'tmp'),
-        os.path.join(base_dir, 'users')
+    default_dirs: List[Path] = [
+        base_dir / 'tmp',
+        base_dir / 'users'
     ]
-    for dir in default_dirs:
-        if os.path.exists(dir):
-            if not os.path.isdir(dir):
-                raise NotADirectoryError(f'{dir} is not a directory')
-        else:
-            os.mkdir(dir)
+    for default_dir in default_dirs:
+        if not default_dir.exists():
+            default_dir.mkdir()
+        if not default_dir.is_dir():
+            raise NotADirectoryError(f'{default_dir} is not a directory')

     # migrate database to latest revision
     print('Migrate database to latest revision')
app/models.py (1819 changed lines; file diff suppressed because it is too large)
app/models/__init__.py (new file, 19 lines)

from .avatar import *
from .corpus_file import *
from .corpus_follower_association import *
from .corpus_follower_role import *
from .corpus import *
from .job_input import *
from .job_result import *
from .job import *
from .role import *
from .spacy_nlp_pipeline_model import *
from .tesseract_ocr_pipeline_model import *
from .token import *
from .user import *
from app import login


@login.user_loader
def load_user(user_id):
    return User.query.get(int(user_id))
app/models/avatar.py (new file, 40 lines)

from flask import current_app
from flask_hashids import HashidMixin
from pathlib import Path
from app import db
from .file_mixin import FileMixin


class Avatar(HashidMixin, FileMixin, db.Model):
    __tablename__ = 'avatars'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Relationships
    user = db.relationship('User', back_populates='avatar')

    @property
    def path(self) -> Path:
        return self.user.path / 'avatar'

    def delete(self):
        try:
            self.path.unlink(missing_ok=True)
        except OSError as e:
            current_app.logger.error(e)
            raise
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable
app/models/corpus.py (new file, 200 lines)

from datetime import datetime
from enum import IntEnum
from flask import current_app, url_for
from flask_hashids import HashidMixin
from sqlalchemy.ext.associationproxy import association_proxy
from typing import Union
from pathlib import Path
import shutil
import xml.etree.ElementTree as ET
from app import db
from app.converters.vrt import normalize_vrt_file
from app.ext.flask_sqlalchemy import IntEnumColumn
from .corpus_follower_association import CorpusFollowerAssociation


class CorpusStatus(IntEnum):
    UNPREPARED = 1
    SUBMITTED = 2
    QUEUED = 3
    BUILDING = 4
    BUILT = 5
    FAILED = 6
    STARTING_ANALYSIS_SESSION = 7
    RUNNING_ANALYSIS_SESSION = 8
    CANCELING_ANALYSIS_SESSION = 9

    @staticmethod
    def get(corpus_status: Union['CorpusStatus', int, str]) -> 'CorpusStatus':
        if isinstance(corpus_status, CorpusStatus):
            return corpus_status
        if isinstance(corpus_status, int):
            return CorpusStatus(corpus_status)
        if isinstance(corpus_status, str):
            return CorpusStatus[corpus_status]
        raise TypeError('corpus_status must be CorpusStatus, int, or str')


class Corpus(HashidMixin, db.Model):
    '''
    Class to define a corpus.
    '''
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    status = db.Column(
        IntEnumColumn(CorpusStatus),
        default=CorpusStatus.UNPREPARED
    )
    title = db.Column(db.String(32))
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    files = db.relationship(
        'CorpusFile',
        back_populates='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='corpus',
        cascade='all, delete-orphan'
    )
    followers = association_proxy(
        'corpus_follower_associations',
        'follower',
        creator=lambda u: CorpusFollowerAssociation(follower=u)
    )
    user = db.relationship('User', back_populates='corpora')
    # "static" attributes
    max_num_tokens = 2_147_483_647

    def __repr__(self):
        return f'<Corpus {self.title}>'

    @property
    def analysis_url(self):
        return url_for('corpora.analysis', corpus_id=self.id)

    @property
    def jsonpatch_path(self):
        return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

    @property
    def path(self) -> Path:
        return self.user.path / 'corpora' / f'{self.id}'

    @property
    def url(self):
        return url_for('corpora.corpus', corpus_id=self.id)

    @property
    def user_hashid(self):
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        corpus = Corpus(**kwargs)
        db.session.add(corpus)
        db.session.flush(objects=[corpus])
        db.session.refresh(corpus)
        corpus_files_dir = corpus.path / 'files'
        corpus_cwb_dir = corpus.path / 'cwb'
        corpus_cwb_data_dir = corpus_cwb_dir / 'data'
        corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
        try:
            corpus.path.mkdir()
            corpus_files_dir.mkdir()
            corpus_cwb_dir.mkdir()
            corpus_cwb_data_dir.mkdir()
            corpus_cwb_registry_dir.mkdir()
        except OSError as e:
            # TODO: Potential leftover cleanup
            current_app.logger.error(e)
            db.session.rollback()
            raise
        return corpus

    def build(self):
        corpus_cwb_dir = self.path / 'cwb'
        corpus_cwb_data_dir = corpus_cwb_dir / 'data'
        corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
        try:
            shutil.rmtree(corpus_cwb_dir, ignore_errors=True)
            corpus_cwb_dir.mkdir()
            corpus_cwb_data_dir.mkdir()
            corpus_cwb_registry_dir.mkdir()
        except OSError as e:
            current_app.logger.error(e)
            self.status = CorpusStatus.FAILED
            raise
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            normalized_vrt_path = corpus_cwb_dir / f'{corpus_file.id}.norm.vrt'
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except Exception:  # normalization may fail on malformed VRT input
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            text_element.set('author', corpus_file.author)
            text_element.set('title', corpus_file.title)
            text_element.set(
                'publishing_year',
                f'{corpus_file.publishing_year}'
            )
            text_element.set('address', corpus_file.address or 'NULL')
            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
            text_element.set('chapter', corpus_file.chapter or 'NULL')
            text_element.set('editor', corpus_file.editor or 'NULL')
            text_element.set('institution', corpus_file.institution or 'NULL')
            text_element.set('journal', corpus_file.journal or 'NULL')
            # Guard on the field itself: an f-string is always truthy,
            # so `f'{...}' or 'NULL'` would never fall back to 'NULL'.
            text_element.set('pages', f'{corpus_file.pages}' if corpus_file.pages else 'NULL')
            text_element.set('publisher', corpus_file.publisher or 'NULL')
            text_element.set('school', corpus_file.school or 'NULL')
            text_element.tail = '\n'
            # corpus_element.insert(1, text_element)
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            corpus_cwb_dir / 'corpus.vrt',
            encoding='utf-8'
        )
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'title': self.title,
            'is_public': self.is_public
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['corpus_follower_associations'] = {
                x.hashid: x.to_json_serializeable()
                for x in self.corpus_follower_associations
            }
            json_serializeable['files'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.files
            }
        return json_serializeable
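Corpus.create flushes and refreshes the session before touching the filesystem so that corpus.id, and therefore corpus.path, is already assigned; if any mkdir fails, the session is rolled back and no orphaned row survives. A usage sketch, assuming an application context and an existing user:

try:
    # Row is flushed so corpus.path is valid, then the directory tree
    # files/, cwb/, cwb/data/, cwb/registry/ is created beneath it.
    corpus = Corpus.create(
        user=user,
        title='Demo corpus',
        description='Created for illustration'
    )
    db.session.commit()
except OSError:
    pass  # Corpus.create logged the error and rolled the session back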
app/models/corpus_file.py (new file, 102 lines)

from flask import current_app, url_for
from flask_hashids import HashidMixin
from pathlib import Path
from app import db
from .corpus import CorpusStatus
from .file_mixin import FileMixin


class CorpusFile(FileMixin, HashidMixin, db.Model):
    __tablename__ = 'corpus_files'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
    # Fields
    author = db.Column(db.String(255))
    description = db.Column(db.String(255))
    publishing_year = db.Column(db.Integer)
    title = db.Column(db.String(255))
    address = db.Column(db.String(255))
    booktitle = db.Column(db.String(255))
    chapter = db.Column(db.String(255))
    editor = db.Column(db.String(255))
    institution = db.Column(db.String(255))
    journal = db.Column(db.String(255))
    pages = db.Column(db.String(255))
    publisher = db.Column(db.String(255))
    school = db.Column(db.String(255))
    # Relationships
    corpus = db.relationship(
        'Corpus',
        back_populates='files'
    )

    @property
    def download_url(self):
        return url_for(
            'corpora.download_corpus_file',
            corpus_id=self.corpus_id,
            corpus_file_id=self.id
        )

    @property
    def jsonpatch_path(self):
        return f'{self.corpus.jsonpatch_path}/files/{self.hashid}'

    @property
    def path(self) -> Path:
        return self.corpus.path / 'files' / f'{self.id}'

    @property
    def url(self):
        return url_for(
            'corpora.corpus_file',
            corpus_id=self.corpus_id,
            corpus_file_id=self.id
        )

    @property
    def user_hashid(self):
        return self.corpus.user.hashid

    @property
    def user_id(self):
        return self.corpus.user_id

    def delete(self):
        try:
            self.path.unlink(missing_ok=True)
        except OSError as e:
            current_app.logger.error(e)
            raise
        db.session.delete(self)
        self.corpus.status = CorpusStatus.UNPREPARED

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'address': self.address,
            'author': self.author,
            'description': self.description,
            'booktitle': self.booktitle,
            'chapter': self.chapter,
            'editor': self.editor,
            'institution': self.institution,
            'journal': self.journal,
            'pages': self.pages,
            'publisher': self.publisher,
            'publishing_year': self.publishing_year,
            'school': self.school,
            'title': self.title,
            **self.file_mixin_to_json_serializeable(
                backrefs=backrefs,
                relationships=relationships
            )
        }
        if backrefs:
            json_serializeable['corpus'] = \
                self.corpus.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable
app/models/corpus_follower_association.py (new file, 47 lines)

from flask_hashids import HashidMixin
from app import db
from .corpus_follower_role import CorpusFollowerRole


class CorpusFollowerAssociation(HashidMixin, db.Model):
    __tablename__ = 'corpus_follower_associations'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
    follower_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    role_id = db.Column(db.Integer, db.ForeignKey('corpus_follower_roles.id'))
    # Relationships
    corpus = db.relationship(
        'Corpus',
        back_populates='corpus_follower_associations'
    )
    follower = db.relationship(
        'User',
        back_populates='corpus_follower_associations'
    )
    role = db.relationship(
        'CorpusFollowerRole',
        back_populates='corpus_follower_associations'
    )

    def __init__(self, **kwargs):
        if 'role' not in kwargs:
            kwargs['role'] = CorpusFollowerRole.query.filter_by(default=True).first()
        super().__init__(**kwargs)

    def __repr__(self):
        return f'<CorpusFollowerAssociation {self.follower.__repr__()} ~ {self.role.__repr__()} ~ {self.corpus.__repr__()}>'

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'corpus': self.corpus.to_json_serializeable(backrefs=True),
            'follower': self.follower.to_json_serializeable(),
            'role': self.role.to_json_serializeable()
        }
        if backrefs:
            pass
        if relationships:
            pass
        return json_serializeable
app/models/corpus_follower_role.py (new file, 107 lines)

from flask_hashids import HashidMixin
from enum import IntEnum
from typing import Union
from app import db


class CorpusFollowerPermission(IntEnum):
    VIEW = 1
    MANAGE_FILES = 2
    MANAGE_FOLLOWERS = 4
    MANAGE_CORPUS = 8

    @staticmethod
    def get(corpus_follower_permission: Union['CorpusFollowerPermission', int, str]) -> 'CorpusFollowerPermission':
        if isinstance(corpus_follower_permission, CorpusFollowerPermission):
            return corpus_follower_permission
        if isinstance(corpus_follower_permission, int):
            return CorpusFollowerPermission(corpus_follower_permission)
        if isinstance(corpus_follower_permission, str):
            return CorpusFollowerPermission[corpus_follower_permission]
        raise TypeError('corpus_follower_permission must be CorpusFollowerPermission, int, or str')


class CorpusFollowerRole(HashidMixin, db.Model):
    __tablename__ = 'corpus_follower_roles'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Fields
    name = db.Column(db.String(64), unique=True)
    default = db.Column(db.Boolean, default=False, index=True)
    permissions = db.Column(db.Integer, default=0)
    # Relationships
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='role'
    )

    def __repr__(self):
        return f'<CorpusFollowerRole {self.name}>'

    def has_permission(self, permission: Union[CorpusFollowerPermission, int, str]):
        perm = CorpusFollowerPermission.get(permission)
        return self.permissions & perm.value == perm.value

    def add_permission(self, permission: Union[CorpusFollowerPermission, int, str]):
        perm = CorpusFollowerPermission.get(permission)
        if not self.has_permission(perm):
            self.permissions += perm.value

    def remove_permission(self, permission: Union[CorpusFollowerPermission, int, str]):
        perm = CorpusFollowerPermission.get(permission)
        if self.has_permission(perm):
            self.permissions -= perm.value

    def reset_permissions(self):
        self.permissions = 0

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'default': self.default,
            'name': self.name,
            'permissions': [
                x.name
                for x in CorpusFollowerPermission
                if self.has_permission(x)
            ]
        }
        if backrefs:
            pass
        if relationships:
            json_serializeable['corpus_follower_association'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                # The relationship attribute is plural; the singular
                # spelling would raise an AttributeError here.
                for x in self.corpus_follower_associations
            }
        return json_serializeable

    @staticmethod
    def insert_defaults():
        roles = {
            'Anonymous': [],
            'Viewer': [
                CorpusFollowerPermission.VIEW
            ],
            'Contributor': [
                CorpusFollowerPermission.VIEW,
                CorpusFollowerPermission.MANAGE_FILES
            ],
            'Administrator': [
                CorpusFollowerPermission.VIEW,
                CorpusFollowerPermission.MANAGE_FILES,
                CorpusFollowerPermission.MANAGE_FOLLOWERS,
                CorpusFollowerPermission.MANAGE_CORPUS
            ]
        }
        default_role_name = 'Viewer'
        for role_name, permissions in roles.items():
            role = CorpusFollowerRole.query.filter_by(name=role_name).first()
            if role is None:
                role = CorpusFollowerRole(name=role_name)
            role.reset_permissions()
            for permission in permissions:
                role.add_permission(permission)
            role.default = role.name == default_role_name
            db.session.add(role)
        db.session.commit()
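Since the permission values are distinct powers of two, a role's permissions integer is a bitmask and has_permission reduces to permissions & value == value. A small sketch of the arithmetic (the role instance is illustrative and not persisted):

role = CorpusFollowerRole(name='Demo')  # illustrative, not persisted
role.reset_permissions()
role.add_permission(CorpusFollowerPermission.VIEW)          # sets bit 1 -> 0b0001
role.add_permission(CorpusFollowerPermission.MANAGE_FILES)  # sets bit 2 -> 0b0011
assert role.permissions == 3
assert role.has_permission('VIEW')  # member, int, or name all resolve via .get()
assert role.has_permission(2)
assert not role.has_permission(CorpusFollowerPermission.MANAGE_CORPUS)  # bit 8 unset
role.remove_permission('VIEW')
assert role.permissions == 2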
app/models/default_records/spacy_nlp_pipeline_model.yml

@@ -120,6 +120,7 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'German'
   description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.'
   url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz'
@@ -131,6 +132,7 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'Greek'
   description: 'Greek pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner, attribute_ruler.'
   url: 'https://github.com/explosion/spacy-models/releases/download/el_core_news_md-3.4.0/el_core_news_md-3.4.0.tar.gz'
@@ -142,6 +144,7 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'English'
   description: 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.'
   url: 'https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz'
@@ -153,6 +156,7 @@
   version: '3.4.1'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'Spanish'
   description: 'Spanish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.'
   url: 'https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.4.0/es_core_news_md-3.4.0.tar.gz'
@@ -164,6 +168,7 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'French'
   description: 'French pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.'
   url: 'https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.4.0/fr_core_news_md-3.4.0.tar.gz'
@@ -175,6 +180,7 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'Italian'
   description: 'Italian pipeline optimized for CPU. Components: tok2vec, morphologizer, tagger, parser, lemmatizer (trainable_lemmatizer), senter, ner.'
   url: 'https://github.com/explosion/spacy-models/releases/download/it_core_news_md-3.4.0/it_core_news_md-3.4.0.tar.gz'
@@ -186,6 +192,7 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'Polish'
   description: 'Polish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, lemmatizer (trainable_lemmatizer), tagger, senter, ner.'
   url: 'https://github.com/explosion/spacy-models/releases/download/pl_core_news_md-3.4.0/pl_core_news_md-3.4.0.tar.gz'
@@ -197,6 +204,7 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'Russian'
   description: 'Russian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.'
   url: 'https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.4.0/ru_core_news_md-3.4.0.tar.gz'
@@ -208,6 +216,7 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
 - title: 'Chinese'
   description: 'Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.'
   url: 'https://github.com/explosion/spacy-models/releases/download/zh_core_web_md-3.4.0/zh_core_web_md-3.4.0.tar.gz'
@@ -219,3 +228,4 @@
   version: '3.4.0'
   compatible_service_versions:
     - '0.1.1'
+    - '0.1.2'
app/models/event_listeners.py (new file, 133 lines)

from datetime import datetime
from enum import Enum
from app import db, mail, socketio
from app.email import create_message
from .corpus_file import CorpusFile
from .corpus_follower_association import CorpusFollowerAssociation
from .corpus import Corpus
from .job_input import JobInput
from .job_result import JobResult
from .job import Job, JobStatus
from .spacy_nlp_pipeline_model import SpaCyNLPPipelineModel
from .tesseract_ocr_pipeline_model import TesseractOCRPipelineModel
from .user import UserSettingJobStatusMailNotificationLevel


def register_event_listeners():
    resources = [
        Corpus,
        CorpusFile,
        Job,
        JobInput,
        JobResult,
        SpaCyNLPPipelineModel,
        TesseractOCRPipelineModel
    ]

    for resource in resources:
        db.event.listen(resource, 'after_delete', resource_after_delete)
        db.event.listen(resource, 'after_insert', resource_after_insert)
        db.event.listen(resource, 'after_update', resource_after_update)

    db.event.listen(CorpusFollowerAssociation, 'after_delete', cfa_after_delete)
    db.event.listen(CorpusFollowerAssociation, 'after_insert', cfa_after_insert)

    db.event.listen(Job, 'after_update', job_after_update)


def resource_after_delete(mapper, connection, resource):
    jsonpatch = [
        {
            'op': 'remove',
            'path': resource.jsonpatch_path
        }
    ]
    room = f'/users/{resource.user_hashid}'
    socketio.emit('PATCH', jsonpatch, room=room)


def cfa_after_delete(mapper, connection, cfa):
    jsonpatch_path = f'/users/{cfa.corpus.user.hashid}/corpora/{cfa.corpus.hashid}/corpus_follower_associations/{cfa.hashid}'
    jsonpatch = [
        {
            'op': 'remove',
            'path': jsonpatch_path
        }
    ]
    room = f'/users/{cfa.corpus.user.hashid}'
    socketio.emit('PATCH', jsonpatch, room=room)


def resource_after_insert(mapper, connection, resource):
    jsonpatch_value = resource.to_json_serializeable()
    for attr in mapper.relationships:
        jsonpatch_value[attr.key] = {}
    jsonpatch = [
        {
            'op': 'add',
            'path': resource.jsonpatch_path,
            'value': jsonpatch_value
        }
    ]
    room = f'/users/{resource.user_hashid}'
    socketio.emit('PATCH', jsonpatch, room=room)


def cfa_after_insert(mapper, connection, cfa):
    jsonpatch_value = cfa.to_json_serializeable()
    jsonpatch_path = f'/users/{cfa.corpus.user.hashid}/corpora/{cfa.corpus.hashid}/corpus_follower_associations/{cfa.hashid}'
    jsonpatch = [
        {
            'op': 'add',
            'path': jsonpatch_path,
            'value': jsonpatch_value
        }
    ]
    room = f'/users/{cfa.corpus.user.hashid}'
    socketio.emit('PATCH', jsonpatch, room=room)


def resource_after_update(mapper, connection, resource):
    jsonpatch = []
    for attr in db.inspect(resource).attrs:
        if attr.key in mapper.relationships:
            continue
        if not attr.load_history().has_changes():
            continue
        jsonpatch_path = f'{resource.jsonpatch_path}/{attr.key}'
        if isinstance(attr.value, datetime):
            jsonpatch_value = f'{attr.value.isoformat()}Z'
        elif isinstance(attr.value, Enum):
            jsonpatch_value = attr.value.name
        else:
            jsonpatch_value = attr.value
        jsonpatch.append(
            {
                'op': 'replace',
                'path': jsonpatch_path,
                'value': jsonpatch_value
            }
        )
    if jsonpatch:
        room = f'/users/{resource.user_hashid}'
        socketio.emit('PATCH', jsonpatch, room=room)


def job_after_update(mapper, connection, job):
    for attr in db.inspect(job).attrs:
        if attr.key != 'status':
            continue
        if not attr.load_history().has_changes():
            return
        if job.user.setting_job_status_mail_notification_level == UserSettingJobStatusMailNotificationLevel.NONE:
            return
        if job.user.setting_job_status_mail_notification_level == UserSettingJobStatusMailNotificationLevel.END:
            if job.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:
                return
        msg = create_message(
            job.user.email,
            f'Status update for your Job "{job.title}"',
            'tasks/email/notification',
            job=job
        )
        mail.send(msg)
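Each listener translates an ORM change into an RFC 6902 JSON Patch and emits it via Socket.IO to the owning user's room. For instance, a job whose status column changes to COMPLETED yields a patch shaped like this (hash IDs illustrative):

# Illustrative payload emitted to room '/users/<user_hashid>':
jsonpatch = [
    {
        'op': 'replace',
        'path': '/users/AbC123/jobs/XyZ789/status',  # resource.jsonpatch_path + '/status'
        'value': 'COMPLETED'  # Enum values are serialized by name
    }
]
socketio.emit('PATCH', jsonpatch, room='/users/AbC123')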
app/models/file_mixin.py (new file, 40 lines)

from datetime import datetime
from flask import current_app
from werkzeug.utils import secure_filename
from app import db


class FileMixin:
    '''
    Mixin for db.Model classes. All file related models should use this.
    '''
    creation_date = db.Column(db.DateTime, default=datetime.utcnow)
    filename = db.Column(db.String(255))
    mimetype = db.Column(db.String(255))

    def file_mixin_to_json_serializeable(self, backrefs=False, relationships=False):
        return {
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'filename': self.filename,
            'mimetype': self.mimetype
        }

    @classmethod
    def create(cls, file_storage, **kwargs):
        filename = kwargs.pop('filename', file_storage.filename)
        mimetype = kwargs.pop('mimetype', file_storage.mimetype)
        obj = cls(
            filename=secure_filename(filename),
            mimetype=mimetype,
            **kwargs
        )
        db.session.add(obj)
        db.session.flush(objects=[obj])
        db.session.refresh(obj)
        try:
            file_storage.save(obj.path)
        except (AttributeError, OSError) as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        return obj
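FileMixin.create persists the row first (flush plus refresh) so the model's id-dependent path property is valid before the upload is written to disk; a failed save rolls the session back. A sketch with a Werkzeug FileStorage, e.g. in a Flask upload view (the form field name is illustrative):

from flask import request

file_storage = request.files['input-file']  # werkzeug.datastructures.FileStorage
try:
    # JobInput inherits FileMixin.create: the row is flushed, then the
    # file is saved to job_input.path (which needs the generated id).
    job_input = JobInput.create(file_storage, job=job)
    db.session.commit()
except (AttributeError, OSError):
    pass  # create() already rolled the session back; nothing was written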
app/models/job.py (new file, 172 lines)

from datetime import datetime
from enum import IntEnum
from flask import current_app, url_for
from flask_hashids import HashidMixin
from time import sleep
from typing import Union
from pathlib import Path
import shutil
from app import db
from app.ext.flask_sqlalchemy import ContainerColumn, IntEnumColumn


class JobStatus(IntEnum):
    INITIALIZING = 1
    SUBMITTED = 2
    QUEUED = 3
    RUNNING = 4
    CANCELING = 5
    CANCELED = 6
    COMPLETED = 7
    FAILED = 8

    @staticmethod
    def get(job_status: Union['JobStatus', int, str]) -> 'JobStatus':
        if isinstance(job_status, JobStatus):
            return job_status
        if isinstance(job_status, int):
            return JobStatus(job_status)
        if isinstance(job_status, str):
            return JobStatus[job_status]
        raise TypeError('job_status must be JobStatus, int, or str')


class Job(HashidMixin, db.Model):
    '''
    Class to define Jobs.
    '''
    __tablename__ = 'jobs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = \
        db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    end_date = db.Column(db.DateTime())
    service = db.Column(db.String(64))
    service_args = db.Column(ContainerColumn(dict, 255))
    service_version = db.Column(db.String(16))
    status = db.Column(
        IntEnumColumn(JobStatus),
        default=JobStatus.INITIALIZING
    )
    title = db.Column(db.String(32))
    # Relationships
    inputs = db.relationship(
        'JobInput',
        back_populates='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    results = db.relationship(
        'JobResult',
        back_populates='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    user = db.relationship(
        'User',
        back_populates='jobs'
    )

    def __repr__(self):
        return f'<Job {self.title}>'

    @property
    def jsonpatch_path(self):
        return f'{self.user.jsonpatch_path}/jobs/{self.hashid}'

    @property
    def path(self) -> Path:
        return self.user.path / 'jobs' / f'{self.id}'

    @property
    def url(self):
        return url_for('jobs.job', job_id=self.id)

    @property
    def user_hashid(self):
        return self.user.hashid

    @staticmethod
    def create(**kwargs):
        job = Job(**kwargs)
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        job_inputs_dir = job.path / 'inputs'
        job_pipeline_data_dir = job.path / 'pipeline_data'
        job_results_dir = job.path / 'results'
        try:
            job.path.mkdir()
            job_inputs_dir.mkdir()
            job_pipeline_data_dir.mkdir()
            job_results_dir.mkdir()
        except OSError as e:
            # TODO: Potential leftover cleanup
            current_app.logger.error(e)
            db.session.rollback()
            raise
        return job

    def delete(self):
        ''' Delete the job and its inputs and results from the database. '''
        if self.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:  # noqa
            self.status = JobStatus.CANCELING
            db.session.commit()
            while self.status != JobStatus.CANCELED:
                # In case the daemon handled a job in any way
                if self.status != JobStatus.CANCELING:
                    self.status = JobStatus.CANCELING
                    db.session.commit()
                sleep(1)
                db.session.refresh(self)
        try:
            shutil.rmtree(self.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        db.session.delete(self)

    def restart(self):
        ''' Restart a job - only if the status is failed '''
        if self.status != JobStatus.FAILED:
            raise Exception('Job status is not "failed"')
        shutil.rmtree(self.path / 'results', ignore_errors=True)
        shutil.rmtree(self.path / 'pyflow.data', ignore_errors=True)
        for result in self.results:
            db.session.delete(result)
        self.end_date = None
        self.status = JobStatus.SUBMITTED

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'end_date': (
                None if self.end_date is None
                else f'{self.end_date.isoformat()}Z'
            ),
            'service': self.service,
            'service_args': self.service_args,
            'service_version': self.service_version,
            'status': self.status.name,
            'title': self.title
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['inputs'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.inputs
            }
            json_serializeable['results'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.results
            }
        return json_serializeable
app/models/job_input.py (new file, 65 lines)

from flask import url_for
from flask_hashids import HashidMixin
from pathlib import Path
from app import db
from .file_mixin import FileMixin


class JobInput(FileMixin, HashidMixin, db.Model):
    __tablename__ = 'job_inputs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Relationships
    job = db.relationship(
        'Job',
        back_populates='inputs'
    )

    def __repr__(self):
        return f'<JobInput {self.filename}>'

    @property
    def content_url(self):
        return url_for(
            'jobs.download_job_input',
            job_id=self.job.id,
            job_input_id=self.id
        )

    @property
    def jsonpatch_path(self):
        return f'{self.job.jsonpatch_path}/inputs/{self.hashid}'

    @property
    def path(self) -> Path:
        return self.job.path / 'inputs' / f'{self.id}'

    @property
    def url(self):
        return url_for(
            'jobs.job',
            job_id=self.job_id,
            _anchor=f'job-{self.job.hashid}-input-{self.hashid}'
        )

    @property
    def user_hashid(self):
        return self.job.user.hashid

    @property
    def user_id(self):
        return self.job.user.id

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['job'] = \
                self.job.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable
app/models/job_result.py (new file, 71 lines)

from flask import url_for
from flask_hashids import HashidMixin
from pathlib import Path
from app import db
from .file_mixin import FileMixin


class JobResult(FileMixin, HashidMixin, db.Model):
    __tablename__ = 'job_results'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Fields
    description = db.Column(db.String(255))
    # Relationships
    job = db.relationship(
        'Job',
        back_populates='results'
    )

    def __repr__(self):
        return f'<JobResult {self.filename}>'

    @property
    def download_url(self):
        return url_for(
            'jobs.download_job_result',
            job_id=self.job_id,
            job_result_id=self.id
        )

    @property
    def jsonpatch_path(self):
        return f'{self.job.jsonpatch_path}/results/{self.hashid}'

    @property
    def path(self) -> Path:
        return self.job.path / 'results' / f'{self.id}'

    @property
    def url(self):
        return url_for(
            'jobs.job',
            job_id=self.job_id,
            _anchor=f'job-{self.job.hashid}-result-{self.hashid}'
        )

    @property
    def user_hashid(self):
        return self.job.user.hashid

    @property
    def user_id(self):
        return self.job.user.id

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'description': self.description,
            **self.file_mixin_to_json_serializeable(
                backrefs=backrefs,
                relationships=relationships
            )
        }
        if backrefs:
            json_serializeable['job'] = \
                self.job.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable
100
app/models/role.py
Normal file
100
app/models/role.py
Normal file
@ -0,0 +1,100 @@
|
||||
from enum import IntEnum
from flask_hashids import HashidMixin
from typing import Union
from app import db


class Permission(IntEnum):
    '''
    Defines user permissions as powers of two, so that a role's permissions
    can be combined into a single integer and evaluated with the bitwise
    operator &.
    '''
    ADMINISTRATE = 1
    CONTRIBUTE = 2
    USE_API = 4

    @staticmethod
    def get(permission: Union['Permission', int, str]) -> 'Permission':
        if isinstance(permission, Permission):
            return permission
        if isinstance(permission, int):
            return Permission(permission)
        if isinstance(permission, str):
            return Permission[permission]
        raise TypeError('permission must be Permission, int, or str')


class Role(HashidMixin, db.Model):
    __tablename__ = 'roles'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Fields
    name = db.Column(db.String(64), unique=True)
    default = db.Column(db.Boolean, default=False, index=True)
    permissions = db.Column(db.Integer, default=0)
    # Relationships
    users = db.relationship('User', back_populates='role', lazy='dynamic')

    def __repr__(self):
        return f'<Role {self.name}>'

    def has_permission(self, permission: Union[Permission, int, str]):
        p = Permission.get(permission)
        return self.permissions & p.value == p.value

    def add_permission(self, permission: Union[Permission, int, str]):
        p = Permission.get(permission)
        if not self.has_permission(p):
            self.permissions += p.value

    def remove_permission(self, permission: Union[Permission, int, str]):
        p = Permission.get(permission)
        if self.has_permission(p):
            self.permissions -= p.value

    def reset_permissions(self):
        self.permissions = 0

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'default': self.default,
            'name': self.name,
            'permissions': [
                x.name for x in Permission
                if self.has_permission(x.value)
            ]
        }
        if backrefs:
            pass
        if relationships:
            json_serializeable['users'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.users
            }
        return json_serializeable

    @staticmethod
    def insert_defaults():
        roles = {
            'User': [],
            'API user': [Permission.USE_API],
            'Contributor': [Permission.CONTRIBUTE],
            'Administrator': [
                Permission.ADMINISTRATE,
                Permission.CONTRIBUTE,
                Permission.USE_API
            ],
            'System user': []
        }
        default_role_name = 'User'
        for role_name, permissions in roles.items():
            role = Role.query.filter_by(name=role_name).first()
            if role is None:
                role = Role(name=role_name)
            role.reset_permissions()
            for permission in permissions:
                role.add_permission(permission)
            role.default = role.name == default_role_name
            db.session.add(role)
        db.session.commit()
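
A minimal usage sketch of the permission bitmask logic above; the role here is constructed in memory for illustration, not loaded from the database:

    # Sketch only: exercises Permission/Role as defined above.
    role = Role(name='Demo role')                # illustrative, not one of the default roles
    role.reset_permissions()
    role.add_permission(Permission.CONTRIBUTE)   # permissions == 2
    role.add_permission('USE_API')               # Permission.get() also accepts names; permissions == 6
    assert role.has_permission(Permission.CONTRIBUTE)
    assert role.has_permission(4)                # and raw ints (4 == USE_API)
    assert not role.has_permission(Permission.ADMINISTRATE)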
136
app/models/spacy_nlp_pipeline_model.py
Normal file
@ -0,0 +1,136 @@
from flask import current_app, url_for
from flask_hashids import HashidMixin
from tqdm import tqdm
from pathlib import Path
import requests
import yaml
from app import db
from app.ext.flask_sqlalchemy import ContainerColumn
from .file_mixin import FileMixin
from .user import User


class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
    __tablename__ = 'spacy_nlp_pipeline_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    title = db.Column(db.String(64))
    description = db.Column(db.String(255))
    version = db.Column(db.String(16))
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    pipeline_name = db.Column(db.String(64))
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    user = db.relationship('User', back_populates='spacy_nlp_pipeline_models')

    @property
    def path(self) -> Path:
        return self.user.path / 'spacy_nlp_pipeline_models' / f'{self.id}'

    @property
    def jsonpatch_path(self):
        return f'{self.user.jsonpatch_path}/spacy_nlp_pipeline_models/{self.hashid}'

    @property
    def url(self):
        return url_for(
            'contributions.spacy_nlp_pipeline_model',
            spacy_nlp_pipeline_model_id=self.id
        )

    @property
    def user_hashid(self):
        return self.user.hashid

    @staticmethod
    def insert_defaults(force_download=False):
        nopaque_user = User.query.filter_by(username='nopaque').first()
        default_records_file = Path(__file__).parent / 'default_records' / 'spacy_nlp_pipeline_model.yml'
        with default_records_file.open('r') as f:
            default_records = yaml.safe_load(f)
        for m in default_records:
            model = SpaCyNLPPipelineModel.query.filter_by(title=m['title'], version=m['version']).first()  # noqa
            if model is not None:
                model.compatible_service_versions = m['compatible_service_versions']
                model.description = m['description']
                model.filename = m['url'].split('/')[-1]
                model.publisher = m['publisher']
                model.publisher_url = m['publisher_url']
                model.publishing_url = m['publishing_url']
                model.publishing_year = m['publishing_year']
                model.is_public = True
                model.title = m['title']
                model.version = m['version']
                model.pipeline_name = m['pipeline_name']
            else:
                model = SpaCyNLPPipelineModel(
                    compatible_service_versions=m['compatible_service_versions'],
                    description=m['description'],
                    filename=m['url'].split('/')[-1],
                    publisher=m['publisher'],
                    publisher_url=m['publisher_url'],
                    publishing_url=m['publishing_url'],
                    publishing_year=m['publishing_year'],
                    is_public=True,
                    title=m['title'],
                    user=nopaque_user,
                    version=m['version'],
                    pipeline_name=m['pipeline_name']
                )
                db.session.add(model)
                db.session.flush(objects=[model])
                db.session.refresh(model)
            if not model.path.exists() or force_download:
                r = requests.get(m['url'], stream=True)
                pbar = tqdm(
                    desc=f'{model.title} ({model.filename})',
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    total=int(r.headers['Content-Length'])
                )
                pbar.clear()
                with open(model.path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive chunks
                            pbar.update(len(chunk))
                            f.write(chunk)
                pbar.close()
        db.session.commit()

    def delete(self):
        try:
            self.path.unlink(missing_ok=True)
        except OSError as e:
            current_app.logger.error(e)
            raise
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'pipeline_name': self.pipeline_name,
            'is_public': self.is_public,
            'title': self.title,
            'version': self.version,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable
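
The download loop in insert_defaults() above indexes r.headers['Content-Length'] directly, which raises a KeyError if the server omits the header. A hedged sketch of the same chunked-download pattern with a fallback (download_model_file is an illustrative name, not part of this model):

    # Sketch only: the streamed download from insert_defaults(), hardened
    # against a missing Content-Length header (tqdm then shows an
    # indeterminate progress bar instead of a percentage).
    import requests
    from tqdm import tqdm

    def download_model_file(url, target_path):
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total = int(r.headers.get('Content-Length', 0)) or None
            with tqdm(desc=str(target_path), unit='B', unit_scale=True,
                      unit_divisor=1024, total=total) as pbar:
                with open(target_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive chunks
                            pbar.update(len(chunk))
                            f.write(chunk)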
132
app/models/tesseract_ocr_pipeline_model.py
Normal file
@ -0,0 +1,132 @@
from flask import current_app, url_for
from flask_hashids import HashidMixin
from tqdm import tqdm
from pathlib import Path
import requests
import yaml
from app import db
from app.ext.flask_sqlalchemy import ContainerColumn
from .file_mixin import FileMixin
from .user import User


class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
    __tablename__ = 'tesseract_ocr_pipeline_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    title = db.Column(db.String(64))
    description = db.Column(db.String(255))
    version = db.Column(db.String(16))
    compatible_service_versions = db.Column(ContainerColumn(list, 255))
    publisher = db.Column(db.String(128))
    publisher_url = db.Column(db.String(512))
    publishing_url = db.Column(db.String(512))
    publishing_year = db.Column(db.Integer)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    user = db.relationship('User', back_populates='tesseract_ocr_pipeline_models')

    @property
    def path(self) -> Path:
        return self.user.path / 'tesseract_ocr_pipeline_models' / f'{self.id}'

    @property
    def jsonpatch_path(self):
        return f'{self.user.jsonpatch_path}/tesseract_ocr_pipeline_models/{self.hashid}'

    @property
    def url(self):
        return url_for(
            'contributions.tesseract_ocr_pipeline_model',
            tesseract_ocr_pipeline_model_id=self.id
        )

    @property
    def user_hashid(self):
        return self.user.hashid

    @staticmethod
    def insert_defaults(force_download=False):
        nopaque_user = User.query.filter_by(username='nopaque').first()
        default_records_file = Path(__file__).parent / 'default_records' / 'tesseract_ocr_pipeline_model.yml'
        with default_records_file.open('r') as f:
            default_records = yaml.safe_load(f)
        for m in default_records:
            model = TesseractOCRPipelineModel.query.filter_by(title=m['title'], version=m['version']).first()  # noqa
            if model is not None:
                model.compatible_service_versions = m['compatible_service_versions']
                model.description = m['description']
                model.filename = f'{model.id}.traineddata'
                model.publisher = m['publisher']
                model.publisher_url = m['publisher_url']
                model.publishing_url = m['publishing_url']
                model.publishing_year = m['publishing_year']
                model.is_public = True
                model.title = m['title']
                model.version = m['version']
            else:
                model = TesseractOCRPipelineModel(
                    compatible_service_versions=m['compatible_service_versions'],
                    description=m['description'],
                    publisher=m['publisher'],
                    publisher_url=m['publisher_url'],
                    publishing_url=m['publishing_url'],
                    publishing_year=m['publishing_year'],
                    is_public=True,
                    title=m['title'],
                    user=nopaque_user,
                    version=m['version']
                )
                db.session.add(model)
                db.session.flush(objects=[model])
                db.session.refresh(model)
                model.filename = f'{model.id}.traineddata'
            if not model.path.exists() or force_download:
                r = requests.get(m['url'], stream=True)
                pbar = tqdm(
                    desc=f'{model.title} ({model.filename})',
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    total=int(r.headers['Content-Length'])
                )
                pbar.clear()
                with open(model.path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive chunks
                            pbar.update(len(chunk))
                            f.write(chunk)
                pbar.close()
        db.session.commit()

    def delete(self):
        try:
            self.path.unlink(missing_ok=True)
        except OSError as e:
            current_app.logger.error(e)
            raise
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'is_public': self.is_public,
            'title': self.title,
            'version': self.version,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable
48
app/models/token.py
Normal file
@ -0,0 +1,48 @@
from datetime import datetime, timedelta
from app import db


class Token(db.Model):
    __tablename__ = 'tokens'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    access_token = db.Column(db.String(64), index=True)
    access_expiration = db.Column(db.DateTime)
    refresh_token = db.Column(db.String(64), index=True)
    refresh_expiration = db.Column(db.DateTime)
    # Relationships
    user = db.relationship('User', back_populates='tokens')

    def expire(self):
        self.access_expiration = datetime.utcnow()
        self.refresh_expiration = datetime.utcnow()

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'access_token': self.access_token,
            'access_expiration': (
                None if self.access_expiration is None
                else f'{self.access_expiration.isoformat()}Z'
            ),
            'refresh_token': self.refresh_token,
            'refresh_expiration': (
                None if self.refresh_expiration is None
                else f'{self.refresh_expiration.isoformat()}Z'
            )
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable

    @staticmethod
    def clean():
        """Remove any tokens that have been expired for more than a day."""
        yesterday = datetime.utcnow() - timedelta(days=1)
        Token.query.filter(Token.refresh_expiration < yesterday).delete()
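
Token.clean() only pays off if something runs it periodically. A sketch of one way to wire it up, assuming the app exposes a Flask-APScheduler instance named scheduler (the job id and schedule are illustrative):

    # Sketch only: purge long-expired tokens once a day via Flask-APScheduler.
    from app import db, scheduler  # assumes a Flask-APScheduler instance

    @scheduler.task('cron', id='clean_expired_tokens', hour=3)
    def clean_expired_tokens():
        with scheduler.app.app_context():
            Token.clean()
            db.session.commit()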
452
app/models/user.py
Normal file
@ -0,0 +1,452 @@
from datetime import datetime, timedelta
from enum import IntEnum
from flask import current_app, url_for
from flask_hashids import HashidMixin
from flask_login import UserMixin
from sqlalchemy.ext.associationproxy import association_proxy
from pathlib import Path
from typing import Union
from werkzeug.security import generate_password_hash, check_password_hash
import jwt
import re
import secrets
import shutil
from app import db, hashids
from app.ext.flask_sqlalchemy import IntEnumColumn
from .corpus import Corpus
from .corpus_follower_association import CorpusFollowerAssociation
from .corpus_follower_role import CorpusFollowerRole
from .role import Permission, Role
from .token import Token


class ProfilePrivacySettings(IntEnum):
    SHOW_EMAIL = 1
    SHOW_LAST_SEEN = 2
    SHOW_MEMBER_SINCE = 4

    @staticmethod
    def get(profile_privacy_setting: Union['ProfilePrivacySettings', int, str]) -> 'ProfilePrivacySettings':
        if isinstance(profile_privacy_setting, ProfilePrivacySettings):
            return profile_privacy_setting
        if isinstance(profile_privacy_setting, int):
            return ProfilePrivacySettings(profile_privacy_setting)
        if isinstance(profile_privacy_setting, str):
            return ProfilePrivacySettings[profile_privacy_setting]
        raise TypeError('profile_privacy_setting must be ProfilePrivacySettings, int, or str')


class UserSettingJobStatusMailNotificationLevel(IntEnum):
    NONE = 1
    END = 2
    ALL = 3


class User(HashidMixin, UserMixin, db.Model):
    __tablename__ = 'users'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    role_id = db.Column(db.Integer, db.ForeignKey('roles.id'))
    # Fields
    email = db.Column(db.String(254), index=True, unique=True)
    username = db.Column(db.String(64), index=True, unique=True)
    username_pattern = re.compile(r'^[A-Za-zÄÖÜäöüß0-9_.]*$')
    password_hash = db.Column(db.String(128))
    confirmed = db.Column(db.Boolean, default=False)
    terms_of_use_accepted = db.Column(db.Boolean, default=False)
    member_since = db.Column(db.DateTime(), default=datetime.utcnow)
    setting_job_status_mail_notification_level = db.Column(
        IntEnumColumn(UserSettingJobStatusMailNotificationLevel),
        default=UserSettingJobStatusMailNotificationLevel.END
    )
    last_seen = db.Column(db.DateTime())
    full_name = db.Column(db.String(64))
    about_me = db.Column(db.String(256))
    location = db.Column(db.String(64))
    website = db.Column(db.String(128))
    organization = db.Column(db.String(128))
    is_public = db.Column(db.Boolean, default=False)
    profile_privacy_settings = db.Column(db.Integer(), default=0)
    # Relationships
    avatar = db.relationship(
        'Avatar',
        back_populates='user',
        cascade='all, delete-orphan',
        uselist=False
    )
    corpora = db.relationship(
        'Corpus',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='follower',
        cascade='all, delete-orphan'
    )
    followed_corpora = association_proxy(
        'corpus_follower_associations',
        'corpus',
        creator=lambda c: CorpusFollowerAssociation(corpus=c)
    )
    jobs = db.relationship(
        'Job',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    role = db.relationship(
        'Role',
        back_populates='users'
    )
    spacy_nlp_pipeline_models = db.relationship(
        'SpaCyNLPPipelineModel',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    tesseract_ocr_pipeline_models = db.relationship(
        'TesseractOCRPipelineModel',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    tokens = db.relationship(
        'Token',
        back_populates='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )

    def __init__(self, **kwargs):
        if 'role' not in kwargs:
            kwargs['role'] = (
                Role.query.filter_by(name='Administrator').first()
                if kwargs['email'] == current_app.config['NOPAQUE_ADMIN']
                else Role.query.filter_by(default=True).first()
            )
        super().__init__(**kwargs)

    def __repr__(self):
        return f'<User {self.username}>'

    @property
    def jsonpatch_path(self):
        return f'/users/{self.hashid}'

    @property
    def password(self):
        raise AttributeError('password is not a readable attribute')

    @password.setter
    def password(self, password):
        self.password_hash = generate_password_hash(password)

    @property
    def path(self) -> Path:
        return current_app.config.get('NOPAQUE_DATA_DIR') / 'users' / f'{self.id}'

    @staticmethod
    def create(**kwargs):
        user = User(**kwargs)
        db.session.add(user)
        db.session.flush(objects=[user])
        db.session.refresh(user)
        user_spacy_nlp_pipeline_models_dir = user.path / 'spacy_nlp_pipeline_models'
        user_tesseract_ocr_pipeline_models_dir = user.path / 'tesseract_ocr_pipeline_models'
        user_corpora_dir = user.path / 'corpora'
        user_jobs_dir = user.path / 'jobs'
        try:
            user.path.mkdir()
            user_spacy_nlp_pipeline_models_dir.mkdir()
            user_tesseract_ocr_pipeline_models_dir.mkdir()
            user_corpora_dir.mkdir()
            user_jobs_dir.mkdir()
        except OSError as e:
            # TODO: Potential leftover cleanup
            current_app.logger.error(e)
            db.session.rollback()
            raise
        return user

    @staticmethod
    def insert_defaults():
        nopaque_user = User.query.filter_by(username='nopaque').first()
        system_user_role = Role.query.filter_by(name='System user').first()
        if nopaque_user is None:
            nopaque_user = User.create(
                username='nopaque',
                role=system_user_role
            )
            db.session.add(nopaque_user)
        elif nopaque_user.role != system_user_role:
            nopaque_user.role = system_user_role
        db.session.commit()

    @staticmethod
    def reset_password(token, new_password):
        try:
            payload = jwt.decode(
                token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'User.reset_password':
            return False
        user_hashid = payload.get('sub')
        user_id = hashids.decode(user_hashid)
        user = User.query.get(user_id)
        if user is None:
            return False
        user.password = new_password
        db.session.add(user)
        return True

    @staticmethod
    def verify_access_token(access_token, refresh_token=None):
        token = Token.query.filter(Token.access_token == access_token).first()
        if token is not None:
            if token.access_expiration > datetime.utcnow():
                token.user.ping()
                db.session.commit()
                if token.user.role.name != 'System user':
                    return token.user

    @staticmethod
    def verify_refresh_token(refresh_token, access_token):
        token = Token.query.filter(
            (Token.refresh_token == refresh_token)
            & (Token.access_token == access_token)
        ).first()
        if token is not None:
            if token.refresh_expiration > datetime.utcnow():
                return token
            # someone tried to refresh with an expired token
            # revoke all tokens from this user as a precaution
            token.user.revoke_auth_tokens()
            db.session.commit()

    def can(self, permission):
        return self.role is not None and self.role.has_permission(permission)

    def confirm(self, confirmation_token):
        try:
            payload = jwt.decode(
                confirmation_token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'user.confirm':
            return False
        if payload.get('sub') != self.hashid:
            return False
        self.confirmed = True
        db.session.add(self)
        return True

    def delete(self):
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)

    def generate_auth_token(self):
        return Token(
            access_token=secrets.token_urlsafe(),
            access_expiration=datetime.utcnow() + timedelta(minutes=15),
            refresh_token=secrets.token_urlsafe(),
            refresh_expiration=datetime.utcnow() + timedelta(days=7),
            user=self
        )

    def generate_confirm_token(self, expiration=3600):
        now = datetime.utcnow()
        payload = {
            'exp': now + timedelta(seconds=expiration),
            'iat': now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'user.confirm',
            'sub': self.hashid
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def generate_reset_password_token(self, expiration=3600):
        now = datetime.utcnow()
        payload = {
            'exp': now + timedelta(seconds=expiration),
            'iat': now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'User.reset_password',
            'sub': self.hashid
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def is_administrator(self):
        return self.can(Permission.ADMINISTRATE)

    def ping(self):
        self.last_seen = datetime.utcnow()

    def revoke_auth_tokens(self):
        for token in self.tokens:
            db.session.delete(token)

    def verify_password(self, password):
        if self.role.name == 'System user':
            return False
        return check_password_hash(self.password_hash, password)

    #region Profile Privacy settings
    def has_profile_privacy_setting(self, setting):
        s = ProfilePrivacySettings.get(setting)
        return self.profile_privacy_settings & s.value == s.value

    def add_profile_privacy_setting(self, setting):
        s = ProfilePrivacySettings.get(setting)
        if not self.has_profile_privacy_setting(s):
            self.profile_privacy_settings += s.value

    def remove_profile_privacy_setting(self, setting):
        s = ProfilePrivacySettings.get(setting)
        if self.has_profile_privacy_setting(s):
            self.profile_privacy_settings -= s.value

    def reset_profile_privacy_settings(self):
        self.profile_privacy_settings = 0
    #endregion Profile Privacy settings

    def follow_corpus(self, corpus, role=None):
        if role is None:
            cfr = CorpusFollowerRole.query.filter_by(default=True).first()
        else:
            cfr = role
        if self.is_following_corpus(corpus):
            cfa = CorpusFollowerAssociation.query.filter_by(corpus=corpus, follower=self).first()
            if cfa.role != cfr:
                cfa.role = cfr
        else:
            cfa = CorpusFollowerAssociation(corpus=corpus, role=cfr, follower=self)
            db.session.add(cfa)

    def unfollow_corpus(self, corpus):
        if not self.is_following_corpus(corpus):
            return
        self.followed_corpora.remove(corpus)

    def is_following_corpus(self, corpus):
        return corpus in self.followed_corpora

    def generate_follow_corpus_token(self, corpus_hashid, role_name, expiration=7):
        now = datetime.utcnow()
        payload = {
            'exp': now + timedelta(days=expiration),
            'iat': now,
            'iss': current_app.config['SERVER_NAME'],
            'purpose': 'User.follow_corpus',
            'role_name': role_name,
            'sub': corpus_hashid
        }
        return jwt.encode(
            payload,
            current_app.config['SECRET_KEY'],
            algorithm='HS256'
        )

    def follow_corpus_by_token(self, token):
        try:
            payload = jwt.decode(
                token,
                current_app.config['SECRET_KEY'],
                algorithms=['HS256'],
                issuer=current_app.config['SERVER_NAME'],
                options={'require': ['exp', 'iat', 'iss', 'purpose', 'role_name', 'sub']}
            )
        except jwt.PyJWTError:
            return False
        if payload.get('purpose') != 'User.follow_corpus':
            return False
        corpus_hashid = payload.get('sub')
        corpus_id = hashids.decode(corpus_hashid)
        corpus = Corpus.query.get(corpus_id)
        if corpus is None:
            return False
        role_name = payload.get('role_name')
        role = CorpusFollowerRole.query.filter_by(name=role_name).first()
        if role is None:
            return False
        self.follow_corpus(corpus, role)
        # db.session.add(self)
        return True

    def to_json_serializeable(self, backrefs=False, relationships=False, filter_by_privacy_settings=False):
        json_serializeable = {
            'id': self.hashid,
            'confirmed': self.confirmed,
            'avatar': url_for('users.user_avatar', user_id=self.id),
            'email': self.email,
            'last_seen': (
                None if self.last_seen is None
                else f'{self.last_seen.isoformat()}Z'
            ),
            'member_since': f'{self.member_since.isoformat()}Z',
            'username': self.username,
            'full_name': self.full_name,
            'about_me': self.about_me,
            'website': self.website,
            'location': self.location,
            'organization': self.organization,
            'job_status_mail_notification_level': \
                self.setting_job_status_mail_notification_level.name,
            'profile_privacy_settings': {
                'is_public': self.is_public,
                'show_email': self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_EMAIL),
                'show_last_seen': self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_LAST_SEEN),
                'show_member_since': self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_MEMBER_SINCE)
            }
        }
        if backrefs:
            json_serializeable['role'] = \
                self.role.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['corpus_follower_associations'] = {
                x.hashid: x.to_json_serializeable()
                for x in self.corpus_follower_associations
            }
            json_serializeable['corpora'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.corpora
            }
            json_serializeable['jobs'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.jobs
            }
            json_serializeable['tesseract_ocr_pipeline_models'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.tesseract_ocr_pipeline_models
            }
            json_serializeable['spacy_nlp_pipeline_models'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.spacy_nlp_pipeline_models
            }
        if filter_by_privacy_settings:
            if not self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_EMAIL):
                json_serializeable.pop('email')
            if not self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_LAST_SEEN):
                json_serializeable.pop('last_seen')
            if not self.has_profile_privacy_setting(ProfilePrivacySettings.SHOW_MEMBER_SINCE):
                json_serializeable.pop('member_since')
        return json_serializeable
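
To see how the token helpers above fit together, a round-trip sketch of the confirmation flow (run inside an application context; 'alice' is an illustrative username):

    # Sketch only: issue and redeem an account-confirmation JWT.
    user = User.query.filter_by(username='alice').first()
    token = user.generate_confirm_token(expiration=3600)  # signed HS256, purpose 'user.confirm'
    assert user.confirm(token) is True                    # flips user.confirmed and stages the change
    db.session.commit()
    # The 'sub' claim binds the token to this user, so another_user.confirm(token)
    # would return False.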
@ -1,12 +1,11 @@
from flask import Blueprint
from flask_login import login_required
import os
from pathlib import Path
import yaml


services_file = \
    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'services.yml')
with open(services_file, 'r') as f:
services_file = Path(__file__).parent / 'services.yml'
with services_file.open('r') as f:
    SERVICES = yaml.safe_load(f)

bp = Blueprint('services', __name__)
@ -6,6 +6,7 @ from app import db, hashids
from app.models import (
    Job,
    JobInput,
    JobResult,
    JobStatus,
    TesseractOCRPipelineModel,
    SpaCyNLPPipelineModel
@ -74,6 +75,8 @ def tesseract_ocr_pipeline():
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    job_results = JobResult.query.all()
    choosable_job_ids = [
        job_result.job.hashid for job_result in job_results
        if job_result.job.service == 'file-setup-pipeline'
        and job_result.filename.endswith('.pdf')
    ]
    form = CreateTesseractOCRPipelineJobForm(prefix='create-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
@ -111,6 +114,7 @ def tesseract_ocr_pipeline():
    return render_template(
        'services/tesseract_ocr_pipeline.html.j2',
        title=service_manifest['name'],
        choosable_job_ids=choosable_job_ids,
        form=form,
        tesseract_ocr_pipeline_models=tesseract_ocr_pipeline_models,
        user_tesseract_ocr_pipeline_models_count=user_tesseract_ocr_pipeline_models_count
@ -59,3 +59,8 @ spacy-nlp-pipeline:
        - 'encoding_detection'
      publishing_year: 2022
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.1'
    0.1.2:
      methods:
        - 'encoding_detection'
      publishing_year: 2024
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.2'
Binary file not shown. (Image size before: 34 KiB, after: 30 KiB)
137
app/static/js/resource-lists/job-output-list.js
Normal file
@ -0,0 +1,137 @@
nopaque.resource_lists.JobOutputList = class JobOutputList extends nopaque.resource_lists.ResourceList {
  static htmlClass = 'job-output-list';

  constructor(listContainerElement, options = {}) {
    super(listContainerElement, options);
    this.listjs.list.addEventListener('click', (event) => {this.onClick(event)});
    this.isInitialized = false;
    this.userId = listContainerElement.dataset.userId;
    this.jobOutput = listContainerElement.dataset.jobOutput;
    this.jobIds = listContainerElement.dataset.jobIds;
    if (this.userId === undefined) {return;}
    app.subscribeUser(this.userId).then((response) => {
      app.socket.on('PATCH', (patch) => {
        if (this.isInitialized) {this.onPatch(patch);}
      });
    });
    app.getUser(this.userId).then((user) => {
      let jobIds = JSON.parse(this.jobIds.replace(/'/g, '"'));
      let job_results = {};
      for (let jobId of jobIds) {
        for (let jobResult of Object.values(user.jobs[jobId].results)) {
          if (jobResult.mimetype === 'application/pdf') {
            job_results[jobResult.id] = jobResult;
            job_results[jobResult.id].description = user.jobs[jobId].description;
            job_results[jobResult.id].title = user.jobs[jobId].title;
          }
        }
      }
      this.add(Object.values(job_results));
      this.isInitialized = true;
    });
  }

  get item() {
    return `
      <tr class="list-item clickable hoverable">
        <td><span class="title"></span></td>
        <td><span class="description"></span></td>
        <td><span class="filename"></span></td>
        <td class="right-align">
          <a class="list-action-trigger btn-flat waves-effect waves-light" data-list-action="add"><i class="material-icons">add</i></a>
        </td>
      </tr>
    `.trim();
  }

  get valueNames() {
    return [
      {data: ['id']},
      {data: ['creation-date']},
      'title',
      'description',
      'filename'
    ];
  }

  initListContainerElement() {
    if (!this.listContainerElement.hasAttribute('id')) {
      this.listContainerElement.id = nopaque.Utils.generateElementId('job-output-list-');
    }
    let listSearchElementId = nopaque.Utils.generateElementId(`${this.listContainerElement.id}-search-`);
    this.listContainerElement.innerHTML = `
      <div class="input-field">
        <i class="material-icons prefix">search</i>
        <input id="${listSearchElementId}" class="search" type="text">
        <label for="${listSearchElementId}">Search job output</label>
      </div>
      <table>
        <thead>
          <tr>
            <th>Title</th>
            <th>Description</th>
            <th>Filename</th>
            <th></th>
          </tr>
        </thead>
        <tbody class="list"></tbody>
      </table>
      <ul class="pagination"></ul>
    `;
  }

  mapResourceToValue(jobOutput) {
    console.log(jobOutput);
    return {
      'id': jobOutput.id,
      'creation-date': jobOutput.creationDate,
      'title': jobOutput.title,
      'description': jobOutput.description,
      'filename': jobOutput.filename
    };
  }

  sort() {
    this.listjs.sort('title', {order: 'asc'});
  }

  onClick(event) {
    let listItemElement = event.target.closest('.list-item[data-id]');
    if (listItemElement === null) {return;}
    let itemId = listItemElement.dataset.id;
    let listActionElement = event.target.closest('.list-action-trigger[data-list-action]');
    let listAction = listActionElement === null ? 'add' : listActionElement.dataset.listAction;
    switch (listAction) {
      case 'add': {
        listActionElement.querySelector('i').textContent = 'done';
        listActionElement.dataset.listAction = 'remove';
        break;
      }
      case 'remove': {
        listActionElement.querySelector('i').textContent = 'add';
        listActionElement.dataset.listAction = 'add';
        break;
      }
      default: {
        break;
      }
    }
  }

  // onPatch(patch) {
  //   let re = new RegExp(`^/users/${this.userId}/jobs/${this.jobId}/results/([A-Za-z0-9]*)`);
  //   let filteredPatch = patch.filter(operation => re.test(operation.path));
  //   for (let operation of filteredPatch) {
  //     switch(operation.op) {
  //       case 'add': {
  //         let re = new RegExp(`^/users/${this.userId}/jobs/${this.jobId}/results/([A-Za-z0-9]*)$`);
  //         if (re.test(operation.path)) {this.add(operation.value);}
  //         break;
  //       }
  //       default: {
  //         break;
  //       }
  //     }
  //   }
  // }
};
@ -12,7 +12,7 @@
<li>
  <a class="dropdown-trigger no-autoinit" data-target="nav-more-dropdown" href="#!" id="nav-more-dropdown-trigger">
    {% if current_user.is_authenticated %}
    <img src="{{ url_for('users.user_avatar', user_id=current_user.id) }}" alt="avatar" class="circle left" style="height: 54px; padding: 10px 10px 0 0;">
    <img src="{{ url_for('users.user_avatar', user_id=current_user.id) }}" alt="avatar" class="circle left" style="height: 54px; padding: 8px;">
    {{ current_user.username }} ({{ current_user.email }})
    {% else %}
    <i class="material-icons left">more_vert</i>
@ -52,6 +52,7 @@
'js/resource-lists/job-input-list.js',
'js/resource-lists/job-list.js',
'js/resource-lists/job-result-list.js',
'js/resource-lists/job-output-list.js',
'js/resource-lists/public-corpus-list.js',
'js/resource-lists/public-user-list.js',
'js/resource-lists/spacy-nlp-pipeline-model-list.js',
@ -42,7 +42,7 @@
{{ form.hidden_tag() }}
<div class="row">
  <div class="col s12 l5">
    {{ wtf.render_field(form.spacy_model_file, accept='.tar.gz', placeholder='Choose a .tar.gz file') }}
    {{ wtf.render_field(form.spacy_model_file, accept='.tar.gz,.whl', placeholder='Choose a .tar.gz or .whl file') }}
  </div>
  <div class="col s12 l7">
    {{ wtf.render_field(form.title, material_icon='title') }}
@ -6,119 +6,179 @@
<div class="col s12">
  <h1 id="title">{{ title }}</h1>
</div>

<div class="col s12">
  <div class="card" id="news-post-january-2024">
    <div class="card-content">
      <h6 style="font-weight: 300;">January 2024</h6>
      <span class="card-title">Looking back on 2023 - new changes to nopaque</span>
      <br>
      <p>Hello nopaque users!</p>
      <p>First of all, the nopaque team would like to wish everyone a good start to 2024! We hope you found the time to relax over the winter break.</p>
      <p>Now that the new year has come around and we’re all back in the office, we wanted to take the opportunity to tell you about the most important things we’ve worked on in nopaque in 2023 – things we’ve incorporated into our <b>latest nopaque update</b> as of late <b>December 2023</b>. You may have noticed some of them as you’ve returned to your projects on nopaque.</p>
      <br>
      <h6 style="font-weight: 300;">Changes to the Query Builder</h6>
      <p>
        The Query Builder has undergone changes to make it more intuitive to use and is now the standard option for creating queries.
        Individual elements of a query can now be easily modified and edited by clicking on them.
        An input marker shows your position in the query and where new elements will be added. This and all other elements can be moved around via drag and drop.
        A new toggle button enables users to easily switch between the Query Builder and Expert Mode if they prefer to work with the plain Corpus Query Language (CQL) instead. This can be done in the middle of an existing query – existing chips will be “translated” into CQL.
        This also works the other way around – if you want to switch back, your query in CQL will be parsed into chips.
        More details and instructions on how to use the new Query Builder can be found in the manual.
      </p>
      <br>
      <h6 style="font-weight: 300;">Community Update</h6>
      <p>
        The most extensive changes to nopaque have taken place in the Social Area. We want nopaque to be a platform where researchers can connect with each other, so we’ve added some more features to make this possible.
        Users can now update their personal profiles to be publicly visible to others on nopaque, including a short “About me” section and options to share your website, organization, location, and add an avatar that others can see.
        It is also possible to share corpora with other researchers via share links, access invitations, or by setting corpus visibility to Public. Other users can only see the metadata of public corpora – further access can be granted upon request.
        The extent of access to these shared corpora is managed by assigning the roles of Viewer, Contributor, and Administrator. Viewers may only download the files. Contributors can download and edit files and their metadata as well as analyze and build the corpus. Administrators can manage users, followers and visibility, in addition to all of the above.
      </p>
      <br>
    </div>
  </div>
</div>

<div class="col s12">
  <div class="card" id="news-post-july-2023">
    <div class="card-content">
      <h6 style="font-weight: 300;">July 2023</h6>
      <span class="card-title">Visualization Update (beta) - new analysis features</span>
      <br>
      <p>Hey users,</p>
      <p>
        we wanted to give you some news on updates we’re making to nopaque.
        Since we want to make it easier for users to grasp and work with different elements of their data,
        we’ve been working on adding some visualization features into the Corpus Analysis service. Currently, the two main modules,
        “Reader” and “Concordance”, have been expanded with an additional “Static Visualizations” module, but there’s more to come!
      </p>
      <p>
        With the Static Visualizations module, it’s now possible to view information
        about your corpus, such as the number of (unique) tokens, sentences, lemmata,
        corresponding information on individual texts, the distribution of these elements
        within your corpus, as well as searchable lists of word frequencies with stopwords
        that can be preset and modified. In the future, this area will be extended with more advanced visualization options.
      </p>
      <p>
        We’ll keep you posted about further visualization updates. Until then, we hope the latest update improves
        your research experience with nopaque. And as always, if you have any ideas for nopaque or need assistance,
        don’t hesitate to contact us!
      </p>
      <br>
    </div>
  </div>
</div>

<div class="col s12">
  <div class="card" id="news-post-november-2022">
    <div class="card-content">
      <h6 style="font-weight: 300;">November 2022</h6>
      <span class="card-title">Contribution Update</span>
      <br>
      <p>Dear users,</p>
      <p>
        users can now upload their own language models into nopaque. This is useful for working with different languages that are not available as standard in nopaque or if a user wants to work with a language model they have developed themselves. Tesseract models can be uploaded in .traineddata format; spaCy models can be uploaded in .tar.gz format. We are also working on the option to upload models in .whl format in the future.
        Uploaded models can be found in the model list of the corresponding service and can be used immediately. Models can also be made public if you have the role of Contributor in nopaque.
      </p>
      <br>
      <p><b>Please note:</b> The Contributor role must be requested from the nopaque admins if you would like to make a model public for all users.</p>
      <br>
    </div>
  </div>
</div>

<div class="col s12">
  <div class="card" id="april-2022-update">
  <div class="card" id="news-post-april-2022">
    <div class="card-content">
      <span class="card-title">April 2022 update</span>
      <p>Dear users</p>
      <h6 style="font-weight: 300;">April 2022</h6>
      <span class="card-title">April updates – more features, faster operation</span>
      <br>
      <p>Hello everyone,</p>
      <p>
        with the April 2022 update we have improved nopaque in all places.
        We have significantly reworked our backend code to utilize our servers more efficiently,
        integrated a new service, updated all previously existing ones, rewrote a lot of code and made a few minor design improvements.
        in April 2022, we released an update improving many elements of nopaque. We rewrote a lot of our code,
        including a significant reworking of our backend code for more efficient use of our servers.
        We integrated a new service, updated the existing ones, and made some minor design improvements.
      </p>
      <br>

      <span class="card-title">Where is my Job data?</span>
      <h6 style="font-weight: 300;">Database Cleanup</h6>
      <p>
        At the beginning of the year, we realized that our storage limit had been reached.
        This was the time when some users may have noticed system instabilities.
        We were fortunately able to temporarily solve this problem without data loss
        by deleting some non-nopaque related data on our system (yes, we also do <a href="https://digital-history.uni-bielefeld.de">other things than nopaque</a>).
        In order not to face the same problem again, we had to dedicate ourselves to a long-term solution.
        This consists of deleting all previous job data with this update and henceforth storing new job data
        only for three months after job creation (important note: <b>corpora are not affected</b>).
        All job data prior to this update has been backed up for you;
        feel free to contact us at nopaque@uni-bielefeld.de if you would like to get this data back.
        We may be a bit late with our spring cleaning, but we’ve tidied up our
        database system and deleted old, empty corpora, unconfirmed user accounts and
        unnecessary data fields.
      </p>
      <br>

      <span class="card-title">What's new?</span>
      <h6 style="font-weight: 300;">What's new?</h6>
      <p>
        By partnering up with <a href="https://readcoop.eu/transkribus/?sc=Transkribus">Transkribus</a> we reached one of our long-term goals: integrating an HTR service into nopaque.
        The <a href="{{ url_for('services.transkribus_htr_pipeline') }}">Transkribus HTR Pipeline</a> service is implemented as a kind of proxied service where the work is split between Transkribus and us.
        By partnering with Transkribus, we’ve reached one of our long-term goals: to integrate a
        Handwritten Text Recognition (HTR) service into nopaque. The Transkribus HTR Pipeline service is implemented as a
        kind of proxied service where the work is split between us and Transkribus.
        That means we do the preprocessing, storage and postprocessing, while Transkribus handles the HTR itself.
      </p>
      <br>

      <p>
        One of the changes in the background was to fix our performance issues. While implementing the <a href="{{ url_for('services.transkribus_htr_pipeline') }}">Transkribus HTR Pipeline</a> service we
        found some optimization potential within different steps of our processing routine. These optimizations are now also
        available in our <a href="{{ url_for('services.transkribus_htr_pipeline') }}">Tesseract OCR Pipeline</a> service, resulting in a speed-up of about 4x.
        For now we are done with the most obvious optimizations, but we may include more in the near future, so stay tuned!
        One change we needed to make in the background was to fix our performance issues.
        While implementing the Transkribus HTR Pipeline service, we saw optimization potential
        in different steps of our processing routine. These optimizations are now also available
        in our Tesseract OCR Pipeline service and result in speeds that are about four times faster
        than before. We’re now finished with the major optimizations, but there could be more soon,
        so stay tuned!
      </p>
      <p>
        Next, we reorganized our Corpus Analysis code. It was a bit messy, but after a complete rewrite,
        we are now able to query a corpus without long loading times and with better error handling,
        making the user experience much more stable. The Corpus Analysis service is now modularized and comes with two modules
        that recreate and extend the functionality of the old service.
      </p>
      <p>
        The Query Result viewer had to be temporarily disabled, as the code was based on the old Corpus Analysis service.
        It will be reintegrated as a module to the Corpus Analysis.
      </p>
      <p>
        The spaCy NLP Pipeline service was also taken care of with some smaller updates. This is important preliminary work
        for support of more models/languages missing the full set of linguistic features (lemma, ner, pos, simple_pos).
        It still needs some testing and adjustments but will be ready soon!
      </p>
      <p>
        Last, but not least, we made some design changes. Now, you can find color in places that were previously in black and white.
        Nothing big, but the new colors can aid in identifying resources more efficiently.
      </p>
      <h6 style="font-weight: 300;">Where is my job data?</h6>
      <p>
        We reached our storage limit at the beginning of the year.
        At this time, some users may have noticed system instability.
        Fortunately, we found a solution that avoided data loss by deleting some
        non-nopaque related data in our system (yes, <a href="https://www.uni-bielefeld.de/fakultaeten/geschichtswissenschaft/abteilung/arbeitsbereiche/digital-history/">we also do things other than nopaque</a>).
        To avoid facing the same problem again, we had to find a long-term solution.
        In the end, this involved the deletion of all previous job data with this update and,
        going forward, only keeping new job data for three months after job creation
        (<b>important note:</b> corpora are not affected). All job data created prior to this
        update has been backed up for you. Feel free to contact us at <a href="mailto:nopaque@uni-bielefeld.de">nopaque@uni-bielefeld.de</a>
        if you would like to get this data back.
      </p>
      <br>

      <p>
        The next step was to reorganize our <a href="{{ url_for('services.corpus_analysis') }}">Corpus Analysis</a> code. Unfortunately it was a bit messy; after a complete rewrite we are
        now able to query a corpus without long loading times and with better error handling, resulting in a far more stable user experience.
        The Corpus Analysis service is now modularized and comes with 2 modules that recreate and extend the functionality of the old service.<br>
        For now we had to disable the Query Result viewer; its code was based on the old Corpus Analysis service and will be reintegrated as a module to the Corpus Analysis.
      </p>
      <br>

      <p>
        The <a href="{{ url_for('services.spacy_nlp_pipeline') }}">spaCy NLP Pipeline</a> service got some love in the form of smaller updates too.
        This is important preliminary work to support more models/languages that do not provide the full set of linguistic features (lemma, ner, pos, simple_pos). It still needs some testing and tweaking but will be ready soon!
      </p>
      <br>

      <p>
        Last but not least, we made some design changes. Now you can find colors in places where we had just black and white before.
        Nothing big, but the new colors will help you identify resources more efficiently!
      </p>
      <br>

      <span class="card-title">Database cleanup</span>
      <p>
        We may be a bit late with our spring cleaning, but with this update we tidied up our database system.
        This means we deleted old corpora with no corpus files, unconfirmed user accounts and, in general, unnecessary data fields.
      </p>
      <br>

      <p>
        That's it, thank you for using nopaque! We hope you like the update and appreciate all your past and future feedback.
      </p>
    </div>
  </div>
</div>

<div class="col s12">
  <div class="card" id="maintenance">
    <div class="card-content">
      <span class="card-title">Maintenance</span>
      <p>Dear users</p>
      <br>
      <p>Currently we are rewriting big parts of our project infrastructure. Due to this, the following features are not available:</p>
      <ul>
        <li>Corpus export and import</li>
        <li>Query result export, import and view</li>
      </ul>
      <p>We hope to add these features back in the near future; until then, check out our updated corpus analysis.</p>
    </div>
  </div>
</div>

<div class="col s12">
  <div class="card" id="nlp-removed-language-support">
    <div class="card-content">
      <span class="card-title">Natural Language Processing removed language support</span>
      <p>Dear users</p>
      <br>
      <p>Not all language models support all features we utilize in our NLP service. That's why we had to drop them; as soon as they meet our requirements, we will add them back!</p>
    </div>
  </div>
</div>

<div class="col s12">
  <div class="card" id="beta-launch">
  <div class="card" id="news-post-september-2021">
    <div class="card-content">
      <h6 style="font-weight: 300;">September 2021</h6>
      <span class="card-title">nopaque's beta launch</span>
      <p>Dear users</p>
      <br>
      <p>A few days ago we went live with nopaque. Right now nopaque is still in its beta phase, so some bugs are to be expected. If you encounter any bugs or some feature is not working as expected, please send us an email using the feedback button at the bottom of the page in the footer!</p>
      <p>We are happy to help you with any issues and will use the feedback to fix all mentioned bugs!</p>
      <p>Hello to all our users!</p>
      <p>The BETA version of our web platform, nopaque, is now available! nopaque is a web application that offers different services and tools to support researchers working with image and text-based data. These services include:</p>
      <ul>
        <li>File Setup, which converts and merges different data (e.g., books, letters) for further processing</li>
        <li>Optical Character Recognition, which converts photos and scans into text data for machine readability</li>
        <li>Natural Language Processing, which extracts information from your text via computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition)</li>
        <li>Corpus analysis, which makes use of the CQP Query Language to search through text corpora with the aid of metadata and Natural Language Processing tags.</li>
      </ul>
      <p>
        nopaque was created based on our experiences working with other subprojects and a prototype user study in the
        first phase of funding. The platform is open source under the terms of the MIT license (<a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque</a>).
        Language support and functions are currently limited – extensions can be requested by sending an email to <a href="mailto:nopaque@uni-bielefeld.de">nopaque@uni-bielefeld.de</a>.
        Because we are still in the beta phase, some bugs are to be expected. If you encounter any problems, please let us know!
        We are thankful for all feedback we receive.
      </p>
    </div>
  </div>
</div>
@ -37,6 +37,15 @@

<div class="col s12">
  <h2>Submit a job</h2>
  <div class="card">
    <div class="card-content">
      <p>Add an existing file from your workflow or add a new one below.</p>
      <div class="job-output-list" data-user-id="{{ current_user.hashid }}" data-job-ids="{{ choosable_job_ids }}"></div>
    </div>
    <div class="card-action right-align">
      <a class="waves-effect waves-light btn"><i class="material-icons right">send</i>Submit</a>
    </div>
  </div>
  <div class="card">
    <form class="create-job-form" enctype="multipart/form-data" method="POST">
      <div class="card-content">
@ -51,6 +60,8 @@
        <div class="col s12 l5">
          {{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }}
        </div>
        </div>
        <div class="row">
          <div class="col s12 l4">
            <div class="input-field">
              <i class="material-icons prefix">language</i>
@ -1,6 +1,4 @@
from app.models import User
import os
import shutil
from app import db
from . import bp
@ -7,7 +7,6 @ from flask import (
)
from flask_breadcrumbs import register_breadcrumb
from flask_login import current_user
import os
from app.models import User
from . import bp
from .utils import user_dynamic_list_constructor as user_dlc
@ -40,8 +39,8 @ def user_avatar(user_id):
    if user.avatar is None:
        return redirect(url_for('static', filename='images/user_avatar.png'))
    return send_from_directory(
        os.path.dirname(user.avatar.path),
        os.path.basename(user.avatar.path),
        user.avatar.path.parent,
        user.avatar.path.name,
        as_attachment=True,
        attachment_filename=user.avatar.filename,
        mimetype=user.avatar.mimetype
@ -1,6 +1,7 @@
from dotenv import load_dotenv
from flask import Flask
from logging.handlers import RotatingFileHandler
from pathlib import Path
from werkzeug.middleware.proxy_fix import ProxyFix
import logging
import os
@ -57,8 +58,7 @ class Config:

''' # nopaque # '''
NOPAQUE_ADMIN = os.environ.get('NOPAQUE_ADMIN')
NOPAQUE_DATA_DIR = \
    os.path.abspath(os.environ.get('NOPAQUE_DATA_PATH', '/mnt/nopaque'))
NOPAQUE_DATA_DIR = Path(os.environ.get('NOPAQUE_DATA_PATH', '/mnt/nopaque'))
NOPAQUE_IS_PRIMARY_INSTANCE = \
    os.environ.get('NOPAQUE_IS_PRIMARY_INSTANCE', 'true').lower() == 'true'
NOPAQUE_MAIL_SUBJECT_PREFIX = '[nopaque]'
@ -115,7 +115,7 @ class Config:
NOPAQUE_READCOOP_USERNAME = os.environ.get('NOPAQUE_READCOOP_USERNAME')
NOPAQUE_READCOOP_PASSWORD = os.environ.get('NOPAQUE_READCOOP_PASSWORD')

NOPAQUE_VERSION = '1.0.0'
NOPAQUE_VERSION = '1.0.2'

@staticmethod
def init_app(app: Flask):
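
The switch of NOPAQUE_DATA_DIR from a string to pathlib.Path matters because the models above build paths with the / operator (e.g. User.path). A one-line sketch of the difference (paths are illustrative):

    # Sketch only: Path supports the '/' joins used by User.path and friends.
    from pathlib import Path
    data_dir = Path('/mnt/nopaque')
    user_dir = data_dir / 'users' / '42'   # Path('/mnt/nopaque/users/42')
    # With the old str value, data_dir / 'users' would raise a TypeError.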
@ -2,7 +2,7 @ apifairy
cqi>=0.1.7
dnspython==2.2.1
docker
eventlet
eventlet==0.34.2
Flask==2.1.3
Flask-APScheduler
Flask-Assets==2.0