Add SpaCyNLPPipelineModel database model

2026-01-03 13:00:55 +00:00 · 2022-10-13 15:05:54 +02:00
parent 89ccca11eb
commit b9e5939c1b
5 changed files with 175 additions and 5 deletions
--- a/app/SpaCyNLPPipelineModel.defaults.yml
+++ b/app/SpaCyNLPPipelineModel.defaults.yml
@@ -0,0 +1,10 @@
+- title: 'de_core_news_md-3.4.0'
+  description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.'
+  url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz'
+  publisher: 'Explosion'
+  publisher_url: 'https://github.com/explosion'
+  publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.4.0'
+  publishing_year: 2022
+  version: '3.4.0'
+  compatible_service_versions:
+    - '0.1.0'
--- a/app/cli.py
+++ b/app/cli.py
@@ -5,7 +5,8 @@ import os
 from app.models import (
    Role,
    User,
-    TesseractOCRPipelineModel
+    TesseractOCRPipelineModel,
+    SpaCyNLPPipelineModel
 )


@@ -39,6 +40,8 @@ def register(app):
        Role.insert_defaults()
        current_app.logger.info('Insert/Update default users')
        User.insert_defaults()
+        current_app.logger.info('Insert/Update default SpaCyNLPPipelineModels')
+        SpaCyNLPPipelineModel.insert_defaults()
        current_app.logger.info('Insert/Update default TesseractOCRPipelineModels')
        TesseractOCRPipelineModel.insert_defaults()

--- a/app/models.py
+++ b/app/models.py
@@ -277,6 +277,12 @@ class User(HashidMixin, UserMixin, db.Model):
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
+    spacy_nlp_pipeline_models = db.relationship(
+        'SpaCyNLPPipelineModel',
+        backref='user',
+        cascade='all, delete-orphan',
+        lazy='dynamic'
+    )
    corpora = db.relationship(
        'Corpus',
        backref='user',
@@ -333,6 +339,7 @@ class User(HashidMixin, UserMixin, db.Model):
        db.session.refresh(user)
        try:
            os.mkdir(user.path)
+            os.mkdir(os.path.join(user.path, 'spacy_nlp_pipeline_models'))
            os.mkdir(os.path.join(user.path, 'tesseract_ocr_pipeline_models'))
            os.mkdir(os.path.join(user.path, 'corpora'))
            os.mkdir(os.path.join(user.path, 'jobs'))
@@ -614,6 +621,105 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
        return _json


+class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
+    """A spaCy NLP pipeline model file owned by a user.
+
+    Rows are stored in the ``spacy_nlp_pipeline_models`` table. The model
+    file itself lives at ``path`` inside the owning user's directory.
+    """
+    __tablename__ = 'spacy_nlp_pipeline_models'
+    # Primary key
+    id = db.Column(db.Integer, primary_key=True)
+    # Foreign keys
+    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
+    # Fields
+    title = db.Column(db.String(64))
+    description = db.Column(db.String(255))
+    version = db.Column(db.String(16))
+    compatible_service_versions = db.Column(ContainerColumn(list, 255))
+    publisher = db.Column(db.String(128))
+    publisher_url = db.Column(db.String(512))
+    publishing_url = db.Column(db.String(512))
+    publishing_year = db.Column(db.Integer)
+    shared = db.Column(db.Boolean, default=False)
+    # Backrefs: user: User
+
+    @property
+    def path(self):
+        """Return the file system path of the stored model file.
+
+        The file is named after the row id and placed in the
+        ``spacy_nlp_pipeline_models`` directory of the owning user.
+        """
+        return os.path.join(
+            self.user.path,
+            'spacy_nlp_pipeline_models',
+            str(self.id)
+        )
+
+    @staticmethod
+    def insert_defaults():
+        """Insert or update the models from SpaCyNLPPipelineModel.defaults.yml.
+
+        An entry whose title and version match an existing row only gets
+        its metadata refreshed. Any other entry is created as a shared
+        model of the 'nopaque' user and its file is downloaded from the
+        entry's ``url`` to ``model.path``. All changes are committed once
+        at the end.
+
+        Raises:
+            requests.HTTPError: if a download answers with an error status.
+        """
+        nopaque_user = User.query.filter_by(username='nopaque').first()
+        defaults_file = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            'SpaCyNLPPipelineModel.defaults.yml'
+        )
+        with open(defaults_file, 'r') as f:
+            defaults = yaml.safe_load(f)
+        for m in defaults:
+            model = SpaCyNLPPipelineModel.query.filter_by(
+                title=m['title'],
+                version=m['version']
+            ).first()
+            if model is not None:
+                model.compatible_service_versions = m['compatible_service_versions']  # noqa
+                model.description = m['description']
+                model.publisher = m['publisher']
+                model.publisher_url = m['publisher_url']
+                model.publishing_url = m['publishing_url']
+                model.publishing_year = m['publishing_year']
+                model.shared = True
+                model.title = m['title']
+                model.version = m['version']
+                continue
+            model = SpaCyNLPPipelineModel(
+                compatible_service_versions=m['compatible_service_versions'],
+                description=m['description'],
+                publisher=m['publisher'],
+                publisher_url=m['publisher_url'],
+                publishing_url=m['publishing_url'],
+                publishing_year=m['publishing_year'],
+                shared=True,
+                title=m['title'],
+                user=nopaque_user,
+                version=m['version']
+            )
+            db.session.add(model)
+            # Flush and refresh so that model.id (used by model.path) is set.
+            db.session.flush(objects=[model])
+            db.session.refresh(model)
+            # Name the file after the downloaded archive. The previous
+            # '<id>.traineddata' name is a Tesseract extension and does not
+            # describe a spaCy model archive.
+            model.filename = m['url'].rsplit('/', 1)[-1]
+            # The context manager releases the streamed connection even if
+            # writing the file fails.
+            with requests.get(m['url'], stream=True) as r:
+                # Do not store an HTTP error page as if it were the model.
+                r.raise_for_status()
+                content_length = r.headers.get('Content-Length')
+                with tqdm(
+                    desc=f'{model.title} ({model.filename})',
+                    unit="B",
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    # The header may be missing; tqdm accepts total=None.
+                    total=(
+                        int(content_length)
+                        if content_length is not None
+                        else None
+                    )
+                ) as pbar:
+                    pbar.clear()
+                    with open(model.path, 'wb') as model_file:
+                        for chunk in r.iter_content(chunk_size=1024):
+                            if chunk:  # filter out keep-alive new chunks
+                                pbar.update(len(chunk))
+                                model_file.write(chunk)
+        db.session.commit()
+
+    def to_json(self, backrefs=False, relationships=False):
+        """Return a JSON serializable dict describing this model.
+
+        Args:
+            backrefs: if true, add the owning user under the 'user' key.
+            relationships: accepted but not used by this method.
+        """
+        _json = {
+            'id': self.hashid,
+            'compatible_service_versions': self.compatible_service_versions,
+            'description': self.description,
+            'publisher': self.publisher,
+            'publisher_url': self.publisher_url,
+            'publishing_url': self.publishing_url,
+            'publishing_year': self.publishing_year,
+            'shared': self.shared,
+            'title': self.title,
+            # The version column was previously missing from the output.
+            'version': self.version,
+            **self.file_mixin_to_json()
+        }
+        if backrefs:
+            _json['user'] = self.user.to_json(backrefs=True)
+        return _json
+
+
 class JobInput(FileMixin, HashidMixin, db.Model):
    __tablename__ = 'job_inputs'
    # Primary key
--- a/migrations/versions/31dd42e5ea6f_.py
+++ b/migrations/versions/31dd42e5ea6f_.py
@@ -0,0 +1,53 @@
+"""Add spacy_nlp_pipeline_models table
+
+Revision ID: 31dd42e5ea6f
+Revises: a3b727e3ff71
+Create Date: 2022-10-13 12:47:50.870474
+
+"""
+from alembic import op
+import shutil
+import sqlalchemy as sa
+import os
+from app.models import User
+
+
+# revision identifiers, used by Alembic.
+revision = '31dd42e5ea6f'
+down_revision = 'a3b727e3ff71'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create each user's model directory and the new table."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    for user in User.query.all():
+        # exist_ok keeps the migration re-runnable: the directory may
+        # already exist for users created by application code that makes
+        # it on user creation, or from an earlier run that failed after
+        # this loop.
+        os.makedirs(
+            os.path.join(user.path, 'spacy_nlp_pipeline_models'),
+            exist_ok=True
+        )
+    op.create_table('spacy_nlp_pipeline_models',
+    sa.Column('creation_date', sa.DateTime(), nullable=True),
+    sa.Column('filename', sa.String(length=255), nullable=True),
+    sa.Column('last_edited_date', sa.DateTime(), nullable=True),
+    sa.Column('mimetype', sa.String(length=255), nullable=True),
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('user_id', sa.Integer(), nullable=True),
+    sa.Column('title', sa.String(length=64), nullable=True),
+    sa.Column('description', sa.String(length=255), nullable=True),
+    sa.Column('version', sa.String(length=16), nullable=True),
+    sa.Column('compatible_service_versions', sa.String(length=255), nullable=True),
+    sa.Column('publisher', sa.String(length=128), nullable=True),
+    sa.Column('publisher_url', sa.String(length=512), nullable=True),
+    sa.Column('publishing_url', sa.String(length=512), nullable=True),
+    sa.Column('publishing_year', sa.Integer(), nullable=True),
+    sa.Column('shared', sa.Boolean(), nullable=True),
+    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+    sa.PrimaryKeyConstraint('id')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    """Remove each user's model directory and drop the table."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    for user in User.query.all():
+        try:
+            shutil.rmtree(
+                os.path.join(user.path, 'spacy_nlp_pipeline_models')
+            )
+        except FileNotFoundError:
+            # Nothing to remove for this user. Keep going so that the
+            # remaining directories are removed and the table is dropped.
+            pass
+    op.drop_table('spacy_nlp_pipeline_models')
+    # ### end Alembic commands ###
--- a/migrations/versions/63b2cc26a01f_.py
+++ b/migrations/versions/63b2cc26a01f_.py
@@ -19,8 +19,7 @@ depends_on = None

 def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
-    users = User.query.all()
-    for user in users:
+    for user in User.query.all():
        old_tesseract_ocr_pipeline_model_path = os.path.join(
            user.path,
            'tesseract_ocr_models'
@@ -40,8 +39,7 @@ def upgrade():

 def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
-    users = User.query.all()
-    for user in users:
+    for user in User.query.all():
        old_tesseract_ocr_pipeline_model_path = os.path.join(
            user.path,
            'tesseract_ocr_models'