From b9e5939c1b5783cba99abfa70081f904dbd9682a Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Thu, 13 Oct 2022 15:05:54 +0200
Subject: [PATCH] Add SpaCyNLPPipelineModel database model
---
app/SpaCyNLPPipelineModel.defaults.yml | 10 +++
app/cli.py | 5 +-
app/models.py | 106 +++++++++++++++++++++++++
migrations/versions/31dd42e5ea6f_.py | 53 +++++++++++++
migrations/versions/63b2cc26a01f_.py | 6 +-
5 files changed, 175 insertions(+), 5 deletions(-)
create mode 100644 app/SpaCyNLPPipelineModel.defaults.yml
create mode 100644 migrations/versions/31dd42e5ea6f_.py
diff --git a/app/SpaCyNLPPipelineModel.defaults.yml b/app/SpaCyNLPPipelineModel.defaults.yml
new file mode 100644
index 00000000..576f85e4
--- /dev/null
+++ b/app/SpaCyNLPPipelineModel.defaults.yml
@@ -0,0 +1,10 @@
+- title: 'de_core_news_md-3.4.0'
+ description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.'
+ url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz'
+ publisher: 'Explosion'
+ publisher_url: 'https://github.com/explosion'
+ publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.4.0'
+ publishing_year: 2022
+ version: '3.4.0'
+ compatible_service_versions:
+ - '0.1.0'
diff --git a/app/cli.py b/app/cli.py
index 54226707..826aa790 100644
--- a/app/cli.py
+++ b/app/cli.py
@@ -5,7 +5,8 @@ import os
from app.models import (
Role,
User,
- TesseractOCRPipelineModel
+ TesseractOCRPipelineModel,
+ SpaCyNLPPipelineModel
)
@@ -39,6 +40,8 @@ def register(app):
Role.insert_defaults()
current_app.logger.info('Insert/Update default users')
User.insert_defaults()
+ current_app.logger.info('Insert/Update default SpaCyNLPPipelineModels')
+ SpaCyNLPPipelineModel.insert_defaults()
current_app.logger.info('Insert/Update default TesseractOCRPipelineModels')
TesseractOCRPipelineModel.insert_defaults()
diff --git a/app/models.py b/app/models.py
index 18188b49..cc5d60ce 100644
--- a/app/models.py
+++ b/app/models.py
@@ -277,6 +277,12 @@ class User(HashidMixin, UserMixin, db.Model):
cascade='all, delete-orphan',
lazy='dynamic'
)
+ spacy_nlp_pipeline_models = db.relationship(
+ 'SpaCyNLPPipelineModel',
+ backref='user',
+ cascade='all, delete-orphan',
+ lazy='dynamic'
+ )
corpora = db.relationship(
'Corpus',
backref='user',
@@ -333,6 +339,7 @@ class User(HashidMixin, UserMixin, db.Model):
db.session.refresh(user)
try:
os.mkdir(user.path)
+ os.mkdir(os.path.join(user.path, 'spacy_nlp_pipeline_models'))
os.mkdir(os.path.join(user.path, 'tesseract_ocr_pipeline_models'))
os.mkdir(os.path.join(user.path, 'corpora'))
os.mkdir(os.path.join(user.path, 'jobs'))
@@ -614,6 +621,105 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
return _json
+class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
+    """A spaCy NLP pipeline model file that belongs to a user."""
+    __tablename__ = 'spacy_nlp_pipeline_models'
+    # Primary key
+    id = db.Column(db.Integer, primary_key=True)
+    # Foreign keys
+    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
+    # Fields
+    title = db.Column(db.String(64))
+    description = db.Column(db.String(255))
+    version = db.Column(db.String(16))
+    compatible_service_versions = db.Column(ContainerColumn(list, 255))
+    publisher = db.Column(db.String(128))
+    publisher_url = db.Column(db.String(512))
+    publishing_url = db.Column(db.String(512))
+    publishing_year = db.Column(db.Integer)
+    shared = db.Column(db.Boolean, default=False)
+    # Backrefs: user: User
+
+    @property
+    def path(self):
+        """Return the model file location inside the owning user's directory."""
+        models_dir = os.path.join(self.user.path, 'spacy_nlp_pipeline_models')
+        return os.path.join(models_dir, str(self.id))
+
+    @staticmethod
+    def insert_defaults():
+        """Insert or refresh the models listed in the defaults YAML file.
+
+        Entries are matched on title and version: a match only gets its
+        metadata refreshed, a new entry is stored for the 'nopaque' user
+        and its file is fetched from the entry's url into the model path.
+        Raises requests.HTTPError if a download answers with an error.
+        """
+        nopaque_user = User.query.filter_by(username='nopaque').first()
+        here = os.path.dirname(os.path.abspath(__file__))
+        yml_file = os.path.join(here, 'SpaCyNLPPipelineModel.defaults.yml')
+        with open(yml_file, 'r') as f:
+            defaults = yaml.safe_load(f)
+        metadata_fields = ('compatible_service_versions', 'description',
+                           'publisher', 'publisher_url', 'publishing_url',
+                           'publishing_year')
+        for m in defaults:
+            model = SpaCyNLPPipelineModel.query.filter_by(title=m['title'], version=m['version']).first()  # noqa
+            if model is not None:
+                # title/version were the lookup keys; only the rest can differ
+                for field in metadata_fields:
+                    setattr(model, field, m[field])
+                model.shared = True
+                continue
+            model = SpaCyNLPPipelineModel(
+                shared=True,
+                title=m['title'],
+                user=nopaque_user,
+                version=m['version'],
+                **{field: m[field] for field in metadata_fields}
+            )
+            db.session.add(model)
+            db.session.flush(objects=[model])
+            db.session.refresh(model)
+            # Keep the archive's own name; '.traineddata' is a Tesseract suffix
+            model.filename = m['url'].rsplit('/', 1)[-1]
+            r = requests.get(m['url'], stream=True)
+            r.raise_for_status()  # never save an HTTP error page as a model
+            # Content-Length may be missing; tqdm takes None as unknown total
+            total = int(r.headers.get('Content-Length', 0)) or None
+            pbar = tqdm(
+                desc=f'{model.title} ({model.filename})', unit="B",
+                unit_scale=True, unit_divisor=1024, total=total
+            )
+            pbar.clear()
+            with open(model.path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk:  # filter out keep-alive new chunks
+                        pbar.update(len(chunk))
+                        f.write(chunk)
+            pbar.close()
+        db.session.commit()
+
+    def to_json(self, backrefs=False, relationships=False):
+        """Return the model's fields as a dict, plus the user if backrefs."""
+        _json = {
+            'id': self.hashid,
+            'compatible_service_versions': self.compatible_service_versions,
+            'description': self.description,
+            'publisher': self.publisher,
+            'publisher_url': self.publisher_url,
+            'publishing_url': self.publishing_url,
+            'publishing_year': self.publishing_year,
+            'shared': self.shared,
+            'title': self.title,
+            'version': self.version,
+            **self.file_mixin_to_json()
+        }
+        if backrefs:
+            _json['user'] = self.user.to_json(backrefs=True)
+        return _json
+
+
class JobInput(FileMixin, HashidMixin, db.Model):
__tablename__ = 'job_inputs'
# Primary key
diff --git a/migrations/versions/31dd42e5ea6f_.py b/migrations/versions/31dd42e5ea6f_.py
new file mode 100644
index 00000000..a92ada9c
--- /dev/null
+++ b/migrations/versions/31dd42e5ea6f_.py
@@ -0,0 +1,53 @@
+"""Add spacy_nlp_pipeline_models table
+
+Revision ID: 31dd42e5ea6f
+Revises: a3b727e3ff71
+Create Date: 2022-10-13 12:47:50.870474
+
+"""
+from alembic import op
+import shutil
+import sqlalchemy as sa
+import os
+from app.models import User
+
+
+# Revision identifiers, used by Alembic to order this migration.
+revision = '31dd42e5ea6f'  # this migration's own ID
+down_revision = 'a3b727e3ff71'  # parent revision that this one revises
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create per-user model directories (kept if present) and the table."""
+    for user in User.query.all():
+        models_dir = os.path.join(user.path, 'spacy_nlp_pipeline_models')
+        os.makedirs(models_dir, exist_ok=True)  # a re-run must not fail here
+    op.create_table('spacy_nlp_pipeline_models',
+    sa.Column('creation_date', sa.DateTime(), nullable=True),
+    sa.Column('filename', sa.String(length=255), nullable=True),
+    sa.Column('last_edited_date', sa.DateTime(), nullable=True),
+    sa.Column('mimetype', sa.String(length=255), nullable=True),
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('user_id', sa.Integer(), nullable=True),
+    sa.Column('title', sa.String(length=64), nullable=True),
+    sa.Column('description', sa.String(length=255), nullable=True),
+    sa.Column('version', sa.String(length=16), nullable=True),
+    sa.Column('compatible_service_versions', sa.String(length=255), nullable=True),
+    sa.Column('publisher', sa.String(length=128), nullable=True),
+    sa.Column('publisher_url', sa.String(length=512), nullable=True),
+    sa.Column('publishing_url', sa.String(length=512), nullable=True),
+    sa.Column('publishing_year', sa.Integer(), nullable=True),
+    sa.Column('shared', sa.Boolean(), nullable=True),
+    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+    sa.PrimaryKeyConstraint('id')
+    )
+
+
+def downgrade():
+    """Delete the per-user model directories (if present) and drop the table."""
+    dirs = [os.path.join(u.path, 'spacy_nlp_pipeline_models') for u in User.query.all()]
+    for models_dir in filter(os.path.isdir, dirs):  # skip directories already gone
+        shutil.rmtree(models_dir)
+    op.drop_table('spacy_nlp_pipeline_models')
diff --git a/migrations/versions/63b2cc26a01f_.py b/migrations/versions/63b2cc26a01f_.py
index 5876ed3c..35eacc86 100644
--- a/migrations/versions/63b2cc26a01f_.py
+++ b/migrations/versions/63b2cc26a01f_.py
@@ -19,8 +19,7 @@ depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
- users = User.query.all()
- for user in users:
+ for user in User.query.all():
old_tesseract_ocr_pipeline_model_path = os.path.join(
user.path,
'tesseract_ocr_models'
@@ -40,8 +39,7 @@ def upgrade():
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
- users = User.query.all()
- for user in users:
+ for user in User.query.all():
old_tesseract_ocr_pipeline_model_path = os.path.join(
user.path,
'tesseract_ocr_models'