From b9e5939c1b5783cba99abfa70081f904dbd9682a Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Thu, 13 Oct 2022 15:05:54 +0200
Subject: [PATCH] Add SpaCyNLPModel database model

---
Review notes (text between the '---' line and the diffstat is not part of
the commit message and is skipped when the patch is applied):
 * app/models.py: the downloaded archive keeps its upstream file name
   instead of '<id>.traineddata'; HTTP errors abort the download via
   raise_for_status(); a missing Content-Length header no longer raises
   KeyError; to_json() additionally returns 'version'.
 * migrations/versions/31dd42e5ea6f_.py: creating/removing the per-user
   directories tolerates directories that already exist / are already
   gone, so the migration can be re-run after a partial failure.
 * Hunk sizes and the diffstat were updated to match these changes; the
   blob ids on the 'index' lines still belong to the original commit.

 app/SpaCyNLPPipelineModel.defaults.yml |  10 ++
 app/cli.py                             |   5 +-
 app/models.py                          | 121 +++++++++++++++++++++++++
 migrations/versions/31dd42e5ea6f_.py   |  55 +++++++++++
 migrations/versions/63b2cc26a01f_.py   |   6 +-
 5 files changed, 192 insertions(+), 5 deletions(-)
 create mode 100644 app/SpaCyNLPPipelineModel.defaults.yml
 create mode 100644 migrations/versions/31dd42e5ea6f_.py

diff --git a/app/SpaCyNLPPipelineModel.defaults.yml b/app/SpaCyNLPPipelineModel.defaults.yml
new file mode 100644
index 00000000..576f85e4
--- /dev/null
+++ b/app/SpaCyNLPPipelineModel.defaults.yml
@@ -0,0 +1,10 @@
+- title: 'de_core_news_md-3.4.0'
+  description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.'
+  url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz'
+  publisher: 'Explosion'
+  publisher_url: 'https://github.com/explosion'
+  publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.4.0'
+  publishing_year: 2022
+  version: '3.4.0'
+  compatible_service_versions:
+    - '0.1.0'
diff --git a/app/cli.py b/app/cli.py
index 54226707..826aa790 100644
--- a/app/cli.py
+++ b/app/cli.py
@@ -5,7 +5,8 @@ import os
 from app.models import (
     Role,
     User,
-    TesseractOCRPipelineModel
+    TesseractOCRPipelineModel,
+    SpaCyNLPPipelineModel
 )
 
 
@@ -39,6 +40,8 @@ def register(app):
         Role.insert_defaults()
         current_app.logger.info('Insert/Update default users')
         User.insert_defaults()
+        current_app.logger.info('Insert/Update default SpaCyNLPPipelineModels')
+        SpaCyNLPPipelineModel.insert_defaults()
         current_app.logger.info('Insert/Update default TesseractOCRPipelineModels')
         TesseractOCRPipelineModel.insert_defaults()
 
diff --git a/app/models.py b/app/models.py
index 18188b49..cc5d60ce 100644
--- a/app/models.py
+++ b/app/models.py
@@ -277,6 +277,12 @@ class User(HashidMixin, UserMixin, db.Model):
         cascade='all, delete-orphan',
         lazy='dynamic'
     )
+    spacy_nlp_pipeline_models = db.relationship(
+        'SpaCyNLPPipelineModel',
+        backref='user',
+        cascade='all, delete-orphan',
+        lazy='dynamic'
+    )
     corpora = db.relationship(
         'Corpus',
         backref='user',
@@ -333,6 +339,7 @@ class User(HashidMixin, UserMixin, db.Model):
         db.session.refresh(user)
         try:
             os.mkdir(user.path)
+            os.mkdir(os.path.join(user.path, 'spacy_nlp_pipeline_models'))
             os.mkdir(os.path.join(user.path, 'tesseract_ocr_pipeline_models'))
             os.mkdir(os.path.join(user.path, 'corpora'))
             os.mkdir(os.path.join(user.path, 'jobs'))
@@ -614,6 +621,120 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
         return _json
 
 
+class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
+    """Database model for a spaCy NLP pipeline model file owned by a user."""
+    __tablename__ = 'spacy_nlp_pipeline_models'
+    # Primary key
+    id = db.Column(db.Integer, primary_key=True)
+    # Foreign keys
+    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
+    # Fields
+    title = db.Column(db.String(64))
+    description = db.Column(db.String(255))
+    version = db.Column(db.String(16))
+    compatible_service_versions = db.Column(ContainerColumn(list, 255))
+    publisher = db.Column(db.String(128))
+    publisher_url = db.Column(db.String(512))
+    publishing_url = db.Column(db.String(512))
+    publishing_year = db.Column(db.Integer)
+    shared = db.Column(db.Boolean, default=False)
+    # Backrefs: user: User
+
+    @property
+    def path(self):
+        """Return the file system path the model file is stored at."""
+        return os.path.join(
+            self.user.path,
+            'spacy_nlp_pipeline_models',
+            str(self.id)
+        )
+
+    @staticmethod
+    def insert_defaults():
+        """Insert or update the models listed in the defaults YAML file.
+
+        Existing entries (matched by title and version) only get their
+        metadata refreshed; new entries are additionally downloaded from
+        their ``url`` into the nopaque user's model directory.
+        """
+        nopaque_user = User.query.filter_by(username='nopaque').first()
+        defaults_file = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            'SpaCyNLPPipelineModel.defaults.yml'
+        )
+        with open(defaults_file, 'r') as f:
+            defaults = yaml.safe_load(f)
+        for m in defaults:
+            model = SpaCyNLPPipelineModel.query.filter_by(title=m['title'], version=m['version']).first()  # noqa
+            if model is not None:
+                model.compatible_service_versions = m['compatible_service_versions']
+                model.description = m['description']
+                model.publisher = m['publisher']
+                model.publisher_url = m['publisher_url']
+                model.publishing_url = m['publishing_url']
+                model.publishing_year = m['publishing_year']
+                model.shared = True
+                model.title = m['title']
+                model.version = m['version']
+                continue
+            model = SpaCyNLPPipelineModel(
+                compatible_service_versions=m['compatible_service_versions'],
+                description=m['description'],
+                publisher=m['publisher'],
+                publisher_url=m['publisher_url'],
+                publishing_url=m['publishing_url'],
+                publishing_year=m['publishing_year'],
+                shared=True,
+                title=m['title'],
+                user=nopaque_user,
+                version=m['version']
+            )
+            db.session.add(model)
+            db.session.flush(objects=[model])
+            db.session.refresh(model)
+            # Store the archive name taken from the URL; '.traineddata' is a
+            # Tesseract file suffix and does not describe this download.
+            model.filename = m['url'].rsplit('/', 1)[-1]
+            r = requests.get(m['url'], stream=True)
+            # Never write an HTTP error body to disk as if it were the model
+            r.raise_for_status()
+            # Content-Length may be missing; tqdm accepts total=None then
+            pbar = tqdm(
+                desc=f'{model.title} ({model.filename})',
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                total=int(r.headers.get('Content-Length', 0)) or None
+            )
+            pbar.clear()
+            with open(model.path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk:  # filter out keep-alive new chunks
+                        pbar.update(len(chunk))
+                        f.write(chunk)
+            pbar.close()
+        db.session.commit()
+
+    def to_json(self, backrefs=False, relationships=False):
+        """Return a dict of the model; add the user if ``backrefs`` is set."""
+        _json = {
+            'id': self.hashid,
+            'compatible_service_versions': self.compatible_service_versions,
+            'description': self.description,
+            'publisher': self.publisher,
+            'publisher_url': self.publisher_url,
+            'publishing_url': self.publishing_url,
+            'publishing_year': self.publishing_year,
+            'shared': self.shared,
+            'title': self.title,
+            'version': self.version,
+            **self.file_mixin_to_json()
+        }
+        if backrefs:
+            _json['user'] = self.user.to_json(backrefs=True)
+        return _json
+
+
 class JobInput(FileMixin, HashidMixin, db.Model):
     __tablename__ = 'job_inputs'
     # Primary key
diff --git a/migrations/versions/31dd42e5ea6f_.py b/migrations/versions/31dd42e5ea6f_.py
new file mode 100644
index 00000000..a92ada9c
--- /dev/null
+++ b/migrations/versions/31dd42e5ea6f_.py
@@ -0,0 +1,55 @@
+"""Add spacy_nlp_pipeline_models table
+
+Revision ID: 31dd42e5ea6f
+Revises: a3b727e3ff71
+Create Date: 2022-10-13 12:47:50.870474
+
+"""
+from alembic import op
+import shutil
+import sqlalchemy as sa
+import os
+from app.models import User
+
+
+# revision identifiers, used by Alembic.
+revision = '31dd42e5ea6f'
+down_revision = 'a3b727e3ff71'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    for user in User.query.all():
+        # exist_ok keeps the migration re-runnable after a partial failure
+        os.makedirs(os.path.join(user.path, 'spacy_nlp_pipeline_models'), exist_ok=True)  # noqa
+    op.create_table('spacy_nlp_pipeline_models',
+    sa.Column('creation_date', sa.DateTime(), nullable=True),
+    sa.Column('filename', sa.String(length=255), nullable=True),
+    sa.Column('last_edited_date', sa.DateTime(), nullable=True),
+    sa.Column('mimetype', sa.String(length=255), nullable=True),
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('user_id', sa.Integer(), nullable=True),
+    sa.Column('title', sa.String(length=64), nullable=True),
+    sa.Column('description', sa.String(length=255), nullable=True),
+    sa.Column('version', sa.String(length=16), nullable=True),
+    sa.Column('compatible_service_versions', sa.String(length=255), nullable=True),
+    sa.Column('publisher', sa.String(length=128), nullable=True),
+    sa.Column('publisher_url', sa.String(length=512), nullable=True),
+    sa.Column('publishing_url', sa.String(length=512), nullable=True),
+    sa.Column('publishing_year', sa.Integer(), nullable=True),
+    sa.Column('shared', sa.Boolean(), nullable=True),
+    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+    sa.PrimaryKeyConstraint('id')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    for user in User.query.all():
+        # ignore_errors: the directory may already have been removed
+        shutil.rmtree(os.path.join(user.path, 'spacy_nlp_pipeline_models'), ignore_errors=True)  # noqa
+    op.drop_table('spacy_nlp_pipeline_models')
+    # ### end Alembic commands ###
diff --git a/migrations/versions/63b2cc26a01f_.py b/migrations/versions/63b2cc26a01f_.py
index 5876ed3c..35eacc86 100644
--- a/migrations/versions/63b2cc26a01f_.py
+++ b/migrations/versions/63b2cc26a01f_.py
@@ -19,8 +19,7 @@ depends_on = None
 
 def upgrade():
     # ### commands auto generated by Alembic - please adjust! ###
-    users = User.query.all()
-    for user in users:
+    for user in User.query.all():
         old_tesseract_ocr_pipeline_model_path = os.path.join(
             user.path,
             'tesseract_ocr_models'
@@ -40,8 +39,7 @@ def upgrade():
 
 def downgrade():
     # ### commands auto generated by Alembic - please adjust! ###
-    users = User.query.all()
-    for user in users:
+    for user in User.query.all():
         old_tesseract_ocr_pipeline_model_path = os.path.join(
             user.path,
             'tesseract_ocr_models'