Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git (synced 2024-11-14 16:55:42 +00:00)
Add SpaCyNLPModel database model

commit b9e5939c1b (parent 89ccca11eb)
app/SpaCyNLPPipelineModel.defaults.yml  (new file, +10)
@@ -0,0 +1,10 @@
+- title: 'de_core_news_md-3.4.0'
+  description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.'
+  url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz'
+  publisher: 'Explosion'
+  publisher_url: 'https://github.com/explosion'
+  publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.4.0'
+  publishing_year: 2022
+  version: '3.4.0'
+  compatible_service_versions:
+    - '0.1.0'
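
Note: a quick way to sanity-check entries in this defaults file is to load it the same way SpaCyNLPPipelineModel.insert_defaults() does further down in this commit. The following is a minimal sketch, not part of the diff; the relative path assumes it is run from the repository root.

    # Sketch: load app/SpaCyNLPPipelineModel.defaults.yml and check the keys
    # that insert_defaults() expects for every listed model.
    import os
    import yaml

    REQUIRED_KEYS = {
        'title', 'description', 'url', 'publisher', 'publisher_url',
        'publishing_url', 'publishing_year', 'version',
        'compatible_service_versions'
    }

    defaults_file = os.path.join('app', 'SpaCyNLPPipelineModel.defaults.yml')
    with open(defaults_file, 'r') as f:
        defaults = yaml.safe_load(f)

    for m in defaults:
        missing = REQUIRED_KEYS - m.keys()
        if missing:
            raise ValueError(f"{m.get('title', '?')}: missing keys {missing}")
        print(m['title'], m['version'], m['url'])
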
(file name not shown in this view)
@@ -5,7 +5,8 @@ import os
 from app.models import (
     Role,
     User,
-    TesseractOCRPipelineModel
+    TesseractOCRPipelineModel,
+    SpaCyNLPPipelineModel
 )
 
 
@@ -39,6 +40,8 @@ def register(app):
     Role.insert_defaults()
     current_app.logger.info('Insert/Update default users')
     User.insert_defaults()
+    current_app.logger.info('Insert/Update default SpaCyNLPPipelineModels')
+    SpaCyNLPPipelineModel.insert_defaults()
     current_app.logger.info('Insert/Update default TesseractOCRPipelineModels')
     TesseractOCRPipelineModel.insert_defaults()
 
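
Note: outside the register(app) hook shown above, the new default-insertion step could also be run by hand. A minimal sketch, assuming the project's usual Flask application factory is named create_app; the factory itself is not part of this diff.

    # Sketch: manually trigger the new SpaCy defaults insertion.
    # create_app is an assumed application factory name, not shown in this commit.
    from app import create_app
    from app.models import SpaCyNLPPipelineModel

    app = create_app()
    with app.app_context():
        # Reads SpaCyNLPPipelineModel.defaults.yml, downloads each listed
        # pipeline archive and inserts/updates the corresponding rows.
        SpaCyNLPPipelineModel.insert_defaults()
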
app/models.py  (+106)
@@ -277,6 +277,12 @@ class User(HashidMixin, UserMixin, db.Model):
         cascade='all, delete-orphan',
         lazy='dynamic'
     )
+    spacy_nlp_pipeline_models = db.relationship(
+        'SpaCyNLPPipelineModel',
+        backref='user',
+        cascade='all, delete-orphan',
+        lazy='dynamic'
+    )
     corpora = db.relationship(
         'Corpus',
         backref='user',
@@ -333,6 +339,7 @@ class User(HashidMixin, UserMixin, db.Model):
         db.session.refresh(user)
         try:
             os.mkdir(user.path)
+            os.mkdir(os.path.join(user.path, 'spacy_nlp_pipeline_models'))
             os.mkdir(os.path.join(user.path, 'tesseract_ocr_pipeline_models'))
             os.mkdir(os.path.join(user.path, 'corpora'))
             os.mkdir(os.path.join(user.path, 'jobs'))
@@ -614,6 +621,105 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
         return _json
 
 
+class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
+    __tablename__ = 'spacy_nlp_pipeline_models'
+    # Primary key
+    id = db.Column(db.Integer, primary_key=True)
+    # Foreign keys
+    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
+    # Fields
+    title = db.Column(db.String(64))
+    description = db.Column(db.String(255))
+    version = db.Column(db.String(16))
+    compatible_service_versions = db.Column(ContainerColumn(list, 255))
+    publisher = db.Column(db.String(128))
+    publisher_url = db.Column(db.String(512))
+    publishing_url = db.Column(db.String(512))
+    publishing_year = db.Column(db.Integer)
+    shared = db.Column(db.Boolean, default=False)
+    # Backrefs: user: User
+
+    @property
+    def path(self):
+        return os.path.join(
+            self.user.path,
+            'spacy_nlp_pipeline_models',
+            str(self.id)
+        )
+
+    @staticmethod
+    def insert_defaults():
+        nopaque_user = User.query.filter_by(username='nopaque').first()
+        defaults_file = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            'SpaCyNLPPipelineModel.defaults.yml'
+        )
+        with open(defaults_file, 'r') as f:
+            defaults = yaml.safe_load(f)
+        for m in defaults:
+            model = SpaCyNLPPipelineModel.query.filter_by(title=m['title'], version=m['version']).first()  # noqa
+            if model is not None:
+                model.compatible_service_versions = m['compatible_service_versions']
+                model.description = m['description']
+                model.publisher = m['publisher']
+                model.publisher_url = m['publisher_url']
+                model.publishing_url = m['publishing_url']
+                model.publishing_year = m['publishing_year']
+                model.shared = True
+                model.title = m['title']
+                model.version = m['version']
+                continue
+            model = SpaCyNLPPipelineModel(
+                compatible_service_versions=m['compatible_service_versions'],
+                description=m['description'],
+                publisher=m['publisher'],
+                publisher_url=m['publisher_url'],
+                publishing_url=m['publishing_url'],
+                publishing_year=m['publishing_year'],
+                shared=True,
+                title=m['title'],
+                user=nopaque_user,
+                version=m['version']
+            )
+            db.session.add(model)
+            db.session.flush(objects=[model])
+            db.session.refresh(model)
+            model.filename = f'{model.id}.tar.gz'  # spaCy pipelines ship as .tar.gz archives; '.traineddata' was a Tesseract copy-paste leftover
+            r = requests.get(m['url'], stream=True)
+            pbar = tqdm(
+                desc=f'{model.title} ({model.filename})',
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                total=int(r.headers['Content-Length'])
+            )
+            pbar.clear()
+            with open(model.path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk:  # filter out keep-alive new chunks
+                        pbar.update(len(chunk))
+                        f.write(chunk)
+            pbar.close()
+        db.session.commit()
+
+    def to_json(self, backrefs=False, relationships=False):
+        _json = {
+            'id': self.hashid,
+            'compatible_service_versions': self.compatible_service_versions,
+            'description': self.description,
+            'publisher': self.publisher,
+            'publisher_url': self.publisher_url,
+            'publishing_url': self.publishing_url,
+            'publishing_year': self.publishing_year,
+            'shared': self.shared,
+            'title': self.title,
+            **self.file_mixin_to_json()
+        }
+        if backrefs:
+            _json['user'] = self.user.to_json(backrefs=True)
+        return _json
+
+
 class JobInput(FileMixin, HashidMixin, db.Model):
     __tablename__ = 'job_inputs'
     # Primary key
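
Note: for orientation, reading the new table back through the model API could look like the sketch below. It is not part of the commit; create_app is again assumed to be the project's application factory.

    # Sketch: list the shared spaCy pipeline models after insert_defaults() has run.
    from app import create_app
    from app.models import SpaCyNLPPipelineModel

    app = create_app()
    with app.app_context():
        for model in SpaCyNLPPipelineModel.query.filter_by(shared=True):
            # to_json() combines the metadata columns above with the
            # FileMixin fields (filename, mimetype, creation/last-edited dates).
            data = model.to_json()
            print(data['title'], data['publisher'], data['publishing_year'])
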
migrations/versions/31dd42e5ea6f_.py  (new file, +53)
@@ -0,0 +1,53 @@
+"""Add spacy_nlp_pipeline_models table
+
+Revision ID: 31dd42e5ea6f
+Revises: a3b727e3ff71
+Create Date: 2022-10-13 12:47:50.870474
+
+"""
+from alembic import op
+import shutil
+import sqlalchemy as sa
+import os
+from app.models import User
+
+
+# revision identifiers, used by Alembic.
+revision = '31dd42e5ea6f'
+down_revision = 'a3b727e3ff71'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    for user in User.query.all():
+        os.mkdir(os.path.join(user.path, 'spacy_nlp_pipeline_models'))
+    op.create_table('spacy_nlp_pipeline_models',
+    sa.Column('creation_date', sa.DateTime(), nullable=True),
+    sa.Column('filename', sa.String(length=255), nullable=True),
+    sa.Column('last_edited_date', sa.DateTime(), nullable=True),
+    sa.Column('mimetype', sa.String(length=255), nullable=True),
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('user_id', sa.Integer(), nullable=True),
+    sa.Column('title', sa.String(length=64), nullable=True),
+    sa.Column('description', sa.String(length=255), nullable=True),
+    sa.Column('version', sa.String(length=16), nullable=True),
+    sa.Column('compatible_service_versions', sa.String(length=255), nullable=True),
+    sa.Column('publisher', sa.String(length=128), nullable=True),
+    sa.Column('publisher_url', sa.String(length=512), nullable=True),
+    sa.Column('publishing_url', sa.String(length=512), nullable=True),
+    sa.Column('publishing_year', sa.Integer(), nullable=True),
+    sa.Column('shared', sa.Boolean(), nullable=True),
+    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+    sa.PrimaryKeyConstraint('id')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    for user in User.query.all():
+        shutil.rmtree(os.path.join(user.path, 'spacy_nlp_pipeline_models'))
+    op.drop_table('spacy_nlp_pipeline_models')
+    # ### end Alembic commands ###
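
Note: assuming the project's standard Flask-Migrate setup, this migration is applied with flask db upgrade, which runs upgrade() above inside the application context that the User query requires, and reverted with flask db downgrade back to revision a3b727e3ff71, dropping the table and removing the per-user spacy_nlp_pipeline_models directories.
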
(existing migration; file name not shown in this view)
@@ -19,8 +19,7 @@ depends_on = None
 
 def upgrade():
     # ### commands auto generated by Alembic - please adjust! ###
-    users = User.query.all()
-    for user in users:
+    for user in User.query.all():
         old_tesseract_ocr_pipeline_model_path = os.path.join(
             user.path,
             'tesseract_ocr_models'
@@ -40,8 +39,7 @@ def upgrade():
 
 def downgrade():
     # ### commands auto generated by Alembic - please adjust! ###
-    users = User.query.all()
-    for user in users:
+    for user in User.query.all():
         old_tesseract_ocr_pipeline_model_path = os.path.join(
             user.path,
             'tesseract_ocr_models'