diff --git a/app/SpaCyNLPPipelineModel.defaults.yml b/app/SpaCyNLPPipelineModel.defaults.yml index ed4ea3bd..00031ed0 100644 --- a/app/SpaCyNLPPipelineModel.defaults.yml +++ b/app/SpaCyNLPPipelineModel.defaults.yml @@ -1,66 +1,177 @@ -- title: 'de_core_news_md-3.4.0' - description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.' - url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz' +- title: 'Catalan' + description: 'Catalan pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' + url: 'https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.2.0/ca_core_news_md-3.2.0.tar.gz' publisher: 'Explosion' publisher_url: 'https://github.com/explosion' - publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.4.0' - publishing_year: 2022 + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/ca_core_news_md-3.2.0' + publishing_year: 2021 + pipeline_name: 'ca_core_news_md' + version: '3.2.0' + compatible_service_versions: + - '0.1.0' +- title: 'German' + description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' + url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.2.0/de_core_news_md-3.2.0.tar.gz' + publisher: 'Explosion' + publisher_url: 'https://github.com/explosion' + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.2.0' + publishing_year: 2021 pipeline_name: 'de_core_news_md' - version: '3.4.0' + version: '3.2.0' compatible_service_versions: - - '0.1.0' -- title: 'en_core_web_md-3.4.1' + - '0.1.0' +- title: 'Greek' + description: 'Greek pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' + url: 'https://github.com/explosion/spacy-models/releases/download/el_core_news_md-3.2.0/el_core_news_md-3.2.0.tar.gz' + publisher: 'Explosion' + publisher_url: 'https://github.com/explosion' + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/el_core_news_md-3.2.0' + publishing_year: 2021 + pipeline_name: 'el_core_news_md' + version: '3.2.0' + compatible_service_versions: + - '0.1.0' +- title: 'English' description: 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.' - url: 'https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz' + url: 'https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0.tar.gz' publisher: 'Explosion' publisher_url: 'https://github.com/explosion' - publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.4.1' - publishing_year: 2022 + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.2.0' + publishing_year: 2021 pipeline_name: 'en_core_web_md' - version: '3.4.1' + version: '3.2.0' compatible_service_versions: - '0.1.0' -- title: 'uk_core_news_md-3.4.0' - description: 'Ukrainian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' - url: 'https://github.com/explosion/spacy-models/releases/download/uk_core_news_md-3.4.0/uk_core_news_md-3.4.0.tar.gz' +- title: 'Spanish' + description: 'Spanish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' + url: 'https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.2.0/es_core_news_md-3.2.0.tar.gz' publisher: 'Explosion' publisher_url: 'https://github.com/explosion' - publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/uk_core_news_md-3.4.0' - publishing_year: 2022 - pipeline_name: 'uk_core_news_md' - version: '3.4.0' + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/es_core_news_md-3.2.0' + publishing_year: 2021 + pipeline_name: 'es_core_news_md' + version: '3.2.0' compatible_service_versions: - '0.1.0' -- title: 'zh_core_web_md-3.4.0' - description: 'Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.' - url: 'https://github.com/explosion/spacy-models/releases/download/zh_core_web_md-3.4.0/zh_core_web_md-3.4.0.tar.gz' +- title: 'French' + description: 'French pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' + url: 'https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.2.0/fr_core_news_md-3.2.0.tar.gz' publisher: 'Explosion' publisher_url: 'https://github.com/explosion' - publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.4.0' - publishing_year: 2022 - pipeline_name: 'zh_core_web_md' - version: '3.4.0' + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.2.0' + publishing_year: 2021 + pipeline_name: 'fr_core_news_md' + version: '3.2.0' compatible_service_versions: - '0.1.0' -- title: 'ru_core_news_md-3.4.0' +- title: 'Italian' + description: 'Italian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' + url: 'https://github.com/explosion/spacy-models/releases/download/it_core_news_md-3.2.0/it_core_news_md-3.2.0.tar.gz' + publisher: 'Explosion' + publisher_url: 'https://github.com/explosion' + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.2.0' + publishing_year: 2021 + pipeline_name: 'it_core_news_md' + version: '3.2.0' + compatible_service_versions: + - '0.1.0' +- title: 'Polish' + description: 'Polish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' + url: 'https://github.com/explosion/spacy-models/releases/download/pl_core_news_md-3.2.0/pl_core_news_md-3.2.0.tar.gz' + publisher: 'Explosion' + publisher_url: 'https://github.com/explosion' + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.2.0' + publishing_year: 2021 + pipeline_name: 'pl_core_news_md' + version: '3.2.0' + compatible_service_versions: + - '0.1.0' +- title: 'Russian' description: 'Russian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' - url: 'https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.4.0/ru_core_news_md-3.4.0.tar.gz' + url: 'https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.2.0/ru_core_news_md-3.2.0.tar.gz' publisher: 'Explosion' publisher_url: 'https://github.com/explosion' - publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.4.0' - publishing_year: 2022 + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.2.0' + publishing_year: 2021 pipeline_name: 'ru_core_news_md' - version: '3.4.0' + version: '3.2.0' compatible_service_versions: - '0.1.0' -- title: 'la_core_cltk_sm-0.1.0' - description: 'Latin pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' - url: 'https://github.com/diyclassics/latin-spacy-models/raw/main/la_core_cltk_sm/la_core_cltk_sm-0.1.0.tar.gz' - publisher: 'DIY Classics' - publisher_url: 'https://github.com/diyclassics/' - publishing_url: 'https://github.com/diyclassics/latin-spacy-models/tree/main/la_core_cltk_sm' - publishing_year: 2022 - pipeline_name: 'la_core_cltk_sm' - version: '0.1.0' +- title: 'Chinese' + description: 'Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.' + url: 'https://github.com/explosion/spacy-models/releases/download/zh_core_web_md-3.2.0/zh_core_web_md-3.2.0.tar.gz' + publisher: 'Explosion' + publisher_url: 'https://github.com/explosion' + publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.2.0' + publishing_year: 2021 + pipeline_name: 'zh_core_web_md' + version: '3.2.0' compatible_service_versions: - '0.1.0' + +# - title: 'de_core_news_md-3.4.0' +# description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.' +# url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz' +# publisher: 'Explosion' +# publisher_url: 'https://github.com/explosion' +# publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.4.0' +# publishing_year: 2022 +# pipeline_name: 'de_core_news_md' +# version: '3.4.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'en_core_web_md-3.4.1' +# description: 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.' +# url: 'https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz' +# publisher: 'Explosion' +# publisher_url: 'https://github.com/explosion' +# publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.4.1' +# publishing_year: 2022 +# pipeline_name: 'en_core_web_md' +# version: '3.4.1' +# compatible_service_versions: +# - '0.1.0' +# - title: 'uk_core_news_md-3.4.0' +# description: 'Ukrainian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' +# url: 'https://github.com/explosion/spacy-models/releases/download/uk_core_news_md-3.4.0/uk_core_news_md-3.4.0.tar.gz' +# publisher: 'Explosion' +# publisher_url: 'https://github.com/explosion' +# publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/uk_core_news_md-3.4.0' +# publishing_year: 2022 +# pipeline_name: 'uk_core_news_md' +# version: '3.4.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'zh_core_web_md-3.4.0' +# description: 'Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.' +# url: 'https://github.com/explosion/spacy-models/releases/download/zh_core_web_md-3.4.0/zh_core_web_md-3.4.0.tar.gz' +# publisher: 'Explosion' +# publisher_url: 'https://github.com/explosion' +# publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.4.0' +# publishing_year: 2022 +# pipeline_name: 'zh_core_web_md' +# version: '3.4.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'ru_core_news_md-3.4.0' +# description: 'Russian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' +# url: 'https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.4.0/ru_core_news_md-3.4.0.tar.gz' +# publisher: 'Explosion' +# publisher_url: 'https://github.com/explosion' +# publishing_url: 'https://github.com/explosion/spacy-models/releases/tag/ru_core_news_md-3.4.0' +# publishing_year: 2022 +# pipeline_name: 'ru_core_news_md' +# version: '3.4.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'la_core_cltk_sm-0.1.0' +# description: 'Latin pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' +# url: 'https://github.com/diyclassics/latin-spacy-models/raw/main/la_core_cltk_sm/la_core_cltk_sm-0.1.0.tar.gz' +# publisher: 'DIY Classics' +# publisher_url: 'https://github.com/diyclassics/' +# publishing_url: 'https://github.com/diyclassics/latin-spacy-models/tree/main/la_core_cltk_sm' +# publishing_year: 2022 +# pipeline_name: 'la_core_cltk_sm' +# version: '0.1.0' +# compatible_service_versions: +# - '0.1.0' diff --git a/app/contributions/forms.py b/app/contributions/forms.py index dcdfaea8..c0611e17 100644 --- a/app/contributions/forms.py +++ b/app/contributions/forms.py @@ -1,4 +1,4 @@ -from xml.dom import ValidationErr +from flask import current_app from flask_wtf import FlaskForm from flask_wtf.file import FileField, FileRequired from wtforms import ( @@ -66,8 +66,9 @@ class TesseractOCRModelContributionForm(CreateContributionBaseForm): compatible_service_versions = SelectMultipleField( 'Compatible service versions' ) - def validate_traineddata(self, field): - if field.data.mimetype != '.traineddata': + def validate_tesseract_model_file(self, field): + current_app.logger.warning(field.data.filename) + if not field.data.filename.lower().endswith('.traineddata'): raise ValidationError('traineddata files only!') def __init__(self, *args, **kwargs): @@ -87,8 +88,9 @@ class SpacyNLPModelContributionForm(CreateContributionBaseForm): compatible_service_versions = SelectMultipleField( 'Compatible service versions' ) - def validate_spacy(self, field): - if field.data.mimetype != '.tar.gz': + def validate_spacy_model_file(self, field): + current_app.logger.warning(field.data.filename) + if not field.data.filename.lower().endswith('.tar.gz'): raise ValidationError('.tar.gz files only!') def __init__(self, *args, **kwargs): diff --git a/app/services/forms.py b/app/services/forms.py index 5c0af906..134e9456 100644 --- a/app/services/forms.py +++ b/app/services/forms.py @@ -10,7 +10,7 @@ from wtforms import ( ValidationError ) from wtforms.validators import InputRequired, Length -from app.models import TesseractOCRPipelineModel +from app.models import TesseractOCRPipelineModel, SpaCyNLPPipelineModel from . import SERVICES @@ -73,11 +73,11 @@ class CreateTesseractOCRPipelineJobForm(CreateJobBaseForm): if 'disabled' in self.binarization.render_kw: del self.binarization.render_kw['disabled'] models = [ - x for x in TesseractOCRPipelineModel.query.filter().all() + x for x in TesseractOCRPipelineModel.query.order_by(TesseractOCRPipelineModel.title).all() if version in x.compatible_service_versions and (x.shared == True or x.user == current_user) ] self.model.choices = [('', 'Choose your option')] - self.model.choices += [(x.hashid, x.title) for x in models] + self.model.choices += [(x.hashid, f'{x.title} [{x.version}]') for x in models] self.model.default = '' self.version.choices = [(x, x) for x in service_manifest['versions']] self.version.data = version @@ -127,7 +127,7 @@ class CreateSpacyNLPPipelineJobForm(CreateJobBaseForm): encoding_detection = BooleanField('Encoding detection', render_kw={'disabled': True}) txt = FileField('File', validators=[FileRequired()]) model = SelectField('Model', validators=[InputRequired()]) - + def validate_encoding_detection(self, field): service_info = SERVICES['spacy-nlp-pipeline']['versions'][self.version.data] if field.data: @@ -153,8 +153,12 @@ class CreateSpacyNLPPipelineJobForm(CreateJobBaseForm): if 'encoding_detection' in service_info['methods']: if 'disabled' in self.encoding_detection.render_kw: del self.encoding_detection.render_kw['disabled'] + models = [ + x for x in SpaCyNLPPipelineModel.query.order_by(SpaCyNLPPipelineModel.title).all() + if version in x.compatible_service_versions and (x.shared == True or x.user == current_user) + ] self.model.choices = [('', 'Choose your option')] - self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa + self.model.choices += [(x.hashid, f'{x.title} [{x.version}]') for x in models] self.model.default = '' self.version.choices = [(x, x) for x in service_manifest['versions']] self.version.data = version diff --git a/app/services/routes.py b/app/services/routes.py index b34d0619..4bfca9bb 100644 --- a/app/services/routes.py +++ b/app/services/routes.py @@ -6,7 +6,8 @@ from app.models import ( Job, JobInput, JobStatus, - TesseractOCRPipelineModel + TesseractOCRPipelineModel, + SpaCyNLPPipelineModel ) from . import bp, SERVICES from .forms import ( @@ -172,6 +173,7 @@ def spacy_nlp_pipeline(): if version not in service_manifest['versions']: abort(404) form = CreateSpacyNLPPipelineJobForm(prefix='create-job-form', version=version) + spacy_nlp_pipeline_models = SpaCyNLPPipelineModel.query.all() if form.is_submitted(): if not form.validate(): response = {'errors': form.errors} @@ -202,6 +204,7 @@ def spacy_nlp_pipeline(): return render_template( 'services/spacy_nlp_pipeline.html.j2', form=form, + spacy_nlp_pipeline_models=spacy_nlp_pipeline_models, title=service_manifest['name'] ) diff --git a/app/templates/contributions/_breadcrumbs.html.j2 b/app/templates/contributions/_breadcrumbs.html.j2 index 4ccfad3b..327d0578 100644 --- a/app/templates/contributions/_breadcrumbs.html.j2 +++ b/app/templates/contributions/_breadcrumbs.html.j2 @@ -2,30 +2,30 @@