From 9c22370eeaf951055c29988183abe40599522c5d Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 28 Nov 2023 12:10:55 +0100 Subject: [PATCH] Implement force download parameter in model insert_defaults methods --- app/models.py | 138 ++++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 67 deletions(-) diff --git a/app/models.py b/app/models.py index 91477f92..ba90ca08 100644 --- a/app/models.py +++ b/app/models.py @@ -953,7 +953,7 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model): return self.user.hashid @staticmethod - def insert_defaults(): + def insert_defaults(force_download=False): nopaque_user = User.query.filter_by(username='nopaque').first() defaults_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), @@ -966,6 +966,7 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model): if model is not None: model.compatible_service_versions = m['compatible_service_versions'] model.description = m['description'] + model.filename = f'{model.id}.traineddata' model.publisher = m['publisher'] model.publisher_url = m['publisher_url'] model.publishing_url = m['publishing_url'] @@ -973,38 +974,39 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model): model.is_public = True model.title = m['title'] model.version = m['version'] - continue - model = TesseractOCRPipelineModel( - compatible_service_versions=m['compatible_service_versions'], - description=m['description'], - publisher=m['publisher'], - publisher_url=m['publisher_url'], - publishing_url=m['publishing_url'], - publishing_year=m['publishing_year'], - is_public=True, - title=m['title'], - user=nopaque_user, - version=m['version'] - ) - db.session.add(model) - db.session.flush(objects=[model]) - db.session.refresh(model) - model.filename = f'{model.id}.traineddata' - r = requests.get(m['url'], stream=True) - pbar = tqdm( - desc=f'{model.title} ({model.filename})', - unit="B", - unit_scale=True, - unit_divisor=1024, - total=int(r.headers['Content-Length']) - ) - pbar.clear() - with open(model.path, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - pbar.update(len(chunk)) - f.write(chunk) - pbar.close() + else: + model = TesseractOCRPipelineModel( + compatible_service_versions=m['compatible_service_versions'], + description=m['description'], + publisher=m['publisher'], + publisher_url=m['publisher_url'], + publishing_url=m['publishing_url'], + publishing_year=m['publishing_year'], + is_public=True, + title=m['title'], + user=nopaque_user, + version=m['version'] + ) + db.session.add(model) + db.session.flush(objects=[model]) + db.session.refresh(model) + model.filename = f'{model.id}.traineddata' + if not os.path.exists(model.path) or force_download: + r = requests.get(m['url'], stream=True) + pbar = tqdm( + desc=f'{model.title} ({model.filename})', + unit="B", + unit_scale=True, + unit_divisor=1024, + total=int(r.headers['Content-Length']) + ) + pbar.clear() + with open(model.path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + pbar.update(len(chunk)) + f.write(chunk) + pbar.close() db.session.commit() def delete(self): @@ -1080,7 +1082,7 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model): return self.user.hashid @staticmethod - def insert_defaults(): + def insert_defaults(force_download=False): nopaque_user = User.query.filter_by(username='nopaque').first() defaults_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), @@ -1093,6 +1095,7 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model): if model is not None: model.compatible_service_versions = m['compatible_service_versions'] model.description = m['description'] + model.filename = m['url'].split('/')[-1] model.publisher = m['publisher'] model.publisher_url = m['publisher_url'] model.publishing_url = m['publishing_url'] @@ -1101,39 +1104,40 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model): model.title = m['title'] model.version = m['version'] model.pipeline_name = m['pipeline_name'] - continue - model = SpaCyNLPPipelineModel( - compatible_service_versions=m['compatible_service_versions'], - description=m['description'], - publisher=m['publisher'], - publisher_url=m['publisher_url'], - publishing_url=m['publishing_url'], - publishing_year=m['publishing_year'], - is_public=True, - title=m['title'], - user=nopaque_user, - version=m['version'], - pipeline_name=m['pipeline_name'] - ) - db.session.add(model) - db.session.flush(objects=[model]) - db.session.refresh(model) - model.filename = m['url'].split('/')[-1] - r = requests.get(m['url'], stream=True) - pbar = tqdm( - desc=f'{model.title} ({model.filename})', - unit="B", - unit_scale=True, - unit_divisor=1024, - total=int(r.headers['Content-Length']) - ) - pbar.clear() - with open(model.path, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - pbar.update(len(chunk)) - f.write(chunk) - pbar.close() + else: + model = SpaCyNLPPipelineModel( + compatible_service_versions=m['compatible_service_versions'], + description=m['description'], + filename=m['url'].split('/')[-1], + publisher=m['publisher'], + publisher_url=m['publisher_url'], + publishing_url=m['publishing_url'], + publishing_year=m['publishing_year'], + is_public=True, + title=m['title'], + user=nopaque_user, + version=m['version'], + pipeline_name=m['pipeline_name'] + ) + db.session.add(model) + db.session.flush(objects=[model]) + db.session.refresh(model) + if not os.path.exists(model.path) or force_download: + r = requests.get(m['url'], stream=True) + pbar = tqdm( + desc=f'{model.title} ({model.filename})', + unit="B", + unit_scale=True, + unit_divisor=1024, + total=int(r.headers['Content-Length']) + ) + pbar.clear() + with open(model.path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + pbar.update(len(chunk)) + f.write(chunk) + pbar.close() db.session.commit() def delete(self):