mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-07-04 11:43:18 +00:00
Use pathlib where possible
This commit is contained in:
app
converters
corpora
jobs
main
models
__init__.pyavatar.pycorpus.pycorpus_file.pyjob.pyjob_input.pyjob_result.pyspacy_nlp_pipeline_model.pytesseract_ocr_pipeline_model.pyuser.py
services
users
@ -4,7 +4,7 @@ from flask import current_app, url_for
|
||||
from flask_hashids import HashidMixin
|
||||
from sqlalchemy.ext.associationproxy import association_proxy
|
||||
from typing import Union
|
||||
import os
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import xml.etree.ElementTree as ET
|
||||
from app import db
|
||||
@ -88,8 +88,8 @@ class Corpus(HashidMixin, db.Model):
|
||||
return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return os.path.join(self.user.path, 'corpora', str(self.id))
|
||||
def path(self) -> Path:
|
||||
return self.user.path / 'corpora' / f'{self.id}'
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
@ -105,27 +105,39 @@ class Corpus(HashidMixin, db.Model):
|
||||
db.session.add(corpus)
|
||||
db.session.flush(objects=[corpus])
|
||||
db.session.refresh(corpus)
|
||||
corpus_files_dir = corpus.path / 'files'
|
||||
corpus_cwb_dir = corpus.path / 'cwb'
|
||||
corpus_cwb_data_dir = corpus_cwb_dir / 'data'
|
||||
corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
|
||||
try:
|
||||
os.mkdir(corpus.path)
|
||||
os.mkdir(os.path.join(corpus.path, 'files'))
|
||||
os.mkdir(os.path.join(corpus.path, 'cwb'))
|
||||
os.mkdir(os.path.join(corpus.path, 'cwb', 'data'))
|
||||
os.mkdir(os.path.join(corpus.path, 'cwb', 'registry'))
|
||||
corpus.path.mkdir()
|
||||
corpus_files_dir.mkdir()
|
||||
corpus_cwb_dir.mkdir()
|
||||
corpus_cwb_data_dir.mkdir()
|
||||
corpus_cwb_registry_dir.mkdir()
|
||||
except OSError as e:
|
||||
# TODO: Potential leftover cleanup
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
raise e
|
||||
raise
|
||||
return corpus
|
||||
|
||||
def build(self):
|
||||
build_dir = os.path.join(self.path, 'cwb')
|
||||
shutil.rmtree(build_dir, ignore_errors=True)
|
||||
os.mkdir(build_dir)
|
||||
os.mkdir(os.path.join(build_dir, 'data'))
|
||||
os.mkdir(os.path.join(build_dir, 'registry'))
|
||||
corpus_cwb_dir = self.path / 'cwb'
|
||||
corpus_cwb_data_dir = corpus_cwb_dir / 'data'
|
||||
corpus_cwb_registry_dir = corpus_cwb_dir / 'registry'
|
||||
try:
|
||||
shutil.rmtree(corpus_cwb_dir, ignore_errors=True)
|
||||
corpus_cwb_dir.mkdir()
|
||||
corpus_cwb_data_dir.mkdir()
|
||||
corpus_cwb_registry_dir.mkdir()
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
self.status = CorpusStatus.FAILED
|
||||
raise
|
||||
corpus_element = ET.fromstring('<corpus>\n</corpus>')
|
||||
for corpus_file in self.files:
|
||||
normalized_vrt_path = os.path.join(build_dir, f'{corpus_file.id}.norm.vrt')
|
||||
normalized_vrt_path = corpus_cwb_dir / f'{corpus_file.id}.norm.vrt'
|
||||
try:
|
||||
normalize_vrt_file(corpus_file.path, normalized_vrt_path)
|
||||
except:
|
||||
@ -152,7 +164,7 @@ class Corpus(HashidMixin, db.Model):
|
||||
# corpus_element.insert(1, text_element)
|
||||
corpus_element.append(text_element)
|
||||
ET.ElementTree(corpus_element).write(
|
||||
os.path.join(build_dir, 'corpus.vrt'),
|
||||
corpus_cwb_dir / 'corpus.vrt',
|
||||
encoding='utf-8'
|
||||
)
|
||||
self.status = CorpusStatus.SUBMITTED
|
||||
|
Reference in New Issue
Block a user