Compare commits

...

2 Commits

Author SHA1 Message Date
Patrick Jentsch
f7b7da2b1f restrict memory usage for nlp tasks 2021-04-22 08:46:28 +02:00
Patrick Jentsch
2813d1a222 Fix long text processing 2021-04-22 08:43:34 +02:00
2 changed files with 8 additions and 7 deletions

3
nlp
View File

@@ -71,6 +71,7 @@ class NLPPipeline(WorkflowRunner):
''' '''
nlp_tasks = [] nlp_tasks = []
n_cores = max(1, int(self.getNCores() / len(self.jobs))) n_cores = max(1, int(self.getNCores() / len(self.jobs)))
mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
for i, job in enumerate(self.jobs): for i, job in enumerate(self.jobs):
output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name)) # noqa output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name)) # noqa
cmd = 'spacy-nlp' cmd = 'spacy-nlp'
@@ -81,7 +82,7 @@ class NLPPipeline(WorkflowRunner):
deps = 'setup_output_directory_-_{}'.format(i) deps = 'setup_output_directory_-_{}'.format(i)
lbl = 'nlp_-_{}'.format(i) lbl = 'nlp_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl, task = self.addTask(command=cmd, dependencies=deps, label=lbl,
nCores=n_cores) memMb=mem_mb, nCores=n_cores)
nlp_tasks.append(task) nlp_tasks.append(task)
''' '''

View File

@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
for ent_candidate in token.sent.ents: for ent_candidate in token.sent.ents:
if ent_candidate.start_char == token.idx: if ent_candidate.start_char == token.idx:
ent = ent_candidate ent = ent_candidate
break
annotation = {'start': ent.start_char + chunk_offset, annotation = {'start': ent.start_char + chunk_offset,
'end': ent.end_char + chunk_offset, 'end': ent.end_char + chunk_offset,
'tag': 'ent', 'tag': 'ent',
'properties': {'type': token.ent_type_}} 'properties': {'type': token.ent_type_}}
annotations.append(annotation) annotations.append(annotation)
break
annotation = {'start': token.idx + chunk_offset, annotation = {'start': token.idx + chunk_offset,
'end': token.idx + len(token.text) + chunk_offset, 'end': token.idx + len(token.text) + chunk_offset,
'tag': 'token', 'tag': 'token',
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
if token.ent_type_: if token.ent_type_:
annotation['properties']['ner'] = token.ent_type_ annotation['properties']['ner'] = token.ent_type_
annotations.append(annotation) annotations.append(annotation)
chunk_offset = len(text_chunk) chunk_offset += len(text_chunk)
with open(args.output, 'w') as output_file: with open(args.output, 'w') as output_file:
json.dump({'meta': meta, 'tags': tags, 'annotations': annotations}, json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},