Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git (synced 2025-01-13 05:50:35 +00:00)

Compare commits: cd976692d6...f7b7da2b1f (2 commits)

Commits in this range:
- f7b7da2b1f
- 2813d1a222
nlp (3 changed lines)
@@ -71,6 +71,7 @@ class NLPPipeline(WorkflowRunner):
         '''
         nlp_tasks = []
         n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+        mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
             output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
             cmd = 'spacy-nlp'
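The added mem_mb line splits the pipeline's total memory evenly across jobs and caps each job at 2048 MB per allotted core. A quick check of that formula with made-up numbers (total_cores, total_mem_mb and n_jobs are hypothetical stand-ins for self.getNCores(), self.getMemMb() and len(self.jobs)):

    total_cores, total_mem_mb, n_jobs = 16, 32768, 4          # hypothetical values
    n_cores = max(1, int(total_cores / n_jobs))               # 16 / 4 -> 4
    mem_mb = min(n_cores * 2048, int(total_mem_mb / n_jobs))  # min(8192, 8192) -> 8192
    print(n_cores, mem_mb)  # 4 8192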
@@ -81,7 +82,7 @@ class NLPPipeline(WorkflowRunner):
             deps = 'setup_output_directory_-_{}'.format(i)
             lbl = 'nlp_-_{}'.format(i)
             task = self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                nCores=n_cores)
+                                memMb=mem_mb, nCores=n_cores)
             nlp_tasks.append(task)

         '''
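Passing memMb alongside nCores lets the workflow engine reserve memory per task rather than cores alone. A minimal sketch, assuming the pyflow WorkflowRunner API that NLPPipeline appears to subclass (DemoPipeline and its sleep command are invented for illustration):

    from pyflow import WorkflowRunner

    class DemoPipeline(WorkflowRunner):
        def workflow(self):
            # memMb and nCores tell the scheduler how much memory and CPU
            # to reserve for this task before launching its command
            self.addTask('demo', command='sleep 1', memMb=2048, nCores=1)

    if __name__ == '__main__':
        DemoPipeline().run(mode='local', nCores=1, memMb=2048)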
spacy-nlp (12 changed lines)
@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
             for ent_candidate in token.sent.ents:
                 if ent_candidate.start_char == token.idx:
                     ent = ent_candidate
+                    annotation = {'start': ent.start_char + chunk_offset,
+                                  'end': ent.end_char + chunk_offset,
+                                  'tag': 'ent',
+                                  'properties': {'type': token.ent_type_}}
+                    annotations.append(annotation)
                     break
-            annotation = {'start': ent.start_char + chunk_offset,
-                          'end': ent.end_char + chunk_offset,
-                          'tag': 'ent',
-                          'properties': {'type': token.ent_type_}}
-            annotations.append(annotation)
         annotation = {'start': token.idx + chunk_offset,
                       'end': token.idx + len(token.text) + chunk_offset,
                       'tag': 'token',
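This hunk moves the 'ent' annotation into the branch where a candidate actually matched. Before the change, the dict was built after the loop from whatever ent happened to be bound, so a token whose sentence entities never matched would reuse the previous token's entity, or raise a NameError on the first such token. A simplified stand-in for the spaCy objects, just to show the control flow (Span, ent_annotation and the literal offsets are invented for this sketch):

    from collections import namedtuple

    # simplified stand-in for spaCy's Span objects
    Span = namedtuple('Span', ['start_char', 'end_char'])

    def ent_annotation(token_idx, candidates, chunk_offset):
        for ent_candidate in candidates:
            if ent_candidate.start_char == token_idx:
                # build the annotation only when a candidate actually matches
                return {'start': ent_candidate.start_char + chunk_offset,
                        'end': ent_candidate.end_char + chunk_offset,
                        'tag': 'ent'}
        return None  # no entity starts at this token

    print(ent_annotation(5, [Span(0, 3), Span(5, 9)], 100))  # match -> annotation dict
    print(ent_annotation(4, [Span(0, 3), Span(5, 9)], 100))  # no match -> None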
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
         if token.ent_type_:
             annotation['properties']['ner'] = token.ent_type_
         annotations.append(annotation)
-    chunk_offset = len(text_chunk)
+    chunk_offset += len(text_chunk)

 with open(args.output, 'w') as output_file:
     json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
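The chunk_offset fix matters because annotations must carry offsets relative to the whole input text: with plain '=', the offset only ever reflected the single previous chunk's length, while '+=' accumulates the lengths of all preceding chunks. A tiny illustration with hypothetical chunks:

    text_chunks = ['abc', 'defg', 'hi']  # hypothetical chunking
    chunk_offset = 0
    for text_chunk in text_chunks:
        print('chunk starts at global offset', chunk_offset)  # 0, 3, 7
        chunk_offset += len(text_chunk)  # accumulate across all previous chunks
    # with plain '=', the third chunk would wrongly start at offset 4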