From 2813d1a2228b650f1d242fc9ae1736945012abfa Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Thu, 22 Apr 2021 08:43:34 +0200
Subject: [PATCH] Fix long text processing
---
nlp | 1 +
spacy-nlp | 12 ++++++------
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/nlp b/nlp
index 4af40c1..0df5a4b 100755
--- a/nlp
+++ b/nlp
@@ -71,6 +71,7 @@ class NLPPipeline(WorkflowRunner):
'''
nlp_tasks = []
n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+ mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
for i, job in enumerate(self.jobs):
output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name)) # noqa
cmd = 'spacy-nlp'
diff --git a/spacy-nlp b/spacy-nlp
index 39d28a6..1950a6d 100755
--- a/spacy-nlp
+++ b/spacy-nlp
@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
for ent_candidate in token.sent.ents:
if ent_candidate.start_char == token.idx:
ent = ent_candidate
+ annotation = {'start': ent.start_char + chunk_offset,
+ 'end': ent.end_char + chunk_offset,
+ 'tag': 'ent',
+ 'properties': {'type': token.ent_type_}}
+ annotations.append(annotation)
break
- annotation = {'start': ent.start_char + chunk_offset,
- 'end': ent.end_char + chunk_offset,
- 'tag': 'ent',
- 'properties': {'type': token.ent_type_}}
- annotations.append(annotation)
annotation = {'start': token.idx + chunk_offset,
'end': token.idx + len(token.text) + chunk_offset,
'tag': 'token',
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
if token.ent_type_:
annotation['properties']['ner'] = token.ent_type_
annotations.append(annotation)
- chunk_offset = len(text_chunk)
+ chunk_offset += len(text_chunk)
with open(args.output, 'w') as output_file:
json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},