From 2813d1a2228b650f1d242fc9ae1736945012abfa Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Thu, 22 Apr 2021 08:43:34 +0200
Subject: [PATCH] Fix long text processing

---
 nlp       |  1 +
 spacy-nlp | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/nlp b/nlp
index 4af40c1..0df5a4b 100755
--- a/nlp
+++ b/nlp
@@ -71,6 +71,7 @@ class NLPPipeline(WorkflowRunner):
         '''
         nlp_tasks = []
         n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+        mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
             output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
             cmd = 'spacy-nlp'
diff --git a/spacy-nlp b/spacy-nlp
index 39d28a6..1950a6d 100755
--- a/spacy-nlp
+++ b/spacy-nlp
@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
             for ent_candidate in token.sent.ents:
                 if ent_candidate.start_char == token.idx:
                     ent = ent_candidate
+                    annotation = {'start': ent.start_char + chunk_offset,
+                                  'end': ent.end_char + chunk_offset,
+                                  'tag': 'ent',
+                                  'properties': {'type': token.ent_type_}}
+                    annotations.append(annotation)
                     break
-            annotation = {'start': ent.start_char + chunk_offset,
-                          'end': ent.end_char + chunk_offset,
-                          'tag': 'ent',
-                          'properties': {'type': token.ent_type_}}
-            annotations.append(annotation)
         annotation = {'start': token.idx + chunk_offset,
                       'end': token.idx + len(token.text) + chunk_offset,
                       'tag': 'token',
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
         if token.ent_type_:
             annotation['properties']['ner'] = token.ent_type_
         annotations.append(annotation)
-        chunk_offset = len(text_chunk)
+        chunk_offset += len(text_chunk)
 
 with open(args.output, 'w') as output_file:
     json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},