From 2813d1a2228b650f1d242fc9ae1736945012abfa Mon Sep 17 00:00:00 2001
From: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
Date: Thu, 22 Apr 2021 08:43:34 +0200
Subject: [PATCH] Fix long text processing

---
 nlp       |  1 +
 spacy-nlp | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/nlp b/nlp
index 4af40c1..0df5a4b 100755
--- a/nlp
+++ b/nlp
@@ -71,6 +71,7 @@ class NLPPipeline(WorkflowRunner):
         '''
         nlp_tasks = []
         n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+        mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
             output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
             cmd = 'spacy-nlp'
diff --git a/spacy-nlp b/spacy-nlp
index 39d28a6..1950a6d 100755
--- a/spacy-nlp
+++ b/spacy-nlp
@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
             for ent_candidate in token.sent.ents:
                 if ent_candidate.start_char == token.idx:
                     ent = ent_candidate
+                    annotation = {'start': ent.start_char + chunk_offset,
+                                  'end': ent.end_char + chunk_offset,
+                                  'tag': 'ent',
+                                  'properties': {'type': token.ent_type_}}
+                    annotations.append(annotation)
                     break
-            annotation = {'start': ent.start_char + chunk_offset,
-                          'end': ent.end_char + chunk_offset,
-                          'tag': 'ent',
-                          'properties': {'type': token.ent_type_}}
-            annotations.append(annotation)
         annotation = {'start': token.idx + chunk_offset,
                       'end': token.idx + len(token.text) + chunk_offset,
                       'tag': 'token',
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
         if token.ent_type_:
             annotation['properties']['ner'] = token.ent_type_
         annotations.append(annotation)
-    chunk_offset = len(text_chunk)
+    chunk_offset += len(text_chunk)
 
 with open(args.output, 'w') as output_file:
     json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},