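Fix long text processing: accumulate chunk offsets across chunks and emit entity annotations only when a matching entity is found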

2026-08-02 03:13:32 +00:00 · 2021-04-22 08:43:34 +02:00
parent cd976692d6
commit 2813d1a222
2 changed files with 7 additions and 6 deletions
@@ -71,6 +71,7 @@ class NLPPipeline(WorkflowRunner):
        '''
        nlp_tasks = []
        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+        mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
        for i, job in enumerate(self.jobs):
            output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
            cmd = 'spacy-nlp'
@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
+                    annotation = {'start': ent.start_char + chunk_offset,
+                                  'end': ent.end_char + chunk_offset,
+                                  'tag': 'ent',
+                                  'properties': {'type': token.ent_type_}}
+                    annotations.append(annotation)
                    break
-            annotation = {'start': ent.start_char + chunk_offset,
-                          'end': ent.end_char + chunk_offset,
-                          'tag': 'ent',
-                          'properties': {'type': token.ent_type_}}
-            annotations.append(annotation)
        annotation = {'start': token.idx + chunk_offset,
                      'end': token.idx + len(token.text) + chunk_offset,
                      'tag': 'token',
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
        if token.ent_type_:
            annotation['properties']['ner'] = token.ent_type_
        annotations.append(annotation)
-    chunk_offset = len(text_chunk)
+    chunk_offset += len(text_chunk)

 with open(args.output, 'w') as output_file:
    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},