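Fix long text processing: accumulate the chunk offset across text chunks and append the entity annotation before breaking out of the entity search loop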

2026-06-19 00:45:44 +00:00 · 2021-04-22 08:43:34 +02:00
parent cd976692d6
commit 2813d1a222
2 changed files with 7 additions and 6 deletions
@@ -71,6 +71,7 @@ class NLPPipeline(WorkflowRunner):
        '''
        nlp_tasks = []
        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+        mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
        for i, job in enumerate(self.jobs):
            output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
            cmd = 'spacy-nlp'
@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
-                    break
                    annotation = {'start': ent.start_char + chunk_offset,
                                  'end': ent.end_char + chunk_offset,
                                  'tag': 'ent',
                                  'properties': {'type': token.ent_type_}}
                    annotations.append(annotation)
+                    break
        annotation = {'start': token.idx + chunk_offset,
                      'end': token.idx + len(token.text) + chunk_offset,
                      'tag': 'token',
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
        if token.ent_type_:
            annotation['properties']['ner'] = token.ent_type_
        annotations.append(annotation)
-    chunk_offset = len(text_chunk)
+    chunk_offset += len(text_chunk)

 with open(args.output, 'w') as output_file:
    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},