restrict memory usage for nlp tasks

Fix long text processing
2026-04-26 07:41:37 +00:00 · 2021-04-22 08:46:28 +02:00 · 2021-04-22 08:43:34 +02:00
2 changed files with 8 additions and 7 deletions
--- a/3
+++ b/3
@@ -71,6 +71,7 @@ class NLPPipeline(WorkflowRunner):
        '''
        nlp_tasks = []
        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+        mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs)))
        for i, job in enumerate(self.jobs):
            output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
            cmd = 'spacy-nlp'
@@ -81,7 +82,7 @@ class NLPPipeline(WorkflowRunner):
            deps = 'setup_output_directory_-_{}'.format(i)
            lbl = 'nlp_-_{}'.format(i)
            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                nCores=n_cores)
+                                memMb=mem_mb, nCores=n_cores)
            nlp_tasks.append(task)

        '''
--- a/4
+++ b/4
@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
-                    break
                    annotation = {'start': ent.start_char + chunk_offset,
                                  'end': ent.end_char + chunk_offset,
                                  'tag': 'ent',
                                  'properties': {'type': token.ent_type_}}
                    annotations.append(annotation)
+                    break
        annotation = {'start': token.idx + chunk_offset,
                      'end': token.idx + len(token.text) + chunk_offset,
                      'tag': 'token',
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
        if token.ent_type_:
            annotation['properties']['ner'] = token.ent_type_
        annotations.append(annotation)
-    chunk_offset = len(text_chunk)
+    chunk_offset += len(text_chunk)

 with open(args.output, 'w') as output_file:
    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
Author	SHA1	Message	Date
Patrick Jentsch	f7b7da2b1f	restrict memory usage for nlp tasks	2021-04-22 08:46:28 +02:00
Patrick Jentsch	2813d1a222	Fix long text processing	2021-04-22 08:43:34 +02:00