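Fix long text processing: accumulate chunk_offset across text chunks and emit an entity annotation only when a matching entity is found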

2025-10-11 02:42:08 +00:00 · 2021-04-22 08:43:34 +02:00
parent cd976692d6
commit 2813d1a222
2 changed files with 7 additions and 6 deletions
--- a/12
+++ b/12
@@ -142,12 +142,12 @@ for text_chunk in text_chunks:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
+                    annotation = {'start': ent.start_char + chunk_offset,
+                                  'end': ent.end_char + chunk_offset,
+                                  'tag': 'ent',
+                                  'properties': {'type': token.ent_type_}}
+                    annotations.append(annotation)
                    break
-            annotation = {'start': ent.start_char + chunk_offset,
-                          'end': ent.end_char + chunk_offset,
-                          'tag': 'ent',
-                          'properties': {'type': token.ent_type_}}
-            annotations.append(annotation)
        annotation = {'start': token.idx + chunk_offset,
                      'end': token.idx + len(token.text) + chunk_offset,
                      'tag': 'token',
@@ -157,7 +157,7 @@ for text_chunk in text_chunks:
        if token.ent_type_:
            annotation['properties']['ner'] = token.ent_type_
        annotations.append(annotation)
-    chunk_offset = len(text_chunk)
+    chunk_offset += len(text_chunk)

 with open(args.output, 'w') as output_file:
    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},