Preliminary work

2025-10-11 02:42:08 +00:00 · 2021-07-13 16:31:53 +02:00
parent 5139fd9727
commit 4dea95a108
6 changed files with 374 additions and 61 deletions
--- a/251
+++ b/251
@@ -8,6 +8,14 @@ import json
 import os
 import spacy
 import textwrap
+import uuid
+
+
+def UUIDnopaque(name):
+    """Return a stable 'nopaque_'-prefixed UUIDv3 identifier for name."""
+    qualified = '{}@nopaque.sfb1288.uni-bielefeld.de'.format(name)
+    namespaced_uuid = uuid.uuid3(uuid.NAMESPACE_DNS, qualified)
+    return 'nopaque_{}'.format(namespaced_uuid)


 spacy_models = {spacy.info(pipeline)['lang']: pipeline
@@ -70,65 +78,167 @@ meta = {
    }
 }

-
-tags = {
-    'token': {
-        'description': '',
-        'properties': {
-            'lemma': {
+tags = [
+    {
+        'id': UUIDnopaque('token'),
+        'name': 'token',
+        'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',
+        'properties': [
+            {
+                'id': UUIDnopaque('token.lemma'),
+                'name': 'lemma',
                'description': 'The base form of the word',
                'flags': ['required'],
-                'tagset': None
+                'labels': []
            },
-            'pos': {
+            {
+                'id': UUIDnopaque('token.pos'),
+                'name': 'pos',
                'description': 'The detailed part-of-speech tag',
                'flags': ['required'],
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']}  # noqa
+                'labels': [
+                    {
+                        'id': UUIDnopaque('token.pos={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['tagger']
+                ]
            },
-            'simple_pos': {
+            {
+                'id': UUIDnopaque('token.simple_pos'),
+                'name': 'simple_pos',
                'description': 'The simple UPOS part-of-speech tag',
                'flags': ['required'],
-                'tagset': {
-                    'ADJ': 'adjective',
-                    'ADP': 'adposition',
-                    'ADV': 'adverb',
-                    'AUX': 'auxiliary verb',
-                    'CONJ': 'coordinating conjunction',
-                    'DET': 'determiner',
-                    'INTJ': 'interjection',
-                    'NOUN': 'noun',
-                    'NUM': 'numeral',
-                    'PART': 'particle',
-                    'PRON': 'pronoun',
-                    'PROPN': 'proper noun',
-                    'PUNCT': 'punctuation',
-                    'SCONJ': 'subordinating conjunction',
-                    'SYM': 'symbol',
-                    'VERB': 'verb',
-                    'X': 'other'
-                }
+                'labels': [  # ids must be unique per label
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                        'name': 'ADJ',
+                        'description': 'adjective'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADP'),
+                        'name': 'ADP',
+                        'description': 'adposition'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADV'),
+                        'name': 'ADV',
+                        'description': 'adverb'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=AUX'),
+                        'name': 'AUX',
+                        'description': 'auxiliary verb'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=CONJ'),
+                        'name': 'CONJ',
+                        'description': 'coordinating conjunction'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=DET'),
+                        'name': 'DET',
+                        'description': 'determiner'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=INTJ'),
+                        'name': 'INTJ',
+                        'description': 'interjection'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=NOUN'),
+                        'name': 'NOUN',
+                        'description': 'noun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=NUM'),
+                        'name': 'NUM',
+                        'description': 'numeral'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PART'),
+                        'name': 'PART',
+                        'description': 'particle'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PRON'),
+                        'name': 'PRON',
+                        'description': 'pronoun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PROPN'),
+                        'name': 'PROPN',
+                        'description': 'proper noun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PUNCT'),
+                        'name': 'PUNCT',
+                        'description': 'punctuation'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=SCONJ'),
+                        'name': 'SCONJ',
+                        'description': 'subordinating conjunction'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=SYM'),
+                        'name': 'SYM',
+                        'description': 'symbol'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=VERB'),
+                        'name': 'VERB',
+                        'description': 'verb'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=X'),
+                        'name': 'X',
+                        'description': 'other'
+                    }
+                ]
            },
-            'ner': {
-                'description': 'Label indicating the type of the entity',
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
-            }
-        }
-    },
-    's': {
-        'description': 'Encodes the start and end of a sentence',
-        'properties': None
-    },
-    'ent': {
-        'description': 'Encodes the start and end of a named entity',
-        'properties': {
-            'type': {
+            {
+                'id': UUIDnopaque('token.ner'),
+                'name': 'ner',
                'description': 'Label indicating the type of the entity',
                'flags': ['required'],
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
+                'labels': [
+                    {
+                        'id': UUIDnopaque('token.ner={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['ner']
+                ]
            }
-        }
+        ]
+    },
+    {
+        'id': UUIDnopaque('s'),
+        'name': 's',
+        'description': 'Encodes the start and end of a sentence',
+        'properties': []
+    },
+    {
+        'id': UUIDnopaque('ent'),
+        'name': 'ent',
+        'description': 'Encodes the start and end of a named entity',
+        'properties': [
+            {
+                'id': UUIDnopaque('ent.type'),
+                'name': 'type',
+                'description': 'Label indicating the type of the entity',
+                'flags': ['required'],
+                'labels': [
+                    {
+                        'id': UUIDnopaque('ent.type={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['ner']
+                ]
+            }
+        ]
    }
-}
+]

 annotations = []

@@ -142,27 +252,50 @@ while text_chunks:
        if token.is_sent_start:
            annotation = {'start': token.sent.start_char + chunk_offset,
                          'end': token.sent.end_char + chunk_offset,
-                          'tag': 's'}
+                          'tag_id': UUIDnopaque('s'),
+                          'properties': []}
            annotations.append(annotation)
        # Check if the token is the start of an entity
        if token.ent_iob == 3:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
-                    annotation = {'start': ent.start_char + chunk_offset,
-                                  'end': ent.end_char + chunk_offset,
-                                  'tag': 'ent',
-                                  'properties': {'type': token.ent_type_}}
+                    annotation = {
+                        'start': ent.start_char + chunk_offset,
+                        'end': ent.end_char + chunk_offset,
+                        'tag_id': UUIDnopaque('ent'),
+                        'properties': [
+                            {
+                                'property_id': UUIDnopaque('ent.type'),
+                                'value': token.ent_type_
+                            }
+                        ]
+                    }
                    annotations.append(annotation)
                    break
-        annotation = {'start': token.idx + chunk_offset,
-                      'end': token.idx + len(token.text) + chunk_offset,
-                      'tag': 'token',
-                      'properties': {'pos': token.tag_,
-                                     'lemma': token.lemma_,
-                                     'simple_pos': token.pos_}}
-        if token.ent_type_:
-            annotation['properties']['ner'] = token.ent_type_
+        annotation = {
+            'start': token.idx + chunk_offset,
+            'end': token.idx + len(token.text) + chunk_offset,
+            'tag_id': UUIDnopaque('token'),
+            'properties': [
+                {
+                   'property_id': UUIDnopaque('token.pos'),
+                   'value': token.tag_
+                },
+                {
+                    'property_id': UUIDnopaque('token.lemma'),
+                    'value': token.lemma_
+                },
+                {
+                    'property_id': UUIDnopaque('token.simple_pos'),
+                    'value': token.pos_
+                },
+                {
+                    'property_id': UUIDnopaque('token.ner'),
+                    'value': token.ent_type_ if token.ent_type_ else 'None'
+                }
+            ]
+        }
        annotations.append(annotation)
    chunk_offset += len(text_chunk)
    text_chunk = None