Preliminary work

2026-08-02 04:33:33 +00:00 · 2021-07-13 16:31:53 +02:00
parent 5139fd9727
commit 4dea95a108
6 changed files with 374 additions and 61 deletions
@@ -37,6 +37,11 @@ class NLPPipelineJob:
        self.file = file
        self.name = os.path.basename(file).rsplit('.', 1)[0]
        self.output_dir = output_dir
+        catma_stand_off_data_file = file.rsplit('.', 1)[0] + '.catma-stand-off.json'  # noqa
+        if os.path.exists(catma_stand_off_data_file):
+            self.catma_stand_off_data_file = catma_stand_off_data_file
+        else:
+            self.catma_stand_off_data_file = None


 class NLPPipeline(WorkflowRunner):
@@ -93,10 +98,12 @@ class NLPPipeline(WorkflowRunner):
        vrt_creation_tasks = []
        for i, job in enumerate(self.jobs):
            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
-            nlp_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
+            nopaque_stand_off_data_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
            cmd = 'vrt-creator'
            cmd += ' "{}"'.format(job.file)
-            cmd += ' "{}"'.format(nlp_file)
+            cmd += ' "{}"'.format(nopaque_stand_off_data_file)
+            if job.catma_stand_off_data_file is not None:
+                cmd += ' --catma-stand-off-data "{}"'.format(job.catma_stand_off_data_file)  # noqa
            cmd += ' "{}"'.format(output_file)
            deps = 'nlp_-_{}'.format(i)
            lbl = 'vrt_creation_-_{}'.format(i)
@@ -0,0 +1,126 @@
+'''
+    'generator': {
+        'name': 'nopaque NLP service',
+        'version': '1.0.0',
+        'arguments': {
+            'check_encoding': args.check_encoding,
+            'language': args.language
+        }
+    },
+    'file': {
+        'encoding': encoding,
+        'md5': text_md5.hexdigest(),
+        'name': os.path.basename(args.input)
+    }
+'''
+
+
+class StandOffData:
+    '''Tag definitions and tag annotations built from an attribute mapping.
+
+    attrs may provide a 'tags' list and an 'annotations' list; each one
+    defaults to an empty list when it is absent.
+    '''
+
+    def __init__(self, attrs):
+        # Create all tag definitions first, then index them by their id so
+        # that the annotations can look them up.
+        tag_definitions = [TagDefinition(x) for x in attrs.get('tags', [])]
+        self.tags = {}
+        for tag_definition in tag_definitions:
+            self.tags[tag_definition.id] = tag_definition
+        self.annotations = []
+        for annotation_attrs in attrs.get('annotations', []):
+            self.annotations.append(
+                TagAnnotation(annotation_attrs, self.tags)
+            )
+
+
+class TagAnnotation:
+    '''A tag applied to the span from start to end.
+
+    attrs must contain 'tag_id', 'start' and 'end'; 'description' and
+    'properties' are optional. tag_lookup maps tag ids to TagDefinition
+    objects.
+
+    Raises an Exception if the tag id is unknown, if start is not lower than
+    end or if a property required by the tag definition is missing.
+
+    Instances are ordered by start; for equal starts an annotation named
+    'token' sorts before every other annotation.
+    '''
+
+    def __init__(self, attrs, tag_lookup):
+        self.tag_id = attrs['tag_id']
+        self.tag_lookup = tag_lookup
+        if self.tag_id not in self.tag_lookup:
+            raise Exception('Unknown tag id: {}'.format(self.tag_id))
+        self.start = attrs['start']
+        self.end = attrs['end']
+        if self.start >= self.end:
+            raise Exception('start must be lower then end')
+        self.description = attrs.get('description', '')
+        tag_definition = self.tag_lookup[self.tag_id]
+        self.properties = [
+            PropertyAnnotation(x, tag_definition.properties)
+            for x in attrs.get('properties', [])
+        ]
+        # self.properties holds PropertyAnnotation objects, so the required
+        # ids have to be compared with the ids of those objects and not with
+        # the objects themselves.
+        present_property_ids = {x.property_id for x in self.properties}
+        for required_property_id in tag_definition.required_properties:
+            if required_property_id not in present_property_ids:
+                raise Exception('Missing required property: {}'.format(required_property_id))  # noqa
+
+    @property
+    def name(self):
+        '''Name of the tag definition this annotation refers to.'''
+        return self.tag_lookup[self.tag_id].name
+
+    def __lt__(self, other):
+        if self.start == other.start:
+            return self.name == 'token' and other.name != 'token'
+        else:
+            return self.start < other.start
+
+    def __le__(self, other):
+        if self.start == other.start:
+            return self.name == 'token' or other.name != 'token'
+        else:
+            # The starts differ in this branch, so < is sufficient
+            return self.start < other.start
+
+    def __eq__(self, other):
+        return self.start == other.start and self.name == other.name
+
+    def __ne__(self, other):
+        # Exact negation of __eq__: one differing attribute is enough
+        return self.start != other.start or self.name != other.name
+
+    def __gt__(self, other):
+        if self.start == other.start:
+            return self.name != 'token' and other.name == 'token'
+        else:
+            return self.start > other.start
+
+    def __ge__(self, other):
+        if self.start == other.start:
+            return self.name != 'token' or other.name == 'token'
+        else:
+            # The starts differ in this branch, so > is sufficient
+            return self.start > other.start
+
+
+class PropertyAnnotation:
+    '''The value assigned to one property of a tag annotation.
+
+    attrs must contain 'property_id' and 'value'. property_lookup maps
+    property ids to the property definitions of the annotated tag.
+
+    Raises an Exception if the property id is not part of property_lookup.
+    '''
+
+    def __init__(self, attrs, property_lookup):
+        # Read from the attrs parameter; the name property refers to the
+        # builtin of the same name and cannot be subscripted.
+        self.property_id = attrs['property_id']
+        self.property_lookup = property_lookup
+        if self.property_id not in self.property_lookup:
+            raise Exception('Unknown property id: {}'.format(self.property_id))
+        self.value = attrs['value']
+        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
+
+    @property
+    def name(self):
+        '''Name of the property definition this annotation refers to.'''
+        return self.property_lookup[self.property_id].name
+
+
+class TagDefinition:
+    '''Definition of a tag together with its property definitions.
+
+    attrs must contain 'id' and 'name'; 'description' defaults to an empty
+    string and 'properties' to an empty list.
+    '''
+
+    def __init__(self, attrs):
+        self.id = attrs['id']
+        self.name = attrs['name']
+        self.description = attrs.get('description', '')
+        # Property definitions indexed by their id
+        self.properties = {
+            property_definition.id: property_definition
+            for property_definition in [
+                PropertyDefinition(x) for x in attrs.get('properties', [])
+            ]
+        }
+
+    @property
+    def required_properties(self):
+        '''Return the required property definitions indexed by their id.'''
+        # self.properties is a dict: iterating it directly yields only the
+        # ids, so the definitions have to be taken from items().
+        return {
+            property_id: property_definition
+            for property_id, property_definition in self.properties.items()
+            if property_definition.is_required
+        }
+
+
+class PropertyDefinition:
+    '''Definition of a single property of a tag.
+
+    attrs must contain 'id' and 'name'; 'description' defaults to an empty
+    string, 'flags' and 'labels' default to empty lists.
+    '''
+
+    def __init__(self, attrs):
+        self.id = attrs['id']
+        self.name = attrs['name']
+        self.description = attrs.get('description', '')
+        self.flags = attrs.get('flags', [])
+        self.labels = attrs.get('labels', [])
+
+    def _has_flag(self, flag):
+        # Membership test shared by the flag properties below
+        return flag in self.flags
+
+    @property
+    def is_required(self):
+        '''True if the flags contain 'required'.'''
+        return self._has_flag('required')
+
+    @property
+    def has_multiple_values(self):
+        '''True if the flags contain 'multiple'.'''
+        return self._has_flag('multiple')
@@ -0,0 +1,47 @@
+def create_vrt(text, stand_off_data):
+    '''Prepare stand-off annotations for CWB's verticalized text format.
+
+    Work in progress: the annotations are divided, sorted, checked for
+    overlaps and indexed by position. The .vrt output itself is not generated
+    yet, so nothing is returned and text is not used so far.
+
+    stand_off_data must provide an annotations list whose items are sortable
+    and offer name, start and end. The start and end of structural
+    annotations are modified in place when they fall inside a token.
+
+    Raises an Exception if two positional attributes overlap.
+    '''
+    # Divide annotations into CWB's verticalized text format (.vrt) logic
+    p_attrs = []    # positional attributes
+    s_attrs = []    # structural attributes
+    for annotation in stand_off_data.annotations:
+        if annotation.name == 'token':
+            p_attrs.append(annotation)
+        else:
+            s_attrs.append(annotation)
+    # Sort annotations, necessary for the next checks
+    p_attrs.sort()
+    s_attrs.sort()
+    # Check for p_attr<->p_attr overlap
+    # The p_attrs are sorted by start and an end is treated as exclusive (like
+    # in the s_attr checks below), so two neighbours overlap exactly when the
+    # first one ends after the next one starts. Tokens that merely touch each
+    # other are valid.
+    for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
+        if p_attr.end > next_p_attr.start:
+            raise Exception('Positional attribute overlaps another')
+    # Check for s_attr<->p_attr overlap
+    for s_attr in s_attrs:
+        for p_attr in p_attrs:
+            # p_attr ends before s_attr starts, a later p_attr may still
+            # overlap, so keep looking
+            if p_attr.end <= s_attr.start:
+                continue
+            # p_attr starts after s_attr ends
+            if p_attr.start >= s_attr.end:
+                # No further checking needed (just because p_attrs are sorted)
+                break
+            # From here on p_attr and s_attr share at least one position
+            # Check if s_attr starts within p_attr
+            if s_attr.start > p_attr.start:
+                # Change s_attr start to p_attr's start
+                s_attr.start = p_attr.start
+            # Check if s_attr ends within p_attr
+            if s_attr.end < p_attr.end:
+                # Change s_attr end to p_attr's end
+                s_attr.end = p_attr.end
+    # Map each start/end position to the indices of the s_attrs that start or
+    # end there. setdefault creates the list on first use, a plain lookup
+    # would fail for a position that has not been seen yet.
+    s_attr_start_buffer = {}
+    s_attr_end_buffer = {}
+    for i, s_attr in enumerate(s_attrs):
+        s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
+        s_attr_end_buffer.setdefault(s_attr.end, []).append(i)
+    vrt = ''
+    # TODO do the work!
@@ -8,6 +8,14 @@ import json
 import os
 import spacy
 import textwrap
+import uuid
+
+
+def UUIDnopaque(name):
+    '''Return a reproducible nopaque id for name.
+
+    The id is the string 'nopaque_' followed by the version 3 UUID of
+    '<name>@nopaque.sfb1288.uni-bielefeld.de' in the DNS namespace, so the
+    same name always results in the same id.
+    '''
+    qualified_name = '{}@nopaque.sfb1288.uni-bielefeld.de'.format(name)
+    name_based_uuid = uuid.uuid3(uuid.NAMESPACE_DNS, qualified_name)
+    return 'nopaque_{}'.format(name_based_uuid)


 spacy_models = {spacy.info(pipeline)['lang']: pipeline
@@ -70,65 +78,167 @@ meta = {
    }
 }

-
-tags = {
-    'token': {
-        'description': '',
-        'properties': {
-            'lemma': {
+tags = [
+    {
+        'id': UUIDnopaque('token'),
+        'name': 'token',
+        'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',
+        'properties': [
+            {
+                'id': UUIDnopaque('token.lemma'),
+                'name': 'lemma',
                'description': 'The base form of the word',
                'flags': ['required'],
-                'tagset': None
+                'labels': []
            },
-            'pos': {
+            {
+                'id': UUIDnopaque('token.pos'),
+                'name': 'pos',
                'description': 'The detailed part-of-speech tag',
                'flags': ['required'],
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']}  # noqa
+                'labels': [
+                    {
+                        'id': UUIDnopaque('token.pos={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['tagger']
+                ]
            },
-            'simple_pos': {
+            {
+                'id': UUIDnopaque('token.simple_pos'),
+                'name': 'simple_pos',
                'description': 'The simple UPOS part-of-speech tag',
                'flags': ['required'],
-                'tagset': {
-                    'ADJ': 'adjective',
-                    'ADP': 'adposition',
-                    'ADV': 'adverb',
-                    'AUX': 'auxiliary verb',
-                    'CONJ': 'coordinating conjunction',
-                    'DET': 'determiner',
-                    'INTJ': 'interjection',
-                    'NOUN': 'noun',
-                    'NUM': 'numeral',
-                    'PART': 'particle',
-                    'PRON': 'pronoun',
-                    'PROPN': 'proper noun',
-                    'PUNCT': 'punctuation',
-                    'SCONJ': 'subordinating conjunction',
-                    'SYM': 'symbol',
-                    'VERB': 'verb',
-                    'X': 'other'
-                }
+                'labels': [
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                        'name': 'ADJ',
+                        'description': 'adjective'
                    },
-            'ner': {
-                'description': 'Label indicating the type of the entity',
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
-            }
-        }
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADP'),
+                        'name': 'ADP',
+                        'description': 'adposition'
                    },
-    's': {
-        'description': 'Encodes the start and end of a sentence',
-        'properties': None
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADV'),
+                        'name': 'ADV',
+                        'description': 'adverb'
                    },
-    'ent': {
-        'description': 'Encodes the start and end of a named entity',
-        'properties': {
-            'type': {
+                    {
+                        'id': UUIDnopaque('token.simple_pos=AUX'),
+                        'name': 'AUX',
+                        'description': 'auxiliary verb'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=CONJ'),
+                        'name': 'CONJ',
+                        'description': 'coordinating conjunction'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=DET'),
+                        'name': 'DET',
+                        'description': 'determiner'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=INTJ'),
+                        'name': 'INTJ',
+                        'description': 'interjection'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=NOUN'),
+                        'name': 'NOUN',
+                        'description': 'noun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=NUM'),
+                        'name': 'NUM',
+                        'description': 'numeral'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PART'),
+                        'name': 'PART',
+                        'description': 'particle'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PRON'),
+                        'name': 'PRON',
+                        'description': 'pronoun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PROPN'),
+                        'name': 'PROPN',
+                        'description': 'proper noun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PUNCT'),
+                        'name': 'PUNCT',
+                        'description': 'punctuation'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=SCONJ'),
+                        'name': 'SCONJ',
+                        'description': 'subordinating conjunction'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=SYM'),
+                        'name': 'SYM',
+                        'description': 'symbol'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=VERB'),
+                        'name': 'VERB',
+                        'description': 'verb'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=X'),
+                        'name': 'X',
+                        'description': 'other'
+                    }
+                ]
+            },
+            {
+                'id': UUIDnopaque('token.ner'),
+                'name': 'ner',
                'description': 'Label indicating the type of the entity',
                'flags': ['required'],
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
-            }
+                'labels': [
+                    {
+                        'id': UUIDnopaque('token.ner={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['ner']
+                ]
            }
+        ]
+    },
+    {
+        'id': UUIDnopaque('s'),
+        'name': 's',
+        'description': 'Encodes the start and end of a sentence',
+        'properties': []
+    },
+    {
+        'id': UUIDnopaque('ent'),
+        'name': 'ent',
+        'description': 'Encodes the start and end of a named entity',
+        'properties': [
+            {
+                'id': UUIDnopaque('ent.type'),
+                'name': 'type',
+                'description': 'Label indicating the type of the entity',
+                'flags': ['required'],
+                'labels': [
+                    {
+                        'id': UUIDnopaque('ent.type={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['ner']
+                ]
            }
+        ]
    }
+]

 annotations = []

@@ -142,27 +252,50 @@ while text_chunks:
        if token.is_sent_start:
            annotation = {'start': token.sent.start_char + chunk_offset,
                          'end': token.sent.end_char + chunk_offset,
-                          'tag': 's'}
+                          'tag_id': UUIDnopaque('s'),
+                          'properties': []}
            annotations.append(annotation)
        # Check if the token is the start of an entity
        if token.ent_iob == 3:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
-                    annotation = {'start': ent.start_char + chunk_offset,
+                    annotation = {
+                        'start': ent.start_char + chunk_offset,
                        'end': ent.end_char + chunk_offset,
-                                  'tag': 'ent',
-                                  'properties': {'type': token.ent_type_}}
+                        'tag_id': UUIDnopaque('ent'),
+                        'properties': [
+                            {
+                                'property_id': UUIDnopaque('ent.type'),
+                                'value': token.ent_type_
+                            }
+                        ]
+                    }
                    annotations.append(annotation)
                    break
-        annotation = {'start': token.idx + chunk_offset,
+        annotation = {
+            'start': token.idx + chunk_offset,
            'end': token.idx + len(token.text) + chunk_offset,
-                      'tag': 'token',
-                      'properties': {'pos': token.tag_,
-                                     'lemma': token.lemma_,
-                                     'simple_pos': token.pos_}}
-        if token.ent_type_:
-            annotation['properties']['ner'] = token.ent_type_
+            'tag_id': UUIDnopaque('token'),
+            'properties': [
+                {
+                   'property_id': UUIDnopaque('token.pos'),
+                   'value': token.tag_
+                },
+                {
+                    'property_id': UUIDnopaque('token.lemma'),
+                    'value': token.lemma_
+                },
+                {
+                    'property_id': UUIDnopaque('token.simple_pos'),
+                    'value': token.pos_
+                },
+                {
+                    'property_id': UUIDnopaque('token.ner'),
+                    'value': token.ent_type_ if token.ent_type_ else 'None'
+                }
+            ]
+        }
        annotations.append(annotation)
    chunk_offset += len(text_chunk)
    text_chunk = None