Preliminary work

2025-07-01 13:50:33 +00:00 · 2021-07-13 16:31:53 +02:00
parent 5139fd9727
commit 4dea95a108
6 changed files with 374 additions and 61 deletions
--- a/11
+++ b/11
@ -37,6 +37,11 @@ class NLPPipelineJob:
        self.file = file
        self.name = os.path.basename(file).rsplit('.', 1)[0]
        self.output_dir = output_dir
        catma_stand_off_data_file = file.rsplit('.', 1)[0] + '.catma-stand-off.json'  # noqa
        if os.path.exists(catma_stand_off_data_file):
            self.catma_stand_off_data_file = catma_stand_off_data_file
        else:
            self.catma_stand_off_data_file = None
 class NLPPipeline(WorkflowRunner):
@ -93,10 +98,12 @@ class NLPPipeline(WorkflowRunner):
        vrt_creation_tasks = []
        for i, job in enumerate(self.jobs):
            output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
-            nlp_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
+            nopaque_stand_off_data_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
            cmd = 'vrt-creator'
            cmd += ' "{}"'.format(job.file)
-            cmd += ' "{}"'.format(nlp_file)
+            cmd += ' "{}"'.format(nopaque_stand_off_data_file)
            if job.catma_stand_off_data_file is not None:
                cmd += ' --catma-stand-off-data "{}"'.format(job.catma_stand_off_data_file)  # noqa
            cmd += ' "{}"'.format(output_file)
            deps = 'nlp_-_{}'.format(i)
            lbl = 'vrt_creation_-_{}'.format(i)
--- a/packages/stand-off-data-py/setup.py
+++ b/packages/stand-off-data-py/setup.py
--- a/packages/stand-off-data-py/stand_off_data/init.py
+++ b/packages/stand-off-data-py/stand_off_data/init.py
--- a/packages/stand-off-data-py/stand_off_data/models.py
+++ b/packages/stand-off-data-py/stand_off_data/models.py
@ -0,0 +1,126 @@
 '''
    'generator': {
        'name': 'nopaque NLP service',
        'version': '1.0.0',
        'arguments': {
            'check_encoding': args.check_encoding,
            'language': args.language
        }
    },
    'file': {
        'encoding': encoding,
        'md5': text_md5.hexdigest(),
        'name': os.path.basename(args.input)
    }
 '''
 class StandOffData:
    """Tag definitions and tag annotations read from a stand-off record.

    ``attrs`` is a mapping that may hold a ``'tags'`` list (tag definition
    records) and an ``'annotations'`` list (tag annotation records). Both
    default to an empty list when the key is absent.
    """

    def __init__(self, attrs):
        # Index the tag definitions by id so annotations can resolve them.
        self.tags = {}
        for tag_attrs in attrs.get('tags', []):
            definition = TagDefinition(tag_attrs)
            self.tags[definition.id] = definition
        # Every annotation is handed the same tag lookup table.
        self.annotations = []
        for annotation_attrs in attrs.get('annotations', []):
            self.annotations.append(
                TagAnnotation(annotation_attrs, self.tags))
 class TagAnnotation:
    """A tag applied to the character span ``[start, end)`` of a text.

    Args:
        attrs: mapping with the keys ``'tag_id'``, ``'start'`` and ``'end'``
            and, optionally, ``'description'`` and ``'properties'``.
        tag_lookup: mapping from tag id to a tag definition object exposing
            ``name``, ``properties`` and ``required_properties``.

    Raises:
        Exception: if the tag id is unknown, if ``start`` is not lower than
            ``end``, or if a required property of the tag is missing.

    Ordering is by ``start``; among annotations with the same start, the one
    named ``'token'`` sorts before any other annotation.
    """

    def __init__(self, attrs, tag_lookup):
        self.tag_id = attrs['tag_id']
        self.tag_lookup = tag_lookup
        if self.tag_id not in self.tag_lookup:
            raise Exception('Unknown tag id: {}'.format(self.tag_id))
        self.start = attrs['start']
        self.end = attrs['end']
        if self.start >= self.end:
            raise Exception('start must be lower then end')
        self.description = attrs.get('description', '')
        tag_definition = self.tag_lookup[self.tag_id]
        self.properties = [
            PropertyAnnotation(x, tag_definition.properties)
            for x in attrs.get('properties', [])
        ]
        # Compare required ids against the ids of the supplied properties;
        # testing membership in the list of PropertyAnnotation objects
        # itself could never match an id.
        present_property_ids = {x.property_id for x in self.properties}
        for required_property_id in tag_definition.required_properties:
            if required_property_id not in present_property_ids:
                raise Exception(
                    'Missing required property: {}'.format(required_property_id)
                )

    @property
    def name(self):
        """Name of the tag definition this annotation refers to."""
        return self.tag_lookup[self.tag_id].name

    def __lt__(self, other):
        if self.start == other.start:
            return self.name == 'token' and other.name != 'token'
        else:
            return self.start < other.start

    def __le__(self, other):
        if self.start == other.start:
            return self.name == 'token' or other.name != 'token'
        else:
            return self.start < other.start

    def __eq__(self, other):
        return self.start == other.start and self.name == other.name

    def __ne__(self, other):
        # Must be the exact negation of __eq__: annotations differ as soon as
        # either the start or the name differs.
        return not self.__eq__(other)

    def __gt__(self, other):
        if self.start == other.start:
            return self.name != 'token' and other.name == 'token'
        else:
            return self.start > other.start

    def __ge__(self, other):
        if self.start == other.start:
            return self.name != 'token' or other.name == 'token'
        else:
            return self.start > other.start
 class PropertyAnnotation:
    """The value assigned to one property of a tag annotation.

    Args:
        attrs: mapping with the keys ``'property_id'`` and ``'value'``.
        property_lookup: mapping from property id to a property definition
            object exposing ``name``.

    Raises:
        Exception: if the property id is not present in ``property_lookup``.
    """

    def __init__(self, attrs, property_lookup):
        # Read from ``attrs``: the bare name ``property`` is the builtin
        # type, which cannot be indexed.
        self.property_id = attrs['property_id']
        self.property_lookup = property_lookup
        if self.property_id not in self.property_lookup:
            raise Exception('Unknown property id: {}'.format(self.property_id))
        self.value = attrs['value']
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)

    @property
    def name(self):
        """Name of the property definition this annotation refers to."""
        return self.property_lookup[self.property_id].name
 class TagDefinition:
    """Definition of a tag: id, name, description and property definitions.

    Args:
        attrs: mapping with the keys ``'id'`` and ``'name'`` and, optionally,
            ``'description'`` and ``'properties'`` (a list of property
            definition records).
    """

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        # Property definitions indexed by their id.
        self.properties = {
            property_definition.id: property_definition
            for property_definition in [
                PropertyDefinition(x) for x in attrs.get('properties', [])
            ]
        }

    @property
    def required_properties(self):
        """Return the required property definitions, indexed by id."""
        # Iterate the values: iterating the dict itself yields the id keys,
        # which have neither ``id`` nor ``is_required``.
        return {
            property_definition.id: property_definition
            for property_definition in self.properties.values()
            if property_definition.is_required
        }
 class PropertyDefinition:
    """Definition of a tag property: id, name, description, flags, labels.

    ``attrs`` must provide ``'id'`` and ``'name'``; ``'description'``
    defaults to an empty string, ``'flags'`` and ``'labels'`` to empty lists.
    """

    def __init__(self, attrs):
        # Mandatory keys first, so a missing one raises KeyError right away.
        self.id = attrs['id']
        self.name = attrs['name']
        optional = attrs.get
        self.description = optional('description', '')
        self.flags = optional('flags', [])
        self.labels = optional('labels', [])

    def _has_flag(self, flag):
        # Membership test against the flags given in the definition record.
        return flag in self.flags

    @property
    def is_required(self):
        """Whether the ``'required'`` flag is set."""
        return self._has_flag('required')

    @property
    def has_multiple_values(self):
        """Whether the ``'multiple'`` flag is set."""
        return self._has_flag('multiple')
--- a/packages/stand-off-data-py/stand_off_data/utils.py
+++ b/packages/stand-off-data-py/stand_off_data/utils.py
@ -0,0 +1,47 @@
 def create_vrt(text, stand_off_data):
    """Validate and index stand-off annotations for CWB's .vrt format.

    Work in progress: annotations are split, checked and indexed, but the
    verticalized text itself is not produced yet, so nothing is returned.

    Args:
        text: the annotated text (not used yet).
        stand_off_data: object whose ``annotations`` attribute is a list of
            mutually sortable items exposing ``name``, ``start`` and ``end``.

    Raises:
        Exception: if two positional attributes (tokens) overlap.
    """
    # Divide annotations according to CWB's verticalized text format (.vrt)
    # logic
    p_attrs = []    # positional attributes
    s_attrs = []    # structural attributes
    for annotation in stand_off_data.annotations:
        if annotation.name == 'token':
            p_attrs.append(annotation)
        else:
            s_attrs.append(annotation)
    # Sort annotations, necessary for the next checks
    p_attrs.sort()
    s_attrs.sort()
    # Check for p_attr<->p_attr overlap. End offsets are treated as
    # exclusive, so a token that ends exactly where the next one starts is
    # fine; with sorted tokens, any overlap (including full containment)
    # shows up as the next token starting before the current one ends.
    for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
        if next_p_attr.start < p_attr.end:
            raise Exception('Positional attribute overlaps another')
    # Check for s_attr<->p_attr overlap and widen s_attrs to token borders
    for s_attr in s_attrs:
        for p_attr in p_attrs:
            if p_attr.end <= s_attr.start:
                # Token lies entirely before the s_attr: keep looking,
                # later tokens may still overlap it.
                continue
            if p_attr.start >= s_attr.end:
                # Token lies entirely after the s_attr and p_attrs are
                # sorted, so no further token can overlap it.
                break
            # Check if s_attr starts within p_attr
            if p_attr.start < s_attr.start < p_attr.end:
                # Change s_attr start to p_attr's start
                s_attr.start = p_attr.start
            # Check if s_attr ends within p_attr
            if p_attr.start < s_attr.end < p_attr.end:
                # Change s_attr end to p_attr's end
                s_attr.end = p_attr.end
    # Map each text position to the indices of the s_attrs that start/end
    # there.
    s_attr_start_buffer = {}
    s_attr_end_buffer = {}
    for i, s_attr in enumerate(s_attrs):
        s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
        s_attr_end_buffer.setdefault(s_attr.end, []).append(i)
    vrt = ''
    # TODO do the work!
--- a/241
+++ b/241
@ -8,6 +8,14 @@ import json
 import os
 import spacy
 import textwrap
 import uuid
 def UUIDnopaque(name):
    """Return a deterministic nopaque identifier for ``name``.

    The identifier is ``nopaque_`` followed by the name-based UUID
    (``uuid.uuid3``, DNS namespace) of
    ``<name>@nopaque.sfb1288.uni-bielefeld.de``.
    """
    qualified_name = '{}@nopaque.sfb1288.uni-bielefeld.de'.format(name)
    name_uuid = uuid.uuid3(uuid.NAMESPACE_DNS, qualified_name)
    return 'nopaque_{}'.format(name_uuid)
 spacy_models = {spacy.info(pipeline)['lang']: pipeline
@ -70,65 +78,167 @@ meta = {
    }
 }
-
+tags = [
-tags = {
+    {
-    'token': {
+        'id': UUIDnopaque('token'),
-        'description': '',
+        'name': 'token',
-        'properties': {
+        'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',
-            'lemma': {
+        'properties': [
            {
                'id': UUIDnopaque('token.lemma'),
                'name': 'lemma',
                'description': 'The base form of the word',
                'flags': ['required'],
-                'tagset': None
+                'labels': []
            },
-            'pos': {
+            {
                'id': UUIDnopaque('token.pos'),
                'name': 'pos',
                'description': 'The detailed part-of-speech tag',
                'flags': ['required'],
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']}  # noqa
+                'labels': [
                    {
                        'id': UUIDnopaque('token.pos={}'.format(label)),
                        'name': label,
                        'description': spacy.explain(label) or ''
                    } for label in spacy.info(model)['labels']['tagger']
                ]
            },
-            'simple_pos': {
+            {
                'id': UUIDnopaque('token.simple_pos'),
                'name': 'simple_pos',
                'description': 'The simple UPOS part-of-speech tag',
                'flags': ['required'],
-                'tagset': {
+                'labels': [
-                    'ADJ': 'adjective',
+                    {
-                    'ADP': 'adposition',
+                        'id': UUIDnopaque('token.simple_pos=ADJ'),
-                    'ADV': 'adverb',
+                        'name': 'ADJ',
-                    'AUX': 'auxiliary verb',
+                        'description': 'adjective'
                    'CONJ': 'coordinating conjunction',
                    'DET': 'determiner',
                    'INTJ': 'interjection',
                    'NOUN': 'noun',
                    'NUM': 'numeral',
                    'PART': 'particle',
                    'PRON': 'pronoun',
                    'PROPN': 'proper noun',
                    'PUNCT': 'punctuation',
                    'SCONJ': 'subordinating conjunction',
                    'SYM': 'symbol',
                    'VERB': 'verb',
                    'X': 'other'
                }
                    },
-            'ner': {
+                    {
-                'description': 'Label indicating the type of the entity',
+                        'id': UUIDnopaque('token.simple_pos=ADJ'),
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
+                        'name': 'ADP',
-            }
+                        'description': 'adposition'
        }
                    },
-    's': {
+                    {
-        'description': 'Encodes the start and end of a sentence',
+                        'id': UUIDnopaque('token.simple_pos=ADJ'),
-        'properties': None
+                        'name': 'ADV',
                        'description': 'adverb'
                    },
-    'ent': {
+                    {
-        'description': 'Encodes the start and end of a named entity',
+                        'id': UUIDnopaque('token.simple_pos=ADJ'),
-        'properties': {
+                        'name': 'AUX',
-            'type': {
+                        'description': 'auxiliary verb'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'CONJ',
                        'description': 'coordinating conjunction'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'DET',
                        'description': 'determiner'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'INTJ',
                        'description': 'interjection'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'NOUN',
                        'description': 'noun'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'NUM',
                        'description': 'numeral'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'PART',
                        'description': 'particle'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'PRON',
                        'description': 'pronoun'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'PROPN',
                        'description': 'proper noun'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'PUNCT',
                        'description': 'punctuation'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'SCONJ',
                        'description': 'subordinating conjunction'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'SYM',
                        'description': 'symbol'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'VERB',
                        'description': 'verb'
                    },
                    {
                        'id': UUIDnopaque('token.simple_pos=ADJ'),
                        'name': 'X',
                        'description': 'other'
                    }
                ]
            },
            {
                'id': UUIDnopaque('token.ner'),
                'name': 'ner',
                'description': 'Label indicating the type of the entity',
                'flags': ['required'],
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
+                'labels': [
-            }
+                    {
                        'id': UUIDnopaque('token.ner={}'.format(label)),
                        'name': label,
                        'description': spacy.explain(label) or ''
                    } for label in spacy.info(model)['labels']['ner']
                ]
            }
        ]
    },
    {
        'id': UUIDnopaque('s'),
        'name': 's',
        'description': 'Encodes the start and end of a sentence',
        'properties': []
    },
    {
        'id': UUIDnopaque('ent'),
        'name': 'ent',
        'description': 'Encodes the start and end of a named entity',
        'properties': [
            {
                'id': UUIDnopaque('ent.type'),
                'name': 'type',
                'description': 'Label indicating the type of the entity',
                'flags': ['required'],
                'labels': [
                    {
                        'id': UUIDnopaque('ent.type={}'.format(label)),
                        'name': label,
                        'description': spacy.explain(label) or ''
                    } for label in spacy.info(model)['labels']['ner']
                ]
            }
        ]
    }
 ]
 annotations = []
@ -142,27 +252,50 @@ while text_chunks:
        if token.is_sent_start:
            annotation = {'start': token.sent.start_char + chunk_offset,
                          'end': token.sent.end_char + chunk_offset,
-                          'tag': 's'}
+                          'tag_id': UUIDnopaque('s'),
                          'properties': []}
            annotations.append(annotation)
        # Check if the token is the start of an entity
        if token.ent_iob == 3:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
-                    annotation = {'start': ent.start_char + chunk_offset,
+                    annotation = {
                        'start': ent.start_char + chunk_offset,
                        'end': ent.end_char + chunk_offset,
-                                  'tag': 'ent',
+                        'tag_id': UUIDnopaque('ent'),
-                                  'properties': {'type': token.ent_type_}}
+                        'properties': [
                            {
                                'property_id': UUIDnopaque('ent.type'),
                                'value': token.ent_type_
                            }
                        ]
                    }
                    annotations.append(annotation)
                    break
-        annotation = {'start': token.idx + chunk_offset,
+        annotation = {
            'start': token.idx + chunk_offset,
            'end': token.idx + len(token.text) + chunk_offset,
-                      'tag': 'token',
+            'tag_id': UUIDnopaque('token'),
-                      'properties': {'pos': token.tag_,
+            'properties': [
-                                     'lemma': token.lemma_,
+                {
-                                     'simple_pos': token.pos_}}
+                   'property_id': UUIDnopaque('token.pos'),
-        if token.ent_type_:
+                   'value': token.tag_
-            annotation['properties']['ner'] = token.ent_type_
+                },
                {
                    'property_id': UUIDnopaque('token.lemma'),
                    'value': token.lemma_
                },
                {
                    'property_id': UUIDnopaque('token.simple_pos'),
                    'value': token.pos_
                },
                {
                    'property_id': UUIDnopaque('token.ner'),
                    'value': token.ent_type_ if token.ent_type_ else 'None'
                }
            ]
        }
        annotations.append(annotation)
    chunk_offset += len(text_chunk)
    text_chunk = None