From 4dea95a1089d61862a8bb062276a33129f5cbe74 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Tue, 13 Jul 2021 16:31:53 +0200
Subject: [PATCH] Preliminary work

---
 nlp                                           |  11 +-
 packages/stand-off-data-py/setup.py           |   0
 .../stand_off_data/__init__.py                |   0
 .../stand_off_data/models.py                  | 126 +++++++++
 .../stand-off-data-py/stand_off_data/utils.py |  47 ++++
 spacy-nlp                                     | 251 ++++++++++++++----
 6 files changed, 374 insertions(+), 61 deletions(-)
 create mode 100644 packages/stand-off-data-py/setup.py
 create mode 100644 packages/stand-off-data-py/stand_off_data/__init__.py
 create mode 100644 packages/stand-off-data-py/stand_off_data/models.py
 create mode 100644 packages/stand-off-data-py/stand_off_data/utils.py

diff --git a/nlp b/nlp
index 031126f..3ffefab 100755
--- a/nlp
+++ b/nlp
@@ -37,6 +37,11 @@ class NLPPipelineJob:
         self.file = file
         self.name = os.path.basename(file).rsplit('.', 1)[0]
         self.output_dir = output_dir
+        catma_stand_off_data_file = file.rsplit('.', 1)[0] + '.catma-stand-off.json'  # noqa
+        if os.path.exists(catma_stand_off_data_file):
+            self.catma_stand_off_data_file = catma_stand_off_data_file
+        else:
+            self.catma_stand_off_data_file = None
 
 
 class NLPPipeline(WorkflowRunner):
@@ -93,10 +98,12 @@ class NLPPipeline(WorkflowRunner):
         vrt_creation_tasks = []
         for i, job in enumerate(self.jobs):
             output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa
-            nlp_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
+            nopaque_stand_off_data_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa
             cmd = 'vrt-creator'
             cmd += ' "{}"'.format(job.file)
-            cmd += ' "{}"'.format(nlp_file)
+            cmd += ' "{}"'.format(nopaque_stand_off_data_file)
+            if job.catma_stand_off_data_file is not None:
+                cmd += ' --catma-stand-off-data "{}"'.format(job.catma_stand_off_data_file)  # noqa
             cmd += ' "{}"'.format(output_file)
             deps = 'nlp_-_{}'.format(i)
             lbl = 'vrt_creation_-_{}'.format(i)
diff --git a/packages/stand-off-data-py/setup.py b/packages/stand-off-data-py/setup.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/stand-off-data-py/stand_off_data/__init__.py b/packages/stand-off-data-py/stand_off_data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/stand-off-data-py/stand_off_data/models.py b/packages/stand-off-data-py/stand_off_data/models.py
new file mode 100644
index 0000000..5d93aad
--- /dev/null
+++ b/packages/stand-off-data-py/stand_off_data/models.py
@@ -0,0 +1,126 @@
+'''
+Example of the metadata emitted by the nopaque NLP service alongside the
+stand-off data this module parses:
+
+    'generator': {
+        'name': 'nopaque NLP service',
+        'version': '1.0.0',
+        'arguments': {
+            'check_encoding': args.check_encoding,
+            'language': args.language
+        }
+    },
+    'file': {
+        'encoding': encoding,
+        'md5': text_md5.hexdigest(),
+        'name': os.path.basename(args.input)
+    }
+'''
+
+
+class StandOffData:
+    def __init__(self, attrs):
+        self.tags = {
+            tag_definition.id: tag_definition
+            for tag_definition in [TagDefinition(x) for x in attrs.get('tags', [])]  # noqa
+        }
+        self.annotations = [
+            TagAnnotation(x, self.tags) for x in attrs.get('annotations', [])
+        ]
+
+
+class TagAnnotation:
+    def __init__(self, attrs, tag_lookup):
+        self.tag_id = attrs['tag_id']
+        self.tag_lookup = tag_lookup
+        if self.tag_id not in self.tag_lookup:
+            raise Exception('Unknown tag id: {}'.format(self.tag_id))
+        self.start = attrs['start']
+        self.end = attrs['end']
+        if self.start >= self.end:
+            raise Exception('start must be lower than end')
+        self.description = attrs.get('description', '')
+        self.properties = [
+            PropertyAnnotation(x, self.tag_lookup[self.tag_id].properties)
+            for x in attrs.get('properties', [])
+        ]
+        property_ids = [x.property_id for x in self.properties]
+        for required_property_id in self.tag_lookup[self.tag_id].required_properties:  # noqa
+            if required_property_id not in property_ids:
+                raise Exception(
+                    'Missing required property: {}'.format(required_property_id)  # noqa
+                )
+
+    @property
+    def name(self):
+        return self.tag_lookup[self.tag_id].name
+
+    def __lt__(self, other):
+        if self.start == other.start:
+            return self.name == 'token' and other.name != 'token'
+        else:
+            return self.start < other.start
+
+    def __le__(self, other):
+        if self.start == other.start:
+            return self.name == 'token' or other.name != 'token'
+        else:
+            return self.start < other.start
+
+    def __eq__(self, other):
+        return self.start == other.start and self.name == other.name
+
+    def __ne__(self, other):
+        return self.start != other.start or self.name != other.name
+
+    def __gt__(self, other):
+        if self.start == other.start:
+            return self.name != 'token' and other.name == 'token'
+        else:
+            return self.start > other.start
+
+    def __ge__(self, other):
+        if self.start == other.start:
+            return self.name != 'token' or other.name == 'token'
+        else:
+            return self.start > other.start
+
+
+class PropertyAnnotation:
+    def __init__(self, attrs, property_lookup):
+        self.property_id = attrs['property_id']
+        self.property_lookup = property_lookup
+        if self.property_id not in self.property_lookup:
+            raise Exception('Unknown property id: {}'.format(self.property_id))  # noqa
+        self.value = attrs['value']
+        # TODO: Process attrs['possibleValues'] as self.labels (no id?)
+
+    @property
+    def name(self):
+        return self.property_lookup[self.property_id].name
+
+
+class TagDefinition:
+    def __init__(self, attrs):
+        self.id = attrs['id']
+        self.name = attrs['name']
+        self.description = attrs.get('description', '')
+        self.properties = {
+            property_definition.id: property_definition
+            for property_definition in [
+                PropertyDefinition(x) for x in attrs.get('properties', [])
+            ]
+        }
+
+    @property
+    def required_properties(self):
+        return {x.id: x for x in self.properties.values() if x.is_required}
+
+
+class PropertyDefinition:
+    def __init__(self, attrs):
+        self.id = attrs['id']
+        self.name = attrs['name']
+        self.description = attrs.get('description', '')
+        self.flags = attrs.get('flags', [])
+        self.labels = attrs.get('labels', [])
+
+    @property
+    def is_required(self):
+        return 'required' in self.flags
+
+    @property
+    def has_multiple_values(self):
+        return 'multiple' in self.flags
diff --git a/packages/stand-off-data-py/stand_off_data/utils.py b/packages/stand-off-data-py/stand_off_data/utils.py
new file mode 100644
index 0000000..b62fdd5
--- /dev/null
+++ b/packages/stand-off-data-py/stand_off_data/utils.py
@@ -0,0 +1,47 @@
+def create_vrt(text, stand_off_data):
+    # Divide the annotations into positional and structural attributes,
+    # following CWB's verticalized text format (.vrt) logic
+    p_attrs = []  # positional attributes (tokens)
+    s_attrs = []  # structural attributes (sentences, entities, ...)
+    for annotation in stand_off_data.annotations:
+        if annotation.name == 'token':
+            p_attrs.append(annotation)
+        else:
+            s_attrs.append(annotation)
+    # Sort annotations, necessary for the next checks
+    p_attrs.sort()
+    s_attrs.sort()
+    # Check for p_attr<->p_attr overlap
+    for i, p_attr in enumerate(p_attrs[:-1]):
+        next_p_attr = p_attrs[i + 1]
+        # Ends are exclusive, so two sorted tokens overlap only if the
+        # first one ends after the second one starts
+        if p_attr.end > next_p_attr.start:
+            raise Exception('Positional attribute overlaps another')
+    # Check for s_attr<->p_attr overlap
+    for i, s_attr in enumerate(s_attrs):
+        for p_attr in p_attrs:
+            # p_attrs are sorted, so once a token starts at or after the
+            # s_attr's end, no later token can overlap it
+            if p_attr.start >= s_attr.end:
+                break
+            # Tokens that end before the s_attr starts cannot overlap it
+            if p_attr.end <= s_attr.start:
+                continue
+            # Check if s_attr starts within p_attr
+            if p_attr.start < s_attr.start < p_attr.end:
+                # Snap s_attr's start to p_attr's start
+                s_attrs[i].start = p_attr.start
+            # Check if s_attr ends within p_attr
+            if p_attr.start < s_attr.end < p_attr.end:
+                # Snap s_attr's end to p_attr's end
+                s_attrs[i].end = p_attr.end
+    s_attr_start_buffer = {}
+    s_attr_end_buffer = {}
+    for i, s_attr in enumerate(s_attrs):
+        if s_attr.start in s_attr_start_buffer:
+            s_attr_start_buffer[s_attr.start].append(i)
+        else:
+            s_attr_start_buffer[s_attr.start] = [i]
+        if s_attr.end in s_attr_end_buffer:
+            s_attr_end_buffer[s_attr.end].append(i)
+        else:
+            s_attr_end_buffer[s_attr.end] = [i]
+    vrt = ''
+    # TODO do the work!
diff --git a/spacy-nlp b/spacy-nlp
index d55bfa0..be4dfa3 100755
--- a/spacy-nlp
+++ b/spacy-nlp
@@ -8,6 +8,14 @@ import json
 import os
 import spacy
 import textwrap
+import uuid
+
+
+def UUIDnopaque(name):
+    return 'nopaque_{}'.format(
+        uuid.uuid3(uuid.NAMESPACE_DNS,
+                   '{}@nopaque.sfb1288.uni-bielefeld.de'.format(name))
+    )
 
 
 spacy_models = {spacy.info(pipeline)['lang']: pipeline
@@ -70,65 +78,167 @@ meta = {
     }
 }
 
-
-tags = {
-    'token': {
-        'description': '',
-        'properties': {
-            'lemma': {
+tags = [
+    {
+        'id': UUIDnopaque('token'),
+        'name': 'token',
+        'description': 'An individual token, i.e. a word, punctuation symbol, whitespace, etc.',  # noqa
+        'properties': [
+            {
+                'id': UUIDnopaque('token.lemma'),
+                'name': 'lemma',
                 'description': 'The base form of the word',
                 'flags': ['required'],
-                'tagset': None
+                'labels': []
             },
-            'pos': {
+            {
+                'id': UUIDnopaque('token.pos'),
+                'name': 'pos',
                 'description': 'The detailed part-of-speech tag',
                 'flags': ['required'],
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']}  # noqa
+                'labels': [
+                    {
+                        'id': UUIDnopaque('token.pos={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['tagger']
+                ]
             },
-            'simple_pos': {
+            {
+                'id': UUIDnopaque('token.simple_pos'),
+                'name': 'simple_pos',
                 'description': 'The simple UPOS part-of-speech tag',
                 'flags': ['required'],
-                'tagset': {
-                    'ADJ': 'adjective',
-                    'ADP': 'adposition',
-                    'ADV': 'adverb',
-                    'AUX': 'auxiliary verb',
-                    'CONJ': 'coordinating conjunction',
-                    'DET': 'determiner',
-                    'INTJ': 'interjection',
-                    'NOUN': 'noun',
-                    'NUM': 'numeral',
-                    'PART': 'particle',
-                    'PRON': 'pronoun',
-                    'PROPN': 'proper noun',
-                    'PUNCT': 'punctuation',
-                    'SCONJ': 'subordinating conjunction',
-                    'SYM': 'symbol',
-                    'VERB': 'verb',
-                    'X': 'other'
-                }
+                'labels': [
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADJ'),
+                        'name': 'ADJ',
+                        'description': 'adjective'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADP'),
+                        'name': 'ADP',
+                        'description': 'adposition'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=ADV'),
+                        'name': 'ADV',
+                        'description': 'adverb'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=AUX'),
+                        'name': 'AUX',
+                        'description': 'auxiliary verb'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=CONJ'),
+                        'name': 'CONJ',
+                        'description': 'coordinating conjunction'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=DET'),
+                        'name': 'DET',
+                        'description': 'determiner'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=INTJ'),
+                        'name': 'INTJ',
+                        'description': 'interjection'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=NOUN'),
+                        'name': 'NOUN',
+                        'description': 'noun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=NUM'),
+                        'name': 'NUM',
+                        'description': 'numeral'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PART'),
+                        'name': 'PART',
+                        'description': 'particle'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PRON'),
+                        'name': 'PRON',
+                        'description': 'pronoun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PROPN'),
+                        'name': 'PROPN',
+                        'description': 'proper noun'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=PUNCT'),
+                        'name': 'PUNCT',
+                        'description': 'punctuation'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=SCONJ'),
+                        'name': 'SCONJ',
+                        'description': 'subordinating conjunction'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=SYM'),
+                        'name': 'SYM',
+                        'description': 'symbol'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=VERB'),
+                        'name': 'VERB',
+                        'description': 'verb'
+                    },
+                    {
+                        'id': UUIDnopaque('token.simple_pos=X'),
+                        'name': 'X',
+                        'description': 'other'
+                    }
+                ]
             },
-            'ner': {
-                'description': 'Label indicating the type of the entity',
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
-            }
-        }
-    },
-    's': {
-        'description': 'Encodes the start and end of a sentence',
-        'properties': None
-    },
-    'ent': {
-        'description': 'Encodes the start and end of a named entity',
-        'properties': {
-            'type': {
+            {
+                'id': UUIDnopaque('token.ner'),
+                'name': 'ner',
                 'description': 'Label indicating the type of the entity',
                 'flags': ['required'],
-                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
+                'labels': [
+                    {
+                        'id': UUIDnopaque('token.ner={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['ner']
+                ]
             }
-        }
+        ]
+    },
+    {
+        'id': UUIDnopaque('s'),
+        'name': 's',
+        'description': 'Encodes the start and end of a sentence',
+        'properties': []
+    },
+    {
+        'id': UUIDnopaque('ent'),
+        'name': 'ent',
+        'description': 'Encodes the start and end of a named entity',
+        'properties': [
+            {
+                'id': UUIDnopaque('ent.type'),
+                'name': 'type',
+                'description': 'Label indicating the type of the entity',
+                'flags': ['required'],
+                'labels': [
+                    {
+                        'id': UUIDnopaque('ent.type={}'.format(label)),
+                        'name': label,
+                        'description': spacy.explain(label) or ''
+                    } for label in spacy.info(model)['labels']['ner']
+                ]
+            }
+        ]
     }
-}
+]
 
 annotations = []
@@ -142,27 +252,50 @@ while text_chunks:
         if token.is_sent_start:
             annotation = {'start': token.sent.start_char + chunk_offset,
                           'end': token.sent.end_char + chunk_offset,
-                          'tag': 's'}
+                          'tag_id': UUIDnopaque('s'),
+                          'properties': []}
             annotations.append(annotation)
         # Check if the token is the start of an entity
         if token.ent_iob == 3:
             for ent_candidate in token.sent.ents:
                 if ent_candidate.start_char == token.idx:
                     ent = ent_candidate
-                    annotation = {'start': ent.start_char + chunk_offset,
-                                  'end': ent.end_char + chunk_offset,
-                                  'tag': 'ent',
-                                  'properties': {'type': token.ent_type_}}
+                    annotation = {
+                        'start': ent.start_char + chunk_offset,
+                        'end': ent.end_char + chunk_offset,
+                        'tag_id': UUIDnopaque('ent'),
+                        'properties': [
+                            {
+                                'property_id': UUIDnopaque('ent.type'),
+                                'value': token.ent_type_
+                            }
+                        ]
+                    }
                     annotations.append(annotation)
                     break
-        annotation = {'start': token.idx + chunk_offset,
-                      'end': token.idx + len(token.text) + chunk_offset,
-                      'tag': 'token',
-                      'properties': {'pos': token.tag_,
-                                     'lemma': token.lemma_,
-                                     'simple_pos': token.pos_}}
-        if token.ent_type_:
-            annotation['properties']['ner'] = token.ent_type_
+        annotation = {
+            'start': token.idx + chunk_offset,
+            'end': token.idx + len(token.text) + chunk_offset,
+            'tag_id': UUIDnopaque('token'),
+            'properties': [
+                {
+                    'property_id': UUIDnopaque('token.pos'),
+                    'value': token.tag_
+                },
+                {
+                    'property_id': UUIDnopaque('token.lemma'),
+                    'value': token.lemma_
+                },
+                {
+                    'property_id': UUIDnopaque('token.simple_pos'),
+                    'value': token.pos_
+                },
+                {
+                    'property_id': UUIDnopaque('token.ner'),
+                    'value': token.ent_type_ if token.ent_type_ else 'None'
+                }
+            ]
+        }
         annotations.append(annotation)
         chunk_offset += len(text_chunk)
         text_chunk = None
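
Below is a minimal, self-contained usage sketch (not part of the patch) of how
the new stand_off_data classes fit together. The string ids are ad-hoc
stand-ins for the UUIDnopaque values that spacy-nlp emits, and create_vrt is
still a stub at this point, so the sketch only exercises parsing, validation,
and sorting:

    from stand_off_data.models import StandOffData

    stand_off_data = StandOffData({
        'tags': [
            {
                'id': 'tag-token',  # stand-in for UUIDnopaque('token')
                'name': 'token',
                'description': 'An individual token',
                'properties': [
                    {
                        'id': 'prop-lemma',  # stand-in for UUIDnopaque('token.lemma')
                        'name': 'lemma',
                        'description': 'The base form of the word',
                        'flags': ['required'],
                        'labels': []
                    }
                ]
            },
            {
                'id': 'tag-s',  # stand-in for UUIDnopaque('s')
                'name': 's',
                'description': 'Encodes the start and end of a sentence',
                'properties': []
            }
        ],
        'annotations': [
            # A sentence and a token sharing the same start offset
            {'start': 0, 'end': 5, 'tag_id': 'tag-s', 'properties': []},
            {
                'start': 0,
                'end': 5,
                'tag_id': 'tag-token',
                'properties': [
                    {'property_id': 'prop-lemma', 'value': 'hello'}
                ]
            }
        ]
    })

    # Tokens sort before structural attributes that share a start offset,
    # which is the ordering create_vrt relies on
    for annotation in sorted(stand_off_data.annotations):
        print(annotation.name, annotation.start, annotation.end)
    # token 0 5
    # s 0 5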