mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-31 17:22:47 +00:00 
			
		
		
		
	Preliminary work
This commit is contained in:
		
							
								
								
									
										11
									
								
								nlp
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								nlp
									
									
									
									
									
								
							| @@ -37,6 +37,11 @@ class NLPPipelineJob: | ||||
|         self.file = file | ||||
|         self.name = os.path.basename(file).rsplit('.', 1)[0] | ||||
|         self.output_dir = output_dir | ||||
|         catma_stand_off_data_file = file.rsplit('.', 1)[0] + '.catma-stand-off.json'  # noqa | ||||
|         if os.path.exists(catma_stand_off_data_file): | ||||
|             self.catma_stand_off_data_file = catma_stand_off_data_file | ||||
|         else: | ||||
|             self.catma_stand_off_data_file = None | ||||
|  | ||||
|  | ||||
| class NLPPipeline(WorkflowRunner): | ||||
| @@ -93,10 +98,12 @@ class NLPPipeline(WorkflowRunner): | ||||
|         vrt_creation_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa | ||||
|             nlp_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa | ||||
|             nopaque_stand_off_data_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa | ||||
|             cmd = 'vrt-creator' | ||||
|             cmd += ' "{}"'.format(job.file) | ||||
|             cmd += ' "{}"'.format(nlp_file) | ||||
|             cmd += ' "{}"'.format(nopaque_stand_off_data_file) | ||||
|             if job.catma_stand_off_data_file is not None: | ||||
|                 cmd += ' --catma-stand-off-data "{}"'.format(job.catma_stand_off_data_file)  # noqa | ||||
|             cmd += ' "{}"'.format(output_file) | ||||
|             deps = 'nlp_-_{}'.format(i) | ||||
|             lbl = 'vrt_creation_-_{}'.format(i) | ||||
|   | ||||
							
								
								
									
										0
									
								
								packages/stand-off-data-py/setup.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								packages/stand-off-data-py/setup.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										126
									
								
								packages/stand-off-data-py/stand_off_data/models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								packages/stand-off-data-py/stand_off_data/models.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,126 @@ | ||||
| ''' | ||||
|     'generator': { | ||||
|         'name': 'nopaque NLP service', | ||||
|         'version': '1.0.0', | ||||
|         'arguments': { | ||||
|             'check_encoding': args.check_encoding, | ||||
|             'language': args.language | ||||
|         } | ||||
|     }, | ||||
|     'file': { | ||||
|         'encoding': encoding, | ||||
|         'md5': text_md5.hexdigest(), | ||||
|         'name': os.path.basename(args.input) | ||||
|     } | ||||
| ''' | ||||
|  | ||||
|  | ||||
class StandOffData:
    """Tag definitions and tag annotations loaded from one attribute mapping.

    ``attrs`` may carry a ``'tags'`` list (raw tag definitions) and an
    ``'annotations'`` list (raw tag annotations); both default to empty.
    """

    def __init__(self, attrs):
        # Index the tag definitions by id so annotations can resolve them.
        self.tags = {}
        for raw_tag in attrs.get('tags', []):
            definition = TagDefinition(raw_tag)
            self.tags[definition.id] = definition
        # Every annotation receives the id -> definition lookup built above.
        self.annotations = []
        for raw_annotation in attrs.get('annotations', []):
            self.annotations.append(TagAnnotation(raw_annotation, self.tags))
|  | ||||
|  | ||||
class TagAnnotation:
    """A character span (``start`` to ``end``) labelled with a tag.

    Parameters:
        attrs: mapping with the required keys ``'tag_id'``, ``'start'`` and
            ``'end'`` and the optional keys ``'description'`` (default
            ``''``) and ``'properties'`` (list of raw property annotations,
            default empty).
        tag_lookup: mapping of tag id -> tag definition. The definition must
            expose ``name``, ``properties`` and ``required_properties``.

    Raises:
        Exception: if the tag id is unknown, if ``start`` is not lower than
            ``end``, or if a property required by the tag is missing.

    Ordering: annotations are ordered by ``start``; at an equal ``start`` a
    ``'token'`` annotation sorts before any non-token annotation.
    """

    def __init__(self, attrs, tag_lookup):
        self.tag_id = attrs['tag_id']
        self.tag_lookup = tag_lookup
        if self.tag_id not in self.tag_lookup:
            raise Exception('Unknown tag id: {}'.format(self.tag_id))
        self.start = attrs['start']
        self.end = attrs['end']
        if self.start >= self.end:
            raise Exception('start must be lower then end')
        self.description = attrs.get('description', '')
        tag_definition = self.tag_lookup[self.tag_id]
        self.properties = [
            PropertyAnnotation(x, tag_definition.properties)
            for x in attrs.get('properties', [])
        ]
        # Compare required ids against the ids of the given properties; the
        # property objects themselves never compare equal to an id.
        present_property_ids = {
            annotation.property_id for annotation in self.properties
        }
        for required_property_id in tag_definition.required_properties:
            if required_property_id not in present_property_ids:
                raise Exception(
                    'Missing required property: {}'.format(required_property_id)  # noqa
                )

    @property
    def name(self):
        """Name of the tag definition this annotation refers to."""
        return self.tag_lookup[self.tag_id].name

    def __lt__(self, other):
        if self.start == other.start:
            return self.name == 'token' and other.name != 'token'
        else:
            return self.start < other.start

    def __le__(self, other):
        if self.start == other.start:
            return self.name == 'token' or other.name != 'token'
        else:
            return self.start < other.start

    def __eq__(self, other):
        return self.start == other.start and self.name == other.name

    def __ne__(self, other):
        # Must be the exact negation of __eq__: annotations differ as soon as
        # either the start or the name differs.
        return not self.__eq__(other)

    def __gt__(self, other):
        if self.start == other.start:
            return self.name != 'token' and other.name == 'token'
        else:
            return self.start > other.start

    def __ge__(self, other):
        if self.start == other.start:
            return self.name != 'token' or other.name == 'token'
        else:
            return self.start > other.start
|  | ||||
|  | ||||
class PropertyAnnotation:
    """The value assigned to one property of a tag annotation.

    Parameters:
        attrs: mapping with the required keys ``'property_id'`` and
            ``'value'``.
        property_lookup: mapping of property id -> property definition; the
            definition must expose ``name``.

    Raises:
        Exception: if ``attrs['property_id']`` is not in ``property_lookup``.
    """

    def __init__(self, attrs, property_lookup):
        # Read from ``attrs``; ``property`` is the builtin type, not the
        # incoming data, and cannot be subscripted.
        self.property_id = attrs['property_id']
        self.property_lookup = property_lookup
        if self.property_id not in self.property_lookup:
            raise Exception('Unknown property id: {}'.format(self.property_id))
        self.value = attrs['value']
        # TODO: Process attrs['possibleValues'] as self.labels (no id?)

    @property
    def name(self):
        """Name of the property definition this annotation refers to."""
        return self.property_lookup[self.property_id].name
|  | ||||
|  | ||||
class TagDefinition:
    """Definition of a tag and of the properties it may carry.

    Parameters:
        attrs: mapping with the required keys ``'id'`` and ``'name'`` and the
            optional keys ``'description'`` (default ``''``) and
            ``'properties'`` (list of raw property definitions; a missing or
            null entry is treated as empty).
    """

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        # Index the property definitions by id. ``or []`` also covers an
        # explicit null, which would otherwise fail on iteration.
        self.properties = {}
        for raw_property in attrs.get('properties') or []:
            property_definition = PropertyDefinition(raw_property)
            self.properties[property_definition.id] = property_definition

    @property
    def required_properties(self):
        """Mapping of property id -> definition for all required properties."""
        # Iterate the definitions (dict values); iterating the dict itself
        # only yields the ids, which have no ``is_required`` attribute.
        return {
            property_definition.id: property_definition
            for property_definition in self.properties.values()
            if property_definition.is_required
        }
|  | ||||
|  | ||||
class PropertyDefinition:
    """Definition of a single property that a tag may carry.

    Parameters:
        attrs: mapping with the required keys ``'id'`` and ``'name'`` and the
            optional keys ``'description'`` (default ``''``), ``'flags'`` and
            ``'labels'`` (both default to an empty list; an explicit null is
            treated the same as a missing entry).
    """

    def __init__(self, attrs):
        self.id = attrs['id']
        self.name = attrs['name']
        self.description = attrs.get('description', '')
        # ``or []`` keeps the flag checks below working when the data holds
        # an explicit null instead of a list.
        self.flags = attrs.get('flags') or []
        self.labels = attrs.get('labels') or []

    @property
    def is_required(self):
        """True if the ``'required'`` flag is set."""
        return 'required' in self.flags

    @property
    def has_multiple_values(self):
        """True if the ``'multiple'`` flag is set."""
        return 'multiple' in self.flags
							
								
								
									
										47
									
								
								packages/stand-off-data-py/stand_off_data/utils.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										47
									
								
								packages/stand-off-data-py/stand_off_data/utils.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,47 @@ | ||||
def create_vrt(text, stand_off_data):
    """Prepare stand-off annotations for CWB's verticalized text format.

    Annotations named ``'token'`` are treated as positional attributes, all
    others as structural attributes. Positional attributes must not overlap.
    Structural attributes that start or end inside a token are widened, in
    place, to that token's boundaries. End offsets are treated as exclusive,
    so a span ending where the next one starts does not overlap it.

    Parameters:
        text: the annotated text (not used yet).
        stand_off_data: object whose ``annotations`` is an iterable of
            sortable annotations exposing ``name``, ``start`` and ``end``.

    Raises:
        Exception: if two positional attributes overlap.

    NOTE: the actual .vrt rendering is not implemented yet, so nothing is
    returned.
    """
    # Divide annotations into CWB's verticalized text format (.vrt) logic
    p_attrs = []    # positional attributes
    s_attrs = []    # structural attributes
    for annotation in stand_off_data.annotations:
        if annotation.name == 'token':
            p_attrs.append(annotation)
        else:
            s_attrs.append(annotation)
    # Sort annotations, necessary for the next checks
    p_attrs.sort()
    s_attrs.sort()
    # Check for p_attr<->p_attr overlap. The list is sorted by start, so two
    # neighbours overlap exactly when the first one ends after the next one
    # starts; merely touching tokens (end == next start) are fine.
    for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
        if p_attr.end > next_p_attr.start:
            raise Exception('Positional attribute overlaps another')
    # Check for s_attr<->p_attr overlap
    for s_attr in s_attrs:
        for p_attr in p_attrs:
            if p_attr.end <= s_attr.start:
                # Token lies completely before the s_attr; later tokens may
                # still touch it, so keep looking.
                continue
            if p_attr.start >= s_attr.end:
                # Token lies completely after the s_attr; p_attrs are sorted,
                # so no later token can touch it either.
                break
            if p_attr.start < s_attr.start < p_attr.end:
                # s_attr starts within the token: move its start to the
                # token's start
                s_attr.start = p_attr.start
            if p_attr.start < s_attr.end < p_attr.end:
                # s_attr ends within the token: move its end to the token's
                # end
                s_attr.end = p_attr.end
    # Group the s_attr indices by the offsets at which they open and close.
    s_attr_start_buffer = {}
    s_attr_end_buffer = {}
    for i, s_attr in enumerate(s_attrs):
        s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
        s_attr_end_buffer.setdefault(s_attr.end, []).append(i)
    vrt = ''
    # TODO do the work!
							
								
								
									
										251
									
								
								spacy-nlp
									
									
									
									
									
								
							
							
						
						
									
										251
									
								
								spacy-nlp
									
									
									
									
									
								
							| @@ -8,6 +8,14 @@ import json | ||||
| import os | ||||
| import spacy | ||||
| import textwrap | ||||
| import uuid | ||||
|  | ||||
|  | ||||
def UUIDnopaque(name):
    """Return a deterministic, ``nopaque_``-prefixed identifier for ``name``.

    The identifier is a name-based UUID (``uuid.uuid3``) in the DNS
    namespace, computed from ``name`` qualified with the nopaque host name,
    so the same ``name`` always yields the same identifier.
    """
    qualified_name = '{}@nopaque.sfb1288.uni-bielefeld.de'.format(name)
    name_based_uuid = uuid.uuid3(uuid.NAMESPACE_DNS, qualified_name)
    return 'nopaque_{}'.format(name_based_uuid)
|  | ||||
|  | ||||
| spacy_models = {spacy.info(pipeline)['lang']: pipeline | ||||
| @@ -70,65 +78,167 @@ meta = { | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| tags = { | ||||
|     'token': { | ||||
|         'description': '', | ||||
|         'properties': { | ||||
|             'lemma': { | ||||
| tags = [ | ||||
|     { | ||||
|         'id': UUIDnopaque('token'), | ||||
|         'name': 'token', | ||||
|         'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.', | ||||
|         'properties': [ | ||||
|             { | ||||
|                 'id': UUIDnopaque('token.lemma'), | ||||
|                 'name': 'lemma', | ||||
|                 'description': 'The base form of the word', | ||||
|                 'flags': ['required'], | ||||
|                 'tagset': None | ||||
|                 'labels': [] | ||||
|             }, | ||||
|             'pos': { | ||||
|             { | ||||
|                 'id': UUIDnopaque('token.pos'), | ||||
|                 'name': 'pos', | ||||
|                 'description': 'The detailed part-of-speech tag', | ||||
|                 'flags': ['required'], | ||||
|                 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']}  # noqa | ||||
|                 'labels': [ | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.pos={}'.format(label)), | ||||
|                         'name': label, | ||||
|                         'description': spacy.explain(label) or '' | ||||
|                     } for label in spacy.info(model)['labels']['tagger'] | ||||
|                 ] | ||||
|             }, | ||||
|             'simple_pos': { | ||||
|             { | ||||
|                 'id': UUIDnopaque('token.simple_pos'), | ||||
|                 'name': 'simple_pos', | ||||
|                 'description': 'The simple UPOS part-of-speech tag', | ||||
|                 'flags': ['required'], | ||||
|                 'tagset': { | ||||
|                     'ADJ': 'adjective', | ||||
|                     'ADP': 'adposition', | ||||
|                     'ADV': 'adverb', | ||||
|                     'AUX': 'auxiliary verb', | ||||
|                     'CONJ': 'coordinating conjunction', | ||||
|                     'DET': 'determiner', | ||||
|                     'INTJ': 'interjection', | ||||
|                     'NOUN': 'noun', | ||||
|                     'NUM': 'numeral', | ||||
|                     'PART': 'particle', | ||||
|                     'PRON': 'pronoun', | ||||
|                     'PROPN': 'proper noun', | ||||
|                     'PUNCT': 'punctuation', | ||||
|                     'SCONJ': 'subordinating conjunction', | ||||
|                     'SYM': 'symbol', | ||||
|                     'VERB': 'verb', | ||||
|                     'X': 'other' | ||||
|                 } | ||||
|                 'labels': [ | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'ADJ', | ||||
|                         'description': 'adjective' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'ADP', | ||||
|                         'description': 'adposition' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'ADV', | ||||
|                         'description': 'adverb' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'AUX', | ||||
|                         'description': 'auxiliary verb' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'CONJ', | ||||
|                         'description': 'coordinating conjunction' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'DET', | ||||
|                         'description': 'determiner' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'INTJ', | ||||
|                         'description': 'interjection' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'NOUN', | ||||
|                         'description': 'noun' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'NUM', | ||||
|                         'description': 'numeral' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'PART', | ||||
|                         'description': 'particle' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'PRON', | ||||
|                         'description': 'pronoun' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'PROPN', | ||||
|                         'description': 'proper noun' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'PUNCT', | ||||
|                         'description': 'punctuation' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'SCONJ', | ||||
|                         'description': 'subordinating conjunction' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'SYM', | ||||
|                         'description': 'symbol' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'VERB', | ||||
|                         'description': 'verb' | ||||
|                     }, | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                         'name': 'X', | ||||
|                         'description': 'other' | ||||
|                     } | ||||
|                 ] | ||||
|             }, | ||||
|             'ner': { | ||||
|                 'description': 'Label indicating the type of the entity', | ||||
|                 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa | ||||
|             } | ||||
|         } | ||||
|     }, | ||||
|     's': { | ||||
|         'description': 'Encodes the start and end of a sentence', | ||||
|         'properties': None | ||||
|     }, | ||||
|     'ent': { | ||||
|         'description': 'Encodes the start and end of a named entity', | ||||
|         'properties': { | ||||
|             'type': { | ||||
|             { | ||||
|                 'id': UUIDnopaque('token.ner'), | ||||
|                 'name': 'ner', | ||||
|                 'description': 'Label indicating the type of the entity', | ||||
|                 'flags': ['required'], | ||||
|                 'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa | ||||
|                 'labels': [ | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.ner={}'.format(label)), | ||||
|                         'name': label, | ||||
|                         'description': spacy.explain(label) or '' | ||||
|                     } for label in spacy.info(model)['labels']['ner'] | ||||
|                 ] | ||||
|             } | ||||
|         } | ||||
|         ] | ||||
|     }, | ||||
|     { | ||||
|         'id': UUIDnopaque('s'), | ||||
|         'name': 's', | ||||
|         'description': 'Encodes the start and end of a sentence', | ||||
|         'properties': [] | ||||
|     }, | ||||
|     { | ||||
|         'id': UUIDnopaque('ent'), | ||||
|         'name': 'ent', | ||||
|         'description': 'Encodes the start and end of a named entity', | ||||
|         'properties': [ | ||||
|             { | ||||
|                 'id': UUIDnopaque('ent.type'), | ||||
|                 'name': 'type', | ||||
|                 'description': 'Label indicating the type of the entity', | ||||
|                 'flags': ['required'], | ||||
|                 'labels': [ | ||||
|                     { | ||||
|                         'id': UUIDnopaque('ent.type={}'.format(label)), | ||||
|                         'name': label, | ||||
|                         'description': spacy.explain(label) or '' | ||||
|                     } for label in spacy.info(model)['labels']['ner'] | ||||
|                 ] | ||||
|             } | ||||
|         ] | ||||
|     } | ||||
| } | ||||
| ] | ||||
|  | ||||
| annotations = [] | ||||
|  | ||||
| @@ -142,27 +252,50 @@ while text_chunks: | ||||
|         if token.is_sent_start: | ||||
|             annotation = {'start': token.sent.start_char + chunk_offset, | ||||
|                           'end': token.sent.end_char + chunk_offset, | ||||
|                           'tag': 's'} | ||||
|                           'tag_id': UUIDnopaque('s'), | ||||
|                           'properties': []} | ||||
|             annotations.append(annotation) | ||||
|         # Check if the token is the start of an entity | ||||
|         if token.ent_iob == 3: | ||||
|             for ent_candidate in token.sent.ents: | ||||
|                 if ent_candidate.start_char == token.idx: | ||||
|                     ent = ent_candidate | ||||
|                     annotation = {'start': ent.start_char + chunk_offset, | ||||
|                                   'end': ent.end_char + chunk_offset, | ||||
|                                   'tag': 'ent', | ||||
|                                   'properties': {'type': token.ent_type_}} | ||||
|                     annotation = { | ||||
|                         'start': ent.start_char + chunk_offset, | ||||
|                         'end': ent.end_char + chunk_offset, | ||||
|                         'tag_id': UUIDnopaque('ent'), | ||||
|                         'properties': [ | ||||
|                             { | ||||
|                                 'property_id': UUIDnopaque('ent.type'), | ||||
|                                 'value': token.ent_type_ | ||||
|                             } | ||||
|                         ] | ||||
|                     } | ||||
|                     annotations.append(annotation) | ||||
|                     break | ||||
|         annotation = {'start': token.idx + chunk_offset, | ||||
|                       'end': token.idx + len(token.text) + chunk_offset, | ||||
|                       'tag': 'token', | ||||
|                       'properties': {'pos': token.tag_, | ||||
|                                      'lemma': token.lemma_, | ||||
|                                      'simple_pos': token.pos_}} | ||||
|         if token.ent_type_: | ||||
|             annotation['properties']['ner'] = token.ent_type_ | ||||
|         annotation = { | ||||
|             'start': token.idx + chunk_offset, | ||||
|             'end': token.idx + len(token.text) + chunk_offset, | ||||
|             'tag_id': UUIDnopaque('token'), | ||||
|             'properties': [ | ||||
|                 { | ||||
|                    'property_id': UUIDnopaque('token.pos'), | ||||
|                    'value': token.tag_ | ||||
|                 }, | ||||
|                 { | ||||
|                     'property_id': UUIDnopaque('token.lemma'), | ||||
|                     'value': token.lemma_ | ||||
|                 }, | ||||
|                 { | ||||
|                     'property_id': UUIDnopaque('token.simple_pos'), | ||||
|                     'value': token.pos_ | ||||
|                 }, | ||||
|                 { | ||||
|                     'property_id': UUIDnopaque('token.ner'), | ||||
|                     'value': token.ent_type_ if token.ent_type_ else 'None' | ||||
|                 } | ||||
|             ] | ||||
|         } | ||||
|         annotations.append(annotation) | ||||
|     chunk_offset += len(text_chunk) | ||||
|     text_chunk = None | ||||
|   | ||||
		Reference in New Issue
	
	Block a user