mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-31 17:32:45 +00:00 
			
		
		
		
	WIP use the new package
This commit is contained in:
		
							
								
								
									
										80
									
								
								packages/stand-off-data-py/stand_off_data/export.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								packages/stand-off-data-py/stand_off_data/export.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | ||||
| from xml.sax.saxutils import escape | ||||
|  | ||||
|  | ||||
class ExportMixin:
    """Mixin that adds export formats to a stand-off annotation container.

    The host class must provide ``self.annotations``: an iterable of objects
    that can be ordered against each other (they are sorted with
    ``list.sort()``) and that carry ``name``, ``start``, ``end`` and
    ``properties`` attributes. Each entry of ``properties`` must have a
    ``name`` and a ``value``.
    """

    @staticmethod
    def _start_tag(s_attr):
        """Return the opening tag line for a structural attribute."""
        attributes = ''.join(
            # '"' has to be escaped as well, the value is double-quoted
            ' {}="{}"'.format(escape(prop.name),
                              escape(prop.value, {'"': '&quot;'}))
            for prop in s_attr.properties
        )
        return '<{}{}>\n'.format(escape(s_attr.name), attributes)

    @staticmethod
    def _end_tags(open_s_attrs, position):
        """Close every open structural attribute ending at/before position.

        The closed attributes are removed from ``open_s_attrs`` in place.
        The returned end tag lines are in reverse opening order, so that
        elements opened last are closed first.
        """
        finished = [s_attr for s_attr in open_s_attrs
                    if s_attr.end <= position]
        open_s_attrs[:] = [s_attr for s_attr in open_s_attrs
                           if s_attr.end > position]
        return ['</{}>\n'.format(escape(s_attr.name))
                for s_attr in reversed(finished)]

    def to_vrt(self, text):
        """Export the annotations as CWB verticalized text (.vrt).

        Annotations named ``token`` become positional attributes (one line
        per token: word, pos, lemma, simple_pos, ner separated by tabs, with
        ``None`` for missing properties). All other annotations become
        structural attributes, i.e. XML-like tags around the tokens they
        cover.

        Structural attributes that start or end inside a token are widened
        to the token boundaries; this modifies the annotation objects in
        place. Structural attributes that start after the last token are not
        emitted.

        Args:
            text: The annotated text; ``start``/``end`` are used as slice
                offsets into it.

        Returns:
            The vrt document as a string, wrapped in ``<text>...</text>``.

        Raises:
            Exception: If two tokens overlap.
        """
        # Divide the annotations following CWB's vrt logic
        p_attrs = []    # positional attributes
        s_attrs = []    # structural attributes
        for annotation in self.annotations:
            if annotation.name == 'token':
                p_attrs.append(annotation)
            else:
                s_attrs.append(annotation)
        # Sort annotations, necessary for the next checks
        p_attrs.sort()
        s_attrs.sort()
        # Check for p_attr<->p_attr overlap. The end offset is exclusive
        # (it is used as a slice bound), so tokens that merely touch do not
        # overlap.
        for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
            if (p_attr.start < next_p_attr.end
                    and next_p_attr.start < p_attr.end):
                raise Exception('Positional attribute overlaps another')
        # Check for s_attr<->p_attr overlap
        for s_attr in s_attrs:
            for p_attr in p_attrs:
                if p_attr.end <= s_attr.start:
                    # Token lies before the s_attr, look at the next one
                    continue
                if p_attr.start >= s_attr.end:
                    # Token lies behind the s_attr. No further checking
                    # needed (just because p_attrs are sorted)
                    break
                if p_attr.start < s_attr.start:
                    # s_attr starts within p_attr: move it to p_attr's start
                    s_attr.start = p_attr.start
                if s_attr.end < p_attr.end:
                    # s_attr ends within p_attr: move it to p_attr's end
                    s_attr.end = p_attr.end
        # The snapping above may have changed start offsets. The sort is
        # stable, so s_attrs with equal start keep their previous order.
        pending = sorted(s_attrs, key=lambda s_attr: s_attr.start)
        next_pending = 0
        open_s_attrs = []
        lines = ['<text>\n']
        for p_attr in p_attrs:
            # End tags first: everything that ended before this token must
            # be closed before new elements are opened
            lines.extend(self._end_tags(open_s_attrs, p_attr.start))
            while (next_pending < len(pending)
                   and pending[next_pending].start <= p_attr.start):
                s_attr = pending[next_pending]
                next_pending += 1
                lines.append(self._start_tag(s_attr))
                open_s_attrs.append(s_attr)
            # s_attrs that do not cover any token are closed right away
            lines.extend(self._end_tags(open_s_attrs, p_attr.start))
            columns = {'lemma': None, 'ner': None, 'pos': None,
                       'simple_pos': None}
            for prop in p_attr.properties:
                if prop.name in columns:
                    columns[prop.name] = escape(prop.value)
            columns['word'] = escape(text[p_attr.start:p_attr.end])
            lines.append(
                '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(
                    **columns)
            )
        # Close whatever is still open behind the last token
        lines.extend(self._end_tags(open_s_attrs, float('inf')))
        lines.append('</text>\n')
        return ''.join(lines)
| @@ -1,4 +1,7 @@ | ||||
| class StandOffData: | ||||
| from .export import ExportMixin | ||||
|  | ||||
|  | ||||
| class StandOffData(ExportMixin): | ||||
|     def __init__(self, attrs): | ||||
|         self.meta = attrs.get('meta', {}) | ||||
|         self.lookup = {tag_definition.id: tag_definition for tag_definition in | ||||
| @@ -69,7 +72,7 @@ class PropertyAnnotation: | ||||
|         self.tag_id = attrs['tag_id'] | ||||
|         if self.property_id not in self.lookup[self.tag_id].properties: | ||||
|             raise Exception('Unknown property id: {}'.format(self.property_id)) | ||||
|         self.value = property['value'] | ||||
|         self.value = attrs['value'] | ||||
|         # TODO: Process attrs['possibleValues'] as self.labels (no id?) | ||||
|  | ||||
|     @property | ||||
| @@ -91,7 +94,7 @@ class TagDefinition: | ||||
|  | ||||
|     @property | ||||
|     def required_properties(self): | ||||
|         return {property.id: property for property in self.properties | ||||
|         return {property.id: property for property in self.properties.values() | ||||
|                 if property.is_required} | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -1,78 +0,0 @@ | ||||
| from xml.sax.saxutils import escape | ||||
|  | ||||
|  | ||||
| def create_vrt(text, stand_off_data): | ||||
|     # Devide annotations into CWB's verticalized text format (.vrt) logic | ||||
|     p_attrs = []    # positional attributes | ||||
|     s_attrs = []    # structural attributes | ||||
|     for annotation in stand_off_data.annotations: | ||||
|         if annotation.name == 'token': | ||||
|             p_attrs.append(annotation) | ||||
|         else: | ||||
|             s_attrs.append(annotation) | ||||
|     # Sort annotations, necessary for the next checks | ||||
|     p_attrs.sort() | ||||
|     s_attrs.sort() | ||||
|     # Check for p_attr<->p_attr overlap | ||||
|     for i, p_attr in enumerate(p_attrs[:-1]): | ||||
|         next_p_attr = p_attrs[i + 1] | ||||
|         # Check if first_p_attr starts/ends within second_p_attr | ||||
|         if ((p_attr.start >= next_p_attr.start) and (p_attr.start <= next_p_attr.end)  # noqa | ||||
|             or (p_attr.end >= next_p_attr.start) and (p_attr.end <= next_p_attr.end)):  # noqa | ||||
|             raise Exception('Positional attribute overlaps another') | ||||
|     # Check for s_attr<->p_attr overlap | ||||
|     for i, s_attr in enumerate(s_attrs): | ||||
|         for p_attr in p_attrs: | ||||
|             # Check if s_attr starts within p_attr | ||||
|             if s_attr.start > p_attr.start and s_attr.start < p_attr.end: | ||||
|                 # Change s_attr start to p_attr's start | ||||
|                 s_attrs[i].start = p_attr.start | ||||
|             # Check if s_attr ends within p_attr | ||||
|             if s_attr.end < p_attr.end and s_attr.end > p_attr.start: | ||||
|                 # Change s_attr end to p_attr's end | ||||
|                 s_attrs[i].end = p_attr.end | ||||
|             # Check if s_attr starts/ends before/after p_attr | ||||
|             if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start: | ||||
|                 # No further Checking needed (just because p_attrs are sorted) | ||||
|                 break | ||||
|     s_attr_start_buffer = {} | ||||
|     s_attr_end_buffer = {} | ||||
|     for i, s_attr in enumerate(s_attrs): | ||||
|         if s_attr_start_buffer[s_attr.start]: | ||||
|             s_attr_start_buffer[s_attr.start].append(i) | ||||
|         else: | ||||
|             s_attr_start_buffer[s_attr.start] = [i] | ||||
|         if s_attr_end_buffer[s_attr.end]: | ||||
|             s_attr_end_buffer[s_attr.end].append(i) | ||||
|         else: | ||||
|             s_attr_end_buffer[s_attr.end] = [i] | ||||
|     vrt = '' | ||||
|     vrt += '<text>\n' | ||||
|     for p_attr in p_attrs: | ||||
|         # s_attr_starts | ||||
|         for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}:  # noqa | ||||
|             s_attrs = s_attr_start_buffer.pop(k) | ||||
|             for s_attr in s_attrs: | ||||
|                 foo = '' | ||||
|                 for property in s_attr.properties: | ||||
|                     foo += ' {}="{}"'.format(escape(property.name), | ||||
|                                              escape(property.value)) | ||||
|                 vrt += '<{}{}>\n'.format(escape(s_attr.name), foo) | ||||
|         for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}:  # noqa | ||||
|             s_attrs = s_attr_end_buffer.pop(k) | ||||
|             for s_attr in s_attrs: | ||||
|                 vrt += '</{}>\n'.format(escape(s_attr.name)) | ||||
|         # s_attr_ends | ||||
|         foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None}  # noqa | ||||
|         for property in p_attrs.properties: | ||||
|             if property.name == 'lemma': | ||||
|                 foo['lemma'] = escape(property.value) | ||||
|             elif property.name == 'ner': | ||||
|                 foo['ner'] = escape(property.value) | ||||
|             elif property.name == 'pos': | ||||
|                 foo['pos'] = escape(property.value) | ||||
|             elif property.name == 'simple_pos': | ||||
|                 foo['simple_pos'] = escape(property.value) | ||||
|         foo['word'] = escape(text[p_attr.start:p_attr.end]) | ||||
|         vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(**foo) | ||||
|     vrt += '</text>\n' | ||||
							
								
								
									
										106
									
								
								vrt-creator
									
									
									
									
									
								
							
							
						
						
									
										106
									
								
								vrt-creator
									
									
									
									
									
								
							| @@ -2,96 +2,12 @@ | ||||
| # coding=utf-8 | ||||
|  | ||||
| from argparse import ArgumentParser | ||||
| from xml.sax.saxutils import escape | ||||
| from stand_off_data import StandOffData | ||||
| import hashlib | ||||
| import json | ||||
|  | ||||
|  | ||||
| # Two global ressources - Not very elegant but it works for now | ||||
| stand_off_data = None | ||||
| text = None | ||||
|  | ||||
|  | ||||
| def meta_to_string(): | ||||
|     string = '' | ||||
|     string += '<generator software="{} ({})" arguments="check_encoding: {}; language: {}"/>\n'.format(  # noqa | ||||
|         stand_off_data['meta']['generator']['name'], | ||||
|         stand_off_data['meta']['generator']['version'], | ||||
|         stand_off_data['meta']['generator']['arguments']['check_encoding'], | ||||
|         stand_off_data['meta']['generator']['arguments']['language'] | ||||
|     ) | ||||
|     string += '<file encoding="{}" name="{}" md5="{}"/>\n'.format( | ||||
|         stand_off_data['meta']['file']['encoding'], | ||||
|         stand_off_data['meta']['file']['name'], | ||||
|         stand_off_data['meta']['file']['md5'] | ||||
|     ) | ||||
|     return string | ||||
|  | ||||
|  | ||||
| def tags_to_string(): | ||||
|     return '' | ||||
|  | ||||
|  | ||||
| def annotations_to_string(end=float('inf')): | ||||
|     string = '' | ||||
|     while stand_off_data['annotations']: | ||||
|         if stand_off_data['annotations'][0]['start'] >= end: | ||||
|             break | ||||
|         annotation = stand_off_data['annotations'].pop(0) | ||||
|         ####################################################################### | ||||
|         # Check for malformed annotations                                     # | ||||
|         ####################################################################### | ||||
|         if 'tag' not in annotation: | ||||
|             raise Exception('Annotation tag is missing') | ||||
|  | ||||
|         if annotation['tag'] not in stand_off_data['tags']: | ||||
|             raise Exception('Unknown annotation tag: ' + annotation['tag']) | ||||
|  | ||||
|         tag_model = stand_off_data['tags'][annotation['tag']] | ||||
|         if 'properties' in tag_model: | ||||
|             properties_model = tag_model['properties'] | ||||
|             if properties_model is not None: | ||||
|                 required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model)  # noqa | ||||
|                 if required_properties and annotation['properties'] is None: | ||||
|                     raise Exception('There are required properties but the "Properties" attribute is missing')  # noqa | ||||
|                 for property in required_properties: | ||||
|                     if property not in annotation['properties']: | ||||
|                         raise Exception('Required property is missing: ' + property)  # noqa | ||||
|         ####################################################################### | ||||
|         # Process tokens ~ cwb's positional attributes                        # | ||||
|         ####################################################################### | ||||
|         if annotation['tag'] == 'token': | ||||
|             string += '{}\t{}\t{}\t{}\t{}\n'.format( | ||||
|                 escape(text[annotation['start']:annotation['end']]), | ||||
|                 escape(annotation['properties']['pos']), | ||||
|                 escape(annotation['properties']['lemma']), | ||||
|                 escape(annotation['properties']['simple_pos']), | ||||
|                 escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None')  # noqa | ||||
|             ) | ||||
|         ####################################################################### | ||||
|         # Process other tags ~ cwb's structural attributes                    # | ||||
|         ####################################################################### | ||||
|         else: | ||||
|             properties = '' | ||||
|             if 'properties' in annotation and annotation['properties'] is not None:  # noqa | ||||
|                 for property, value in annotation['properties'].items(): | ||||
|                     if not value: | ||||
|                         continue | ||||
|                     if properties_model and property in properties_model: | ||||
|                         if 'flags' in properties_model and 'multiple' in properties_model['flags']:  # noqa | ||||
|                             properties += ' {}="|{}|"'.format(property, '|'.join(value))  # noqa | ||||
|                         else: | ||||
|                             properties += ' {}="{}"'.format(property, value) | ||||
|             string += '<' + annotation['tag'] + properties + '>\n' | ||||
|             string += annotations_to_string(end=min(annotation['end'], end)) | ||||
|             string += '</' + annotation['tag'] + '>\n' | ||||
|     return string | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|     global stand_off_data | ||||
|     global text | ||||
|  | ||||
|     # Parse the given arguments | ||||
|     parser = ArgumentParser(description='Create a vrt from JSON and txt') | ||||
|     parser.add_argument('text', help='Path to txt file') | ||||
| @@ -100,31 +16,15 @@ def main(): | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     with open(args.stand_off_data) as stand_of_data_file: | ||||
|         stand_off_data = json.load(stand_of_data_file) | ||||
|         stand_off_data = StandOffData(json.load(stand_of_data_file)) | ||||
|  | ||||
|     with open(args.text, "rb") as text_file: | ||||
|         text_md5 = hashlib.md5() | ||||
|         for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa | ||||
|             text_md5.update(chunk) | ||||
|         if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']: | ||||
|         if text_md5.hexdigest() != stand_off_data.meta['file']['md5']: | ||||
|             raise Exception('md5 not equal') | ||||
|  | ||||
|     with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa | ||||
|         text = text_file.read() | ||||
|  | ||||
|     vrt = '' | ||||
|     vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n' | ||||
|     vrt += '<corpus>\n' | ||||
|     vrt += '<text>\n' | ||||
|     vrt += meta_to_string() | ||||
|     vrt += tags_to_string() | ||||
|     vrt += annotations_to_string() | ||||
|     vrt += '</text>\n' | ||||
|     vrt += '</corpus>' | ||||
|  | ||||
|     with open(args.output, 'w') as vrt_file: | ||||
|         vrt_file.write(vrt) | ||||
|  | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     main() | ||||
|   | ||||
							
								
								
									
										130
									
								
								vrt-creator.bak
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										130
									
								
								vrt-creator.bak
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,130 @@ | ||||
| #!/usr/bin/env python3.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| from argparse import ArgumentParser | ||||
| from xml.sax.saxutils import escape | ||||
| import hashlib | ||||
| import json | ||||
|  | ||||
|  | ||||
| # Two global resources - not very elegant, but it works for now | ||||
| stand_off_data = None | ||||
| text = None | ||||
|  | ||||
|  | ||||
def meta_to_string():
    """Render generator and file metadata as two self-closing tags.

    Reads the module-level ``stand_off_data`` dict and uses its
    ``meta.generator`` (name, version, arguments.check_encoding,
    arguments.language) and ``meta.file`` (encoding, name, md5) entries.

    Returns:
        A string with a ``<generator .../>`` line followed by a
        ``<file .../>`` line, each terminated by a newline.

    Raises:
        KeyError: If one of the metadata entries is missing.
    """
    def attr(value):
        # The values end up inside double-quoted XML attributes, so markup
        # characters and the quote itself must be escaped (a file name may
        # contain any of them). str() keeps non-string values working.
        return escape(str(value), {'"': '&quot;'})

    meta = stand_off_data['meta']
    generator = meta['generator']
    arguments = generator['arguments']
    file_meta = meta['file']
    generator_line = '<generator software="{} ({})" arguments="check_encoding: {}; language: {}"/>\n'.format(  # noqa
        attr(generator['name']),
        attr(generator['version']),
        attr(arguments['check_encoding']),
        attr(arguments['language'])
    )
    file_line = '<file encoding="{}" name="{}" md5="{}"/>\n'.format(
        attr(file_meta['encoding']),
        attr(file_meta['name']),
        attr(file_meta['md5'])
    )
    return generator_line + file_line
|  | ||||
|  | ||||
def tags_to_string():
    """Return the serialized tag definitions, which is always empty here."""
    tag_lines = []  # nothing is emitted for the tag definitions
    return ''.join(tag_lines)
|  | ||||
|  | ||||
def annotations_to_string(end=float('inf')):
    """Serialize the pending annotations that start before *end*.

    Consumes the module-level ``stand_off_data['annotations']`` list from
    the front and reads the module-level ``text``. ``token`` annotations
    become one tab separated line (word, pos, lemma, simple_pos, ner). Any
    other annotation becomes an opening tag, followed by the recursively
    serialized annotations that start before its end, followed by the
    closing tag.

    NOTE(review): the property model of a tag is assumed to be a dict that
    maps a property name to a dict with an optional ``flags`` list
    (``required``, ``multiple``) - confirm against the JSON producer.

    Args:
        end: Only annotations starting before this offset are consumed.

    Returns:
        The serialized annotations as a string.

    Raises:
        Exception: If an annotation has no tag, an unknown tag or lacks a
            required property.
    """
    def flags_of(model):
        # A property model without a 'flags' entry simply has no flags
        return (model or {}).get('flags') or ()

    def attr(value):
        # Values are written into double-quoted attributes
        return escape(str(value), {'"': '&quot;'})

    annotations = stand_off_data['annotations']
    tags = stand_off_data['tags']
    lines = []
    while annotations and annotations[0]['start'] < end:
        annotation = annotations.pop(0)
        #######################################################################
        # Check for malformed annotations                                     #
        #######################################################################
        if 'tag' not in annotation:
            raise Exception('Annotation tag is missing')
        tag = annotation['tag']
        if tag not in tags:
            raise Exception('Unknown annotation tag: ' + tag)
        # Determined anew for every annotation, so a tag without a property
        # model can neither fail nor reuse the model of a previous tag
        properties_model = tags[tag].get('properties') or {}
        properties = annotation.get('properties') or {}
        # A real list: a filter object would always be truthy
        required_properties = [name for name, model in properties_model.items()
                               if 'required' in flags_of(model)]
        if required_properties and annotation.get('properties') is None:
            raise Exception('There are required properties but the "Properties" attribute is missing')  # noqa
        for name in required_properties:
            if name not in properties:
                raise Exception('Required property is missing: ' + name)
        #######################################################################
        # Process tokens ~ cwb's positional attributes                        #
        #######################################################################
        if tag == 'token':
            lines.append('{}\t{}\t{}\t{}\t{}\n'.format(
                escape(text[annotation['start']:annotation['end']]),
                escape(properties['pos']),
                escape(properties['lemma']),
                escape(properties['simple_pos']),
                escape(properties['ner'] if 'ner' in properties else 'None')
            ))
            continue
        #######################################################################
        # Process other tags ~ cwb's structural attributes                    #
        #######################################################################
        rendered = ''
        for name, value in properties.items():
            # Empty values and properties unknown to the model are skipped
            if not value or name not in properties_model:
                continue
            if 'multiple' in flags_of(properties_model[name]):
                # Multi-valued properties use cwb's feature set notation
                rendered += ' {}="|{}|"'.format(
                    name, attr('|'.join(map(str, value))))
            else:
                rendered += ' {}="{}"'.format(name, attr(value))
        lines.append('<' + tag + rendered + '>\n')
        # Everything starting inside this annotation is nested into it
        lines.append(annotations_to_string(end=min(annotation['end'], end)))
        lines.append('</' + tag + '>\n')
    return ''.join(lines)
|  | ||||
|  | ||||
def main():
    """Create a vrt file from a text file and its stand-off JSON.

    Command line arguments: path of the text file, path of the JSON file
    and path of the vrt file to write. The parsed JSON and the text are
    stored in the module-level ``stand_off_data`` and ``text`` variables,
    which the ``*_to_string`` functions read.

    Raises:
        Exception: If the md5 checksum of the text file differs from the
            one recorded in the JSON metadata.
    """
    global stand_off_data
    global text

    # Parse the given arguments
    parser = ArgumentParser(description='Create a vrt from JSON and txt')
    parser.add_argument('text', help='Path to txt file')
    parser.add_argument('stand_off_data', help='Path to JSON file')
    parser.add_argument('output', help='Path to vrt output file')
    args = parser.parse_args()

    # JSON is read as UTF-8 instead of whatever the locale dictates
    with open(args.stand_off_data, encoding='utf-8') as stand_off_data_file:
        stand_off_data = json.load(stand_off_data_file)
    file_meta = stand_off_data['meta']['file']

    # Make sure the text is the one the annotations were created for
    text_md5 = hashlib.md5()
    with open(args.text, 'rb') as text_file:
        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
            text_md5.update(chunk)
    if text_md5.hexdigest() != file_meta['md5']:
        raise Exception('md5 not equal')

    with open(args.text, encoding=file_meta['encoding']) as text_file:
        text = text_file.read()

    vrt = ''.join([
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n',
        '<corpus>\n',
        '<text>\n',
        meta_to_string(),
        tags_to_string(),
        annotations_to_string(),
        '</text>\n',
        '</corpus>',
    ])

    # The XML declaration promises UTF-8, so the file is written as UTF-8
    with open(args.output, 'w', encoding='utf-8') as vrt_file:
        vrt_file.write(vrt)
|  | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     main() | ||||
		Reference in New Issue
	
	Block a user