mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-31 11:12:43 +00:00 
			
		
		
		
	WIP: use the new stand_off_data package in vrt-creator
This commit is contained in:
		
							
								
								
									
										80
									
								
								packages/stand-off-data-py/stand_off_data/export.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								packages/stand-off-data-py/stand_off_data/export.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | |||||||
|  | from xml.sax.saxutils import escape | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ExportMixin: | ||||||
|  |     def to_vrt(self, text): | ||||||
|  |         # Devide annotations into CWB's verticalized text format (.vrt) logic | ||||||
|  |         p_attrs = []    # positional attributes | ||||||
|  |         s_attrs = []    # structural attributes | ||||||
|  |         for annotation in self.annotations: | ||||||
|  |             if annotation.name == 'token': | ||||||
|  |                 p_attrs.append(annotation) | ||||||
|  |             else: | ||||||
|  |                 s_attrs.append(annotation) | ||||||
|  |         # Sort annotations, necessary for the next checks | ||||||
|  |         p_attrs.sort() | ||||||
|  |         s_attrs.sort() | ||||||
|  |         # Check for p_attr<->p_attr overlap | ||||||
|  |         for i, p_attr in enumerate(p_attrs[:-1]): | ||||||
|  |             next_p_attr = p_attrs[i + 1] | ||||||
|  |             # Check if first_p_attr starts/ends within second_p_attr | ||||||
|  |             if ((p_attr.start >= next_p_attr.start) and (p_attr.start <= next_p_attr.end)  # noqa | ||||||
|  |                 or (p_attr.end >= next_p_attr.start) and (p_attr.end <= next_p_attr.end)):  # noqa | ||||||
|  |                 raise Exception('Positional attribute overlaps another') | ||||||
|  |         # Check for s_attr<->p_attr overlap | ||||||
|  |         for i, s_attr in enumerate(s_attrs): | ||||||
|  |             for p_attr in p_attrs: | ||||||
|  |                 # Check if s_attr starts within p_attr | ||||||
|  |                 if s_attr.start > p_attr.start and s_attr.start < p_attr.end: | ||||||
|  |                     # Change s_attr start to p_attr's start | ||||||
|  |                     s_attrs[i].start = p_attr.start | ||||||
|  |                 # Check if s_attr ends within p_attr | ||||||
|  |                 if s_attr.end < p_attr.end and s_attr.end > p_attr.start: | ||||||
|  |                     # Change s_attr end to p_attr's end | ||||||
|  |                     s_attrs[i].end = p_attr.end | ||||||
|  |                 # Check if s_attr starts/ends before/after p_attr | ||||||
|  |                 if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start: | ||||||
|  |                     # No further Checking needed (just because p_attrs are sorted) | ||||||
|  |                     break | ||||||
|  |         s_attr_start_buffer = {} | ||||||
|  |         s_attr_end_buffer = {} | ||||||
|  |         for i, s_attr in enumerate(s_attrs): | ||||||
|  |             if s_attr_start_buffer[s_attr.start]: | ||||||
|  |                 s_attr_start_buffer[s_attr.start].append(i) | ||||||
|  |             else: | ||||||
|  |                 s_attr_start_buffer[s_attr.start] = [i] | ||||||
|  |             if s_attr_end_buffer[s_attr.end]: | ||||||
|  |                 s_attr_end_buffer[s_attr.end].append(i) | ||||||
|  |             else: | ||||||
|  |                 s_attr_end_buffer[s_attr.end] = [i] | ||||||
|  |         vrt = '' | ||||||
|  |         vrt += '<text>\n' | ||||||
|  |         for p_attr in p_attrs: | ||||||
|  |             # s_attr_starts | ||||||
|  |             for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}:  # noqa | ||||||
|  |                 s_attrs = s_attr_start_buffer.pop(k) | ||||||
|  |                 for s_attr in s_attrs: | ||||||
|  |                     foo = '' | ||||||
|  |                     for property in s_attr.properties: | ||||||
|  |                         foo += ' {}="{}"'.format(escape(property.name), | ||||||
|  |                                                  escape(property.value)) | ||||||
|  |                     vrt += '<{}{}>\n'.format(escape(s_attr.name), foo) | ||||||
|  |             for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}:  # noqa | ||||||
|  |                 s_attrs = s_attr_end_buffer.pop(k) | ||||||
|  |                 for s_attr in s_attrs: | ||||||
|  |                     vrt += '</{}>\n'.format(escape(s_attr.name)) | ||||||
|  |             # s_attr_ends | ||||||
|  |             foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None}  # noqa | ||||||
|  |             for property in p_attrs.properties: | ||||||
|  |                 if property.name == 'lemma': | ||||||
|  |                     foo['lemma'] = escape(property.value) | ||||||
|  |                 elif property.name == 'ner': | ||||||
|  |                     foo['ner'] = escape(property.value) | ||||||
|  |                 elif property.name == 'pos': | ||||||
|  |                     foo['pos'] = escape(property.value) | ||||||
|  |                 elif property.name == 'simple_pos': | ||||||
|  |                     foo['simple_pos'] = escape(property.value) | ||||||
|  |             foo['word'] = escape(text[p_attr.start:p_attr.end]) | ||||||
|  |             vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format( | ||||||
|  |                 **foo) | ||||||
|  |         vrt += '</text>\n' | ||||||
| @@ -1,4 +1,7 @@ | |||||||
| class StandOffData: | from .export import ExportMixin | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class StandOffData(ExportMixin): | ||||||
|     def __init__(self, attrs): |     def __init__(self, attrs): | ||||||
|         self.meta = attrs.get('meta', {}) |         self.meta = attrs.get('meta', {}) | ||||||
|         self.lookup = {tag_definition.id: tag_definition for tag_definition in |         self.lookup = {tag_definition.id: tag_definition for tag_definition in | ||||||
| @@ -69,7 +72,7 @@ class PropertyAnnotation: | |||||||
|         self.tag_id = attrs['tag_id'] |         self.tag_id = attrs['tag_id'] | ||||||
|         if self.property_id not in self.lookup[self.tag_id].properties: |         if self.property_id not in self.lookup[self.tag_id].properties: | ||||||
|             raise Exception('Unknown property id: {}'.format(self.property_id)) |             raise Exception('Unknown property id: {}'.format(self.property_id)) | ||||||
|         self.value = property['value'] |         self.value = attrs['value'] | ||||||
|         # TODO: Process attrs['possibleValues'] as self.labels (no id?) |         # TODO: Process attrs['possibleValues'] as self.labels (no id?) | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
| @@ -91,7 +94,7 @@ class TagDefinition: | |||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def required_properties(self): |     def required_properties(self): | ||||||
|         return {property.id: property for property in self.properties |         return {property.id: property for property in self.properties.values() | ||||||
|                 if property.is_required} |                 if property.is_required} | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
from xml.sax.saxutils import escape


def create_vrt(text, stand_off_data):
    """Serialize the annotations of *stand_off_data* over *text* into
    CWB's verticalized text format (.vrt) and return it as a string.

    'token' annotations become positional attributes (one tab-separated
    line: word, pos, lemma, simple_pos, ner); all other annotations
    become structural attributes (XML-like tags around the tokens they
    span).  Raises Exception if two positional attributes overlap.
    """
    # Divide annotations into positional and structural attributes
    p_attrs = []    # positional attributes
    s_attrs = []    # structural attributes
    for annotation in stand_off_data.annotations:
        if annotation.name == 'token':
            p_attrs.append(annotation)
        else:
            s_attrs.append(annotation)
    # Sort annotations, necessary for the next checks
    p_attrs.sort()
    s_attrs.sort()
    # Check for p_attr<->p_attr overlap
    for i, p_attr in enumerate(p_attrs[:-1]):
        next_p_attr = p_attrs[i + 1]
        if (next_p_attr.start <= p_attr.start <= next_p_attr.end
                or next_p_attr.start <= p_attr.end <= next_p_attr.end):
            raise Exception('Positional attribute overlaps another')
    # Snap structural attribute boundaries outwards to token boundaries
    for s_attr in s_attrs:
        for p_attr in p_attrs:
            if p_attr.start >= s_attr.end:
                # p_attrs are sorted: no later token can overlap either
                break
            if p_attr.end <= s_attr.start:
                # Token entirely before s_attr; check the next one
                # (BUGFIX: this previously broke out of the scan early)
                continue
            if p_attr.start < s_attr.start < p_attr.end:
                s_attr.start = p_attr.start
            if p_attr.start < s_attr.end < p_attr.end:
                s_attr.end = p_attr.end
    # Map text offsets to indices of s_attrs opening/closing there.
    # (BUGFIX: plain indexing raised KeyError on first access.)
    s_attr_start_buffer = {}
    s_attr_end_buffer = {}
    for i, s_attr in enumerate(s_attrs):
        s_attr_start_buffer.setdefault(s_attr.start, []).append(i)
        s_attr_end_buffer.setdefault(s_attr.end, []).append(i)
    vrt = '<text>\n'
    for p_attr in p_attrs:
        # Close structural attributes ending at/before this token first,
        # keeping the tags properly nested.
        for k in [k for k in s_attr_end_buffer if k <= p_attr.start]:
            for i in s_attr_end_buffer.pop(k):
                vrt += '</{}>\n'.format(escape(s_attrs[i].name))
        # Open structural attributes starting at/before this token.
        # (BUGFIX: buffers hold indices, not annotation objects.)
        for k in [k for k in s_attr_start_buffer if k <= p_attr.start]:
            for i in s_attr_start_buffer.pop(k):
                s_attr = s_attrs[i]
                attribs = ''
                for prop in s_attr.properties:
                    # Escape '"' too: the value is an XML attribute
                    attribs += ' {}="{}"'.format(
                        escape(prop.name),
                        escape(prop.value, {'"': '&quot;'})
                    )
                vrt += '<{}{}>\n'.format(escape(s_attr.name), attribs)
        # One tab-separated positional line per token
        values = {'lemma': None, 'ner': None, 'pos': None,
                  'simple_pos': None}
        # BUGFIX: was `p_attrs.properties` (the list, not the token)
        for prop in p_attr.properties:
            if prop.name in values:
                values[prop.name] = escape(prop.value)
        values['word'] = escape(text[p_attr.start:p_attr.end])
        vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(**values)
    # BUGFIX: close structural attributes still open after the last token
    for k in sorted(s_attr_end_buffer):
        for i in s_attr_end_buffer.pop(k):
            vrt += '</{}>\n'.format(escape(s_attrs[i].name))
    vrt += '</text>\n'
    # BUGFIX: the original built vrt but never returned it
    return vrt
							
								
								
									
										106
									
								
								vrt-creator
									
									
									
									
									
								
							
							
						
						
									
										106
									
								
								vrt-creator
									
									
									
									
									
								
							| @@ -2,96 +2,12 @@ | |||||||
| # coding=utf-8 | # coding=utf-8 | ||||||
|  |  | ||||||
| from argparse import ArgumentParser | from argparse import ArgumentParser | ||||||
| from xml.sax.saxutils import escape | from stand_off_data import StandOffData | ||||||
| import hashlib | import hashlib | ||||||
| import json | import json | ||||||
|  |  | ||||||
|  |  | ||||||
| # Two global ressources - Not very elegant but it works for now |  | ||||||
| stand_off_data = None |  | ||||||
| text = None |  | ||||||
|  |  | ||||||
|  |  | ||||||
def meta_to_string():
    """Render the <generator/> and <file/> header tags from the global
    ``stand_off_data`` metadata."""
    generator = stand_off_data['meta']['generator']
    file_info = stand_off_data['meta']['file']
    generator_tag = (
        '<generator software="{} ({})" arguments="check_encoding: {}'
        '; language: {}"/>\n'
    ).format(
        generator['name'],
        generator['version'],
        generator['arguments']['check_encoding'],
        generator['arguments']['language'],
    )
    file_tag = '<file encoding="{}" name="{}" md5="{}"/>\n'.format(
        file_info['encoding'],
        file_info['name'],
        file_info['md5'],
    )
    return generator_tag + file_tag
|  |  | ||||||
|  |  | ||||||
def tags_to_string():
    """Placeholder: tag definitions are not serialized yet, so the vrt
    header contains no tag section."""
    return ''
|  |  | ||||||
|  |  | ||||||
def annotations_to_string(end=float('inf')):
    """Render annotations from the global ``stand_off_data`` as vrt text.

    Consumes (pops) entries from stand_off_data['annotations'] in order
    until one starts at or beyond *end*.  'token' annotations become one
    tab-separated positional line each; any other tag is emitted as a
    structural XML element, recursing so that annotations contained in
    its span are nested inside it.  NOTE: destructively mutates the
    global annotation list.
    """
    string = ''
    while stand_off_data['annotations']:
        # Annotations starting at/after *end* belong to an outer
        # recursion level; leave them in the list and return.
        if stand_off_data['annotations'][0]['start'] >= end:
            break
        annotation = stand_off_data['annotations'].pop(0)
        #######################################################################
        # Check for malformed annotations                                     #
        #######################################################################
        if 'tag' not in annotation:
            raise Exception('Annotation tag is missing')

        if annotation['tag'] not in stand_off_data['tags']:
            raise Exception('Unknown annotation tag: ' + annotation['tag'])

        tag_model = stand_off_data['tags'][annotation['tag']]
        if 'properties' in tag_model:
            properties_model = tag_model['properties']
            if properties_model is not None:
                # NOTE(review): filter() yields a lazy object that is always
                # truthy, so the guard below fires even when no property is
                # actually required — confirm intent.
                required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model)  # noqa
                if required_properties and annotation['properties'] is None:
                    raise Exception('There are required properties but the "Properties" attribute is missing')  # noqa
                for property in required_properties:
                    # NOTE(review): membership is tested against the model
                    # entry itself rather than a property name — verify the
                    # expected schema of annotation['properties'].
                    if property not in annotation['properties']:
                        raise Exception('Required property is missing: ' + property)  # noqa
        #######################################################################
        # Process tokens ~ cwb's positional attributes                        #
        #######################################################################
        if annotation['tag'] == 'token':
            string += '{}\t{}\t{}\t{}\t{}\n'.format(
                escape(text[annotation['start']:annotation['end']]),
                escape(annotation['properties']['pos']),
                escape(annotation['properties']['lemma']),
                escape(annotation['properties']['simple_pos']),
                escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None')  # noqa
            )
        #######################################################################
        # Process other tags ~ cwb's structural attributes                    #
        #######################################################################
        else:
            properties = ''
            if 'properties' in annotation and annotation['properties'] is not None:  # noqa
                for property, value in annotation['properties'].items():
                    if not value:
                        continue
                    # NOTE(review): properties_model may be unbound, or stale
                    # from a previous loop iteration, when this tag's model
                    # has no 'properties' entry — confirm.
                    if properties_model and property in properties_model:
                        if 'flags' in properties_model and 'multiple' in properties_model['flags']:  # noqa
                            properties += ' {}="|{}|"'.format(property, '|'.join(value))  # noqa
                        else:
                            properties += ' {}="{}"'.format(property, value)
            string += '<' + annotation['tag'] + properties + '>\n'
            # Recurse: nest everything that starts before this tag ends.
            string += annotations_to_string(end=min(annotation['end'], end))
            string += '</' + annotation['tag'] + '>\n'
    return string
|  |  | ||||||
|  |  | ||||||
| def main(): | def main(): | ||||||
|     global stand_off_data |  | ||||||
|     global text |  | ||||||
|  |  | ||||||
|     # Parse the given arguments |     # Parse the given arguments | ||||||
|     parser = ArgumentParser(description='Create a vrt from JSON and txt') |     parser = ArgumentParser(description='Create a vrt from JSON and txt') | ||||||
|     parser.add_argument('text', help='Path to txt file') |     parser.add_argument('text', help='Path to txt file') | ||||||
| @@ -100,31 +16,15 @@ def main(): | |||||||
|     args = parser.parse_args() |     args = parser.parse_args() | ||||||
|  |  | ||||||
|     with open(args.stand_off_data) as stand_of_data_file: |     with open(args.stand_off_data) as stand_of_data_file: | ||||||
|         stand_off_data = json.load(stand_of_data_file) |         stand_off_data = StandOffData(json.load(stand_of_data_file)) | ||||||
|  |  | ||||||
|     with open(args.text, "rb") as text_file: |     with open(args.text, "rb") as text_file: | ||||||
|         text_md5 = hashlib.md5() |         text_md5 = hashlib.md5() | ||||||
|         for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa |         for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa | ||||||
|             text_md5.update(chunk) |             text_md5.update(chunk) | ||||||
|         if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']: |         if text_md5.hexdigest() != stand_off_data.meta['file']['md5']: | ||||||
|             raise Exception('md5 not equal') |             raise Exception('md5 not equal') | ||||||
|  |  | ||||||
|     with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa |  | ||||||
|         text = text_file.read() |  | ||||||
|  |  | ||||||
|     vrt = '' |  | ||||||
|     vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n' |  | ||||||
|     vrt += '<corpus>\n' |  | ||||||
|     vrt += '<text>\n' |  | ||||||
|     vrt += meta_to_string() |  | ||||||
|     vrt += tags_to_string() |  | ||||||
|     vrt += annotations_to_string() |  | ||||||
|     vrt += '</text>\n' |  | ||||||
|     vrt += '</corpus>' |  | ||||||
|  |  | ||||||
|     with open(args.output, 'w') as vrt_file: |  | ||||||
|         vrt_file.write(vrt) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||||
|     main() |     main() | ||||||
|   | |||||||
							
								
								
									
										130
									
								
								vrt-creator.bak
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										130
									
								
								vrt-creator.bak
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,130 @@ | |||||||
|  | #!/usr/bin/env python3.7 | ||||||
|  | # coding=utf-8 | ||||||
|  |  | ||||||
|  | from argparse import ArgumentParser | ||||||
|  | from xml.sax.saxutils import escape | ||||||
|  | import hashlib | ||||||
|  | import json | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # Two global ressources - Not very elegant but it works for now | ||||||
|  | stand_off_data = None | ||||||
|  | text = None | ||||||
|  |  | ||||||
|  |  | ||||||
def meta_to_string():
    """Render the <generator/> and <file/> header tags from the global
    ``stand_off_data`` metadata."""
    string = ''
    string += '<generator software="{} ({})" arguments="check_encoding: {}; language: {}"/>\n'.format(  # noqa
        stand_off_data['meta']['generator']['name'],
        stand_off_data['meta']['generator']['version'],
        stand_off_data['meta']['generator']['arguments']['check_encoding'],
        stand_off_data['meta']['generator']['arguments']['language']
    )
    string += '<file encoding="{}" name="{}" md5="{}"/>\n'.format(
        stand_off_data['meta']['file']['encoding'],
        stand_off_data['meta']['file']['name'],
        stand_off_data['meta']['file']['md5']
    )
    return string
|  |  | ||||||
|  |  | ||||||
def tags_to_string():
    # Stub kept for the vrt assembly sequence in main(); tag definitions
    # are not serialized yet.
    return ''
|  |  | ||||||
|  |  | ||||||
def annotations_to_string(end=float('inf')):
    """Render annotations from the global ``stand_off_data`` as vrt text.

    Pops entries from stand_off_data['annotations'] in order until one
    starts at or beyond *end*.  'token' annotations become one
    tab-separated positional line each; other tags are emitted as
    structural XML elements, recursing so contained annotations nest
    inside them.  NOTE: destructively mutates the global list.
    """
    string = ''
    while stand_off_data['annotations']:
        # Annotations starting at/after *end* belong to an outer
        # recursion level; leave them in the list and return.
        if stand_off_data['annotations'][0]['start'] >= end:
            break
        annotation = stand_off_data['annotations'].pop(0)
        #######################################################################
        # Check for malformed annotations                                     #
        #######################################################################
        if 'tag' not in annotation:
            raise Exception('Annotation tag is missing')

        if annotation['tag'] not in stand_off_data['tags']:
            raise Exception('Unknown annotation tag: ' + annotation['tag'])

        tag_model = stand_off_data['tags'][annotation['tag']]
        if 'properties' in tag_model:
            properties_model = tag_model['properties']
            if properties_model is not None:
                # NOTE(review): filter() yields a lazy object that is always
                # truthy, so the guard below fires even when no property is
                # actually required — confirm intent.
                required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model)  # noqa
                if required_properties and annotation['properties'] is None:
                    raise Exception('There are required properties but the "Properties" attribute is missing')  # noqa
                for property in required_properties:
                    # NOTE(review): membership is tested against the model
                    # entry itself rather than a property name — verify the
                    # expected schema of annotation['properties'].
                    if property not in annotation['properties']:
                        raise Exception('Required property is missing: ' + property)  # noqa
        #######################################################################
        # Process tokens ~ cwb's positional attributes                        #
        #######################################################################
        if annotation['tag'] == 'token':
            string += '{}\t{}\t{}\t{}\t{}\n'.format(
                escape(text[annotation['start']:annotation['end']]),
                escape(annotation['properties']['pos']),
                escape(annotation['properties']['lemma']),
                escape(annotation['properties']['simple_pos']),
                escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None')  # noqa
            )
        #######################################################################
        # Process other tags ~ cwb's structural attributes                    #
        #######################################################################
        else:
            properties = ''
            if 'properties' in annotation and annotation['properties'] is not None:  # noqa
                for property, value in annotation['properties'].items():
                    if not value:
                        continue
                    # NOTE(review): properties_model may be unbound, or stale
                    # from a previous loop iteration, when this tag's model
                    # has no 'properties' entry — confirm.
                    if properties_model and property in properties_model:
                        if 'flags' in properties_model and 'multiple' in properties_model['flags']:  # noqa
                            properties += ' {}="|{}|"'.format(property, '|'.join(value))  # noqa
                        else:
                            properties += ' {}="{}"'.format(property, value)
            string += '<' + annotation['tag'] + properties + '>\n'
            # Recurse: nest everything that starts before this tag ends.
            string += annotations_to_string(end=min(annotation['end'], end))
            string += '</' + annotation['tag'] + '>\n'
    return string
|  |  | ||||||
|  |  | ||||||
def main():
    """Command-line entry point: read a txt file and its stand-off JSON
    annotations, verify the text's md5 against the metadata, and write
    the combined vrt output file."""
    global stand_off_data
    global text

    # Parse the given arguments
    parser = ArgumentParser(description='Create a vrt from JSON and txt')
    parser.add_argument('text', help='Path to txt file')
    parser.add_argument('stand_off_data', help='Path to JSON file')
    parser.add_argument('output', help='Path to vrt output file')
    args = parser.parse_args()

    with open(args.stand_off_data) as stand_of_data_file:
        stand_off_data = json.load(stand_of_data_file)

    # Hash the raw bytes in chunks and compare against the md5 recorded
    # in the stand-off metadata, ensuring text and annotations match.
    with open(args.text, "rb") as text_file:
        text_md5 = hashlib.md5()
        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
            text_md5.update(chunk)
        if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
            raise Exception('md5 not equal')

    # Re-read the text decoded with the encoding from the metadata
    with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa
        text = text_file.read()

    # Assemble the full vrt document: XML prolog, corpus/text wrapper,
    # metadata header, (empty) tag section, then the annotations body.
    vrt = ''
    vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
    vrt += '<corpus>\n'
    vrt += '<text>\n'
    vrt += meta_to_string()
    vrt += tags_to_string()
    vrt += annotations_to_string()
    vrt += '</text>\n'
    vrt += '</corpus>'

    with open(args.output, 'w') as vrt_file:
        vrt_file.write(vrt)
|  |  | ||||||
|  |  | ||||||
|  | if __name__ == '__main__': | ||||||
|  |     main() | ||||||
		Reference in New Issue
	
	Block a user