#!/usr/bin/env python3.7
# coding=utf-8
"""Create a vrt file from a txt file and its JSON stand-off annotations.

The JSON document is read into the module-level ``stand_off_data`` and the
annotated text into ``text``; the ``*_to_string`` helpers render them.
"""

from argparse import ArgumentParser
from xml.sax.saxutils import escape
import hashlib
import json


# Two global resources - Not very elegant but it works for now
stand_off_data = None
text = None


def meta_to_string():
    """Render the ``meta`` section of ``stand_off_data``.

    Raises:
        KeyError: if one of the accessed ``meta`` entries is missing.
    """
    generator = stand_off_data['meta']['generator']
    file_meta = stand_off_data['meta']['file']
    string = ''
    # NOTE(review): this format string has no placeholders, so all four
    # arguments are discarded and only a newline is emitted. The literal
    # presumably lost its markup at some point - restore it from the
    # upstream template before relying on this output.
    string += '\n'.format(  # noqa
        generator['name'],
        generator['version'],
        generator['arguments']['check_encoding'],
        generator['arguments']['language']
    )
    # NOTE(review): same as above - the arguments are never rendered.
    string += '\n'.format(
        file_meta['encoding'],
        file_meta['name'],
        file_meta['md5']
    )
    return string


def tags_to_string():
    """Render the tag definitions; currently always an empty string."""
    return ''


def _required_property_names(properties_model):
    """Return the names of all properties flagged as ``required``.

    Assumes ``properties_model`` maps property names to dicts that may carry
    a ``flags`` list - the same shape the attribute rendering relies on.
    """
    return [
        name for name, model in properties_model.items()
        if isinstance(model, dict) and 'required' in (model.get('flags') or ())
    ]


def _validate_annotation(annotation):
    """Check ``annotation`` against its tag model.

    Returns:
        The properties model of the annotation's tag, or ``None`` when the
        tag defines none.

    Raises:
        Exception: if the tag is missing or unknown, or if a property that
            is flagged as required is absent from the annotation.
    """
    if 'tag' not in annotation:
        raise Exception('Annotation tag is missing')
    if annotation['tag'] not in stand_off_data['tags']:
        raise Exception('Unknown annotation tag: ' + annotation['tag'])
    tag_model = stand_off_data['tags'][annotation['tag']]
    # Always (re)bind the model so no value leaks in from a previous
    # annotation and the name is never unbound.
    properties_model = tag_model.get('properties')
    if not properties_model:
        return properties_model
    # Materialise the names: a lazy ``filter`` object is always truthy and
    # would make the "properties missing" check fire unconditionally.
    required = _required_property_names(properties_model)
    properties = annotation.get('properties')
    if required and properties is None:
        raise Exception('There are required properties but the "Properties" attribute is missing')  # noqa
    for name in required:
        if name not in properties:
            raise Exception('Required property is missing: ' + name)
    return properties_model


def _token_line(annotation):
    """Render a token annotation as one tab separated line."""
    properties = annotation['properties']
    return '{}\t{}\t{}\t{}\t{}\n'.format(
        escape(text[annotation['start']:annotation['end']]),
        escape(properties['pos']),
        escape(properties['lemma']),
        escape(properties['simple_pos']),
        escape(properties['ner'] if 'ner' in properties else 'None')
    )


def _structural_attributes(annotation, properties_model):
    """Render the properties of a non-token annotation as tag attributes.

    Empty values and properties unknown to the tag model are skipped.
    """
    properties = annotation.get('properties')
    if not properties or not properties_model:
        return ''
    rendered = []
    for name, value in properties.items():
        if not value or name not in properties_model:
            continue
        # The flags belong to the individual property, not to the model.
        flags = (properties_model[name] or {}).get('flags') or ()
        # NOTE(review): attribute values are not escaped, unlike the token
        # columns - confirm whether the consumer expects XML entities here.
        if 'multiple' in flags:
            joined = '|'.join(str(item) for item in value)
            rendered.append(' {}="|{}|"'.format(name, joined))
        else:
            rendered.append(' {}="{}"'.format(name, value))
    return ''.join(rendered)


def annotations_to_string(end=float('inf')):
    """Consume and render all pending annotations that start before ``end``.

    Annotations are popped from the front of
    ``stand_off_data['annotations']`` (presumably ordered by start offset -
    verify against the generator). Tokens become positional attribute lines
    (cwb's positional attributes); every other tag is rendered as an
    opening tag followed by, recursively, the annotations starting before
    its own end (cwb's structural attributes).

    Args:
        end: exclusive upper bound for the start offset of the annotations
            to consume.

    Returns:
        The rendered annotations as a single string.

    Raises:
        Exception: if an annotation is malformed (see the checks in
            ``_validate_annotation``).
    """
    annotations = stand_off_data['annotations']
    parts = []
    while annotations and annotations[0]['start'] < end:
        annotation = annotations.pop(0)
        properties_model = _validate_annotation(annotation)
        if annotation['tag'] == 'token':
            parts.append(_token_line(annotation))
        else:
            attributes = _structural_attributes(annotation, properties_model)
            parts.append('<' + annotation['tag'] + attributes + '>\n')
            parts.append(
                annotations_to_string(end=min(annotation['end'], end))
            )
            # NOTE(review): only a newline is written where the matching
            # closing tag would be expected - confirm against the upstream
            # template.
            parts.append('\n')
    return ''.join(parts)


def main():
    """Parse the command line, verify the text file and write the vrt file.

    Raises:
        Exception: if the md5 of the text file differs from the one
            recorded in the stand-off data.
    """
    global stand_off_data
    global text

    # Parse the given arguments
    parser = ArgumentParser(description='Create a vrt from JSON and txt')
    parser.add_argument('text', metavar='Path to txt file')
    parser.add_argument('stand_off_data', metavar='Path to JSON file')
    parser.add_argument('output', metavar='Path to vrt output file')
    args = parser.parse_args()

    with open(args.stand_off_data) as stand_off_data_file:
        stand_off_data = json.load(stand_off_data_file)

    # The annotations are only valid for the exact file they were made for.
    with open(args.text, "rb") as text_file:
        text_md5 = hashlib.md5()
        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
            text_md5.update(chunk)
        if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
            raise Exception('md5 not equal')

    with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa
        text = text_file.read()

    # NOTE(review): the constant pieces below are bare newlines / an empty
    # string; they look like a document wrapper that lost its markup -
    # confirm against the upstream template.
    vrt = ''.join([
        '\n',
        '\n',
        '\n',
        meta_to_string(),
        tags_to_string(),
        annotations_to_string(),
        '\n',
        '',
    ])

    # NOTE(review): the output is written with the platform default encoding.
    with open(args.output, 'w') as vrt_file:
        vrt_file.write(vrt)


if __name__ == '__main__':
    main()