diff --git a/packages/stand-off-data-py/stand_off_data/export.py b/packages/stand-off-data-py/stand_off_data/export.py
new file mode 100644
index 0000000..afb49f6
--- /dev/null
+++ b/packages/stand-off-data-py/stand_off_data/export.py
@@ -0,0 +1,120 @@
+from xml.sax.saxutils import escape
+
+
+def _start_tag(s_attr):
+    """Return the start tag line of a structural attribute."""
+    attributes = ''.join(
+        # The value sits in double quotes, so quotes are escaped as well
+        ' {}="{}"'.format(escape(prop.name),
+                          escape(prop.value, {'"': '&quot;'}))
+        for prop in s_attr.properties
+    )
+    return '<{}{}>\n'.format(escape(s_attr.name), attributes)
+
+
+def _end_tag(s_attr):
+    """Return the end tag line of a structural attribute."""
+    return '</{}>\n'.format(escape(s_attr.name))
+
+
+def _token_line(p_attr, text):
+    """Return the tab separated line of a positional attribute (token)."""
+    # Properties the token does not carry are written as 'None'
+    fields = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None}
+    for prop in p_attr.properties:
+        if prop.name in fields:
+            fields[prop.name] = escape(prop.value)
+    fields['word'] = escape(text[p_attr.start:p_attr.end])
+    return '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(**fields)
+
+
+class ExportMixin:
+    """Mixin that serializes stand-off annotations to CWB's .vrt format.
+
+    The host class must provide ``self.annotations``: an iterable of sortable
+    annotation objects exposing ``name``, ``start``, ``end`` and
+    ``properties`` (an iterable of objects with ``name`` and ``value``).
+    """
+
+    def to_vrt(self, text):
+        """Return ``text`` with its annotations as verticalized text (.vrt).
+
+        Annotations named ``token`` become positional attributes: one line
+        ``word<TAB>pos<TAB>lemma<TAB>simple_pos<TAB>ner`` each, with ``None``
+        for properties the token does not have. All other annotations become
+        structural attributes, i.e. XML-like start and end tags around the
+        tokens they cover.
+
+        Structural annotations that begin or end inside a token are widened
+        to the token boundary; this changes ``start``/``end`` of those
+        annotation objects in place.
+
+        :param text: the text the annotation offsets refer to
+        :return: the .vrt document as a string
+        :raises Exception: if two token annotations overlap
+        """
+        # Divide annotations into CWB's verticalized text format (.vrt) logic
+        p_attrs = []  # positional attributes
+        s_attrs = []  # structural attributes
+        for annotation in self.annotations:
+            if annotation.name == 'token':
+                p_attrs.append(annotation)
+            else:
+                s_attrs.append(annotation)
+        # Sort annotations, necessary for the next checks
+        # NOTE(review): assumes annotations compare by start offset - confirm
+        p_attrs.sort()
+        s_attrs.sort()
+        # Check for p_attr<->p_attr overlap. Spans are half-open (a token is
+        # text[start:end]), so a token may start where the previous one ends.
+        for p_attr, next_p_attr in zip(p_attrs, p_attrs[1:]):
+            if next_p_attr.start < p_attr.end:
+                raise Exception('Positional attribute overlaps another')
+        # Check for s_attr<->p_attr overlap
+        for s_attr in s_attrs:
+            for p_attr in p_attrs:
+                if p_attr.end <= s_attr.start:
+                    # p_attr lies completely before s_attr, keep looking
+                    continue
+                if p_attr.start >= s_attr.end:
+                    # No further checking needed (p_attrs are sorted)
+                    break
+                if p_attr.start < s_attr.start:
+                    # s_attr starts within p_attr, move it to p_attr's start
+                    s_attr.start = p_attr.start
+                if s_attr.end < p_attr.end:
+                    # s_attr ends within p_attr, move it to p_attr's end
+                    s_attr.end = p_attr.end
+        # NOTE(review): the output starts and ends with a bare newline,
+        # presumably a placeholder for a header and footer - confirm
+        lines = ['\n']
+        opened = []  # s_attrs with a written start tag and no end tag yet
+        pending = 0  # index of the next s_attr that still has to be opened
+        for p_attr in p_attrs:
+            # s_attr_ends: written before the starts, otherwise a region
+            # ending here would be closed after its successor was opened
+            for s_attr in reversed(opened):
+                if s_attr.end <= p_attr.start:
+                    lines.append(_end_tag(s_attr))
+            opened = [x for x in opened if x.end > p_attr.start]
+            # s_attr_starts
+            while (pending < len(s_attrs)
+                   and s_attrs[pending].start <= p_attr.start):
+                s_attr = s_attrs[pending]
+                pending += 1
+                lines.append(_start_tag(s_attr))
+                if s_attr.end <= p_attr.start:
+                    # Covers no token at all, close it right away
+                    lines.append(_end_tag(s_attr))
+                else:
+                    opened.append(s_attr)
+            lines.append(_token_line(p_attr, text))
+        # Close what is still open behind the last token
+        lines.extend(_end_tag(s_attr) for s_attr in reversed(opened))
+        # s_attrs that start behind the last token cover no token; keep them
+        # as empty regions instead of dropping them
+        for s_attr in s_attrs[pending:]:
+            lines.append(_start_tag(s_attr))
+            lines.append(_end_tag(s_attr))
+        lines.append('\n')
+        return ''.join(lines)
diff --git a/packages/stand-off-data-py/stand_off_data/models.py b/packages/stand-off-data-py/stand_off_data/models.py
index 0e426dd..e447fa1 100644
--- a/packages/stand-off-data-py/stand_off_data/models.py
+++ b/packages/stand-off-data-py/stand_off_data/models.py
@@ -1,4 +1,7 @@
-class StandOffData:
+from .export import ExportMixin
+
+
+class StandOffData(ExportMixin):
def __init__(self, attrs):
self.meta = attrs.get('meta', {})
self.lookup = {tag_definition.id: tag_definition for tag_definition in
@@ -69,7 +72,7 @@ class PropertyAnnotation:
self.tag_id = attrs['tag_id']
if self.property_id not in self.lookup[self.tag_id].properties:
raise Exception('Unknown property id: {}'.format(self.property_id))
- self.value = property['value']
+ self.value = attrs['value']
# TODO: Process attrs['possibleValues'] as self.labels (no id?)
@property
@@ -91,7 +94,7 @@ class TagDefinition:
@property
def required_properties(self):
- return {property.id: property for property in self.properties
+ return {property.id: property for property in self.properties.values()
if property.is_required}
diff --git a/packages/stand-off-data-py/stand_off_data/utils.py b/packages/stand-off-data-py/stand_off_data/utils.py
deleted file mode 100644
index 5a225eb..0000000
--- a/packages/stand-off-data-py/stand_off_data/utils.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from xml.sax.saxutils import escape
-
-
-def create_vrt(text, stand_off_data):
- # Devide annotations into CWB's verticalized text format (.vrt) logic
- p_attrs = [] # positional attributes
- s_attrs = [] # structural attributes
- for annotation in stand_off_data.annotations:
- if annotation.name == 'token':
- p_attrs.append(annotation)
- else:
- s_attrs.append(annotation)
- # Sort annotations, necessary for the next checks
- p_attrs.sort()
- s_attrs.sort()
- # Check for p_attr<->p_attr overlap
- for i, p_attr in enumerate(p_attrs[:-1]):
- next_p_attr = p_attrs[i + 1]
- # Check if first_p_attr starts/ends within second_p_attr
- if ((p_attr.start >= next_p_attr.start) and (p_attr.start <= next_p_attr.end) # noqa
- or (p_attr.end >= next_p_attr.start) and (p_attr.end <= next_p_attr.end)): # noqa
- raise Exception('Positional attribute overlaps another')
- # Check for s_attr<->p_attr overlap
- for i, s_attr in enumerate(s_attrs):
- for p_attr in p_attrs:
- # Check if s_attr starts within p_attr
- if s_attr.start > p_attr.start and s_attr.start < p_attr.end:
- # Change s_attr start to p_attr's start
- s_attrs[i].start = p_attr.start
- # Check if s_attr ends within p_attr
- if s_attr.end < p_attr.end and s_attr.end > p_attr.start:
- # Change s_attr end to p_attr's end
- s_attrs[i].end = p_attr.end
- # Check if s_attr starts/ends before/after p_attr
- if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start:
- # No further Checking needed (just because p_attrs are sorted)
- break
- s_attr_start_buffer = {}
- s_attr_end_buffer = {}
- for i, s_attr in enumerate(s_attrs):
- if s_attr_start_buffer[s_attr.start]:
- s_attr_start_buffer[s_attr.start].append(i)
- else:
- s_attr_start_buffer[s_attr.start] = [i]
- if s_attr_end_buffer[s_attr.end]:
- s_attr_end_buffer[s_attr.end].append(i)
- else:
- s_attr_end_buffer[s_attr.end] = [i]
- vrt = ''
- vrt += '\n'
- for p_attr in p_attrs:
- # s_attr_starts
- for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}: # noqa
- s_attrs = s_attr_start_buffer.pop(k)
- for s_attr in s_attrs:
- foo = ''
- for property in s_attr.properties:
- foo += ' {}="{}"'.format(escape(property.name),
- escape(property.value))
- vrt += '<{}{}>\n'.format(escape(s_attr.name), foo)
- for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}: # noqa
- s_attrs = s_attr_end_buffer.pop(k)
- for s_attr in s_attrs:
- vrt += '{}>\n'.format(escape(s_attr.name))
- # s_attr_ends
- foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None} # noqa
- for property in p_attrs.properties:
- if property.name == 'lemma':
- foo['lemma'] = escape(property.value)
- elif property.name == 'ner':
- foo['ner'] = escape(property.value)
- elif property.name == 'pos':
- foo['pos'] = escape(property.value)
- elif property.name == 'simple_pos':
- foo['simple_pos'] = escape(property.value)
- foo['word'] = escape(text[p_attr.start:p_attr.end])
- vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format(**foo)
- vrt += '\n'
diff --git a/vrt-creator b/vrt-creator
index e998903..4624c64 100755
--- a/vrt-creator
+++ b/vrt-creator
@@ -2,96 +2,12 @@
# coding=utf-8
from argparse import ArgumentParser
-from xml.sax.saxutils import escape
+from stand_off_data import StandOffData
import hashlib
import json
-# Two global ressources - Not very elegant but it works for now
-stand_off_data = None
-text = None
-
-
-def meta_to_string():
- string = ''
- string += '\n'.format( # noqa
- stand_off_data['meta']['generator']['name'],
- stand_off_data['meta']['generator']['version'],
- stand_off_data['meta']['generator']['arguments']['check_encoding'],
- stand_off_data['meta']['generator']['arguments']['language']
- )
- string += '\n'.format(
- stand_off_data['meta']['file']['encoding'],
- stand_off_data['meta']['file']['name'],
- stand_off_data['meta']['file']['md5']
- )
- return string
-
-
-def tags_to_string():
- return ''
-
-
-def annotations_to_string(end=float('inf')):
- string = ''
- while stand_off_data['annotations']:
- if stand_off_data['annotations'][0]['start'] >= end:
- break
- annotation = stand_off_data['annotations'].pop(0)
- #######################################################################
- # Check for malformed annotations #
- #######################################################################
- if 'tag' not in annotation:
- raise Exception('Annotation tag is missing')
-
- if annotation['tag'] not in stand_off_data['tags']:
- raise Exception('Unknown annotation tag: ' + annotation['tag'])
-
- tag_model = stand_off_data['tags'][annotation['tag']]
- if 'properties' in tag_model:
- properties_model = tag_model['properties']
- if properties_model is not None:
- required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model) # noqa
- if required_properties and annotation['properties'] is None:
- raise Exception('There are required properties but the "Properties" attribute is missing') # noqa
- for property in required_properties:
- if property not in annotation['properties']:
- raise Exception('Required property is missing: ' + property) # noqa
- #######################################################################
- # Process tokens ~ cwb's positional attributes #
- #######################################################################
- if annotation['tag'] == 'token':
- string += '{}\t{}\t{}\t{}\t{}\n'.format(
- escape(text[annotation['start']:annotation['end']]),
- escape(annotation['properties']['pos']),
- escape(annotation['properties']['lemma']),
- escape(annotation['properties']['simple_pos']),
- escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None') # noqa
- )
- #######################################################################
- # Process other tags ~ cwb's structural attributes #
- #######################################################################
- else:
- properties = ''
- if 'properties' in annotation and annotation['properties'] is not None: # noqa
- for property, value in annotation['properties'].items():
- if not value:
- continue
- if properties_model and property in properties_model:
- if 'flags' in properties_model and 'multiple' in properties_model['flags']: # noqa
- properties += ' {}="|{}|"'.format(property, '|'.join(value)) # noqa
- else:
- properties += ' {}="{}"'.format(property, value)
- string += '<' + annotation['tag'] + properties + '>\n'
- string += annotations_to_string(end=min(annotation['end'], end))
- string += '' + annotation['tag'] + '>\n'
- return string
-
-
def main():
- global stand_off_data
- global text
-
# Parse the given arguments
parser = ArgumentParser(description='Create a vrt from JSON and txt')
parser.add_argument('text', help='Path to txt file')
@@ -100,31 +16,15 @@ def main():
args = parser.parse_args()
with open(args.stand_off_data) as stand_of_data_file:
- stand_off_data = json.load(stand_of_data_file)
+ stand_off_data = StandOffData(json.load(stand_of_data_file))
with open(args.text, "rb") as text_file:
text_md5 = hashlib.md5()
for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): # noqa
text_md5.update(chunk)
- if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
+ if text_md5.hexdigest() != stand_off_data.meta['file']['md5']:
raise Exception('md5 not equal')
- with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file: # noqa
- text = text_file.read()
-
- vrt = ''
- vrt += '\n'
- vrt += '\n'
- vrt += '\n'
- vrt += meta_to_string()
- vrt += tags_to_string()
- vrt += annotations_to_string()
- vrt += '\n'
- vrt += ''
-
- with open(args.output, 'w') as vrt_file:
- vrt_file.write(vrt)
-
if __name__ == '__main__':
main()
diff --git a/vrt-creator.bak b/vrt-creator.bak
new file mode 100644
index 0000000..e998903
--- /dev/null
+++ b/vrt-creator.bak
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3.7
+# coding=utf-8
+
+from argparse import ArgumentParser
+from xml.sax.saxutils import escape
+import hashlib
+import json
+
+
+# Two global resources - Not very elegant but it works for now
+stand_off_data = None
+text = None
+
+
+def meta_to_string():  # builds a string from the global stand_off_data
+ string = ''
+ string += '\n'.format( # noqa
+ stand_off_data['meta']['generator']['name'],
+ stand_off_data['meta']['generator']['version'],
+ stand_off_data['meta']['generator']['arguments']['check_encoding'],
+ stand_off_data['meta']['generator']['arguments']['language']
+ )  # NOTE(review): no '{}' in the format string, the values are unused
+ string += '\n'.format(
+ stand_off_data['meta']['file']['encoding'],
+ stand_off_data['meta']['file']['name'],
+ stand_off_data['meta']['file']['md5']
+ )  # NOTE(review): same here, only the newline is appended
+ return string
+
+
+def tags_to_string():  # currently always returns an empty string
+ return ''
+
+
+def annotations_to_string(end=float('inf')):  # recursive, see below
+ string = ''
+ while stand_off_data['annotations']:  # consumes the global list
+ if stand_off_data['annotations'][0]['start'] >= end:
+ break  # next annotation starts at or after the given end
+ annotation = stand_off_data['annotations'].pop(0)
+ #######################################################################
+ # Check for malformed annotations #
+ #######################################################################
+ if 'tag' not in annotation:
+ raise Exception('Annotation tag is missing')
+
+ if annotation['tag'] not in stand_off_data['tags']:
+ raise Exception('Unknown annotation tag: ' + annotation['tag'])
+
+ tag_model = stand_off_data['tags'][annotation['tag']]
+ if 'properties' in tag_model:
+ properties_model = tag_model['properties']
+ if properties_model is not None:
+ required_properties = filter(lambda x: 'flags' in x and 'required' in x['flags'], properties_model) # noqa
+ if required_properties and annotation['properties'] is None:  # NOTE(review): a filter object is always truthy
+ raise Exception('There are required properties but the "Properties" attribute is missing') # noqa
+ for property in required_properties:
+ if property not in annotation['properties']:
+ raise Exception('Required property is missing: ' + property) # noqa
+ #######################################################################
+ # Process tokens ~ cwb's positional attributes #
+ #######################################################################
+ if annotation['tag'] == 'token':
+ string += '{}\t{}\t{}\t{}\t{}\n'.format(
+ escape(text[annotation['start']:annotation['end']]),
+ escape(annotation['properties']['pos']),
+ escape(annotation['properties']['lemma']),
+ escape(annotation['properties']['simple_pos']),
+ escape(annotation['properties']['ner'] if 'ner' in annotation['properties'] else 'None') # noqa
+ )
+ #######################################################################
+ # Process other tags ~ cwb's structural attributes #
+ #######################################################################
+ else:
+ properties = ''
+ if 'properties' in annotation and annotation['properties'] is not None: # noqa
+ for property, value in annotation['properties'].items():
+ if not value:
+ continue  # falsy values are not written as attributes
+ if properties_model and property in properties_model:  # NOTE(review): properties_model is only bound when the tag model has 'properties'
+ if 'flags' in properties_model and 'multiple' in properties_model['flags']: # noqa
+ properties += ' {}="|{}|"'.format(property, '|'.join(value)) # noqa
+ else:
+ properties += ' {}="{}"'.format(property, value)
+ string += '<' + annotation['tag'] + properties + '>\n'
+ string += annotations_to_string(end=min(annotation['end'], end))  # recurse: annotations starting before this end
+ string += '' + annotation['tag'] + '>\n'  # NOTE(review): writes 'tag>' without a leading '</' - confirm
+ return string
+
+
+def main():  # command line entry point: text + JSON in, vrt file out
+ global stand_off_data
+ global text
+
+ # Parse the given arguments
+ parser = ArgumentParser(description='Create a vrt from JSON and txt')
+ parser.add_argument('text', help='Path to txt file')
+ parser.add_argument('stand_off_data', help='Path to JSON file')
+ parser.add_argument('output', help='Path to vrt output file')
+ args = parser.parse_args()
+
+ with open(args.stand_off_data) as stand_of_data_file:
+ stand_off_data = json.load(stand_of_data_file)
+
+ with open(args.text, "rb") as text_file:  # hash the raw bytes in chunks
+ text_md5 = hashlib.md5()
+ for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): # noqa
+ text_md5.update(chunk)
+ if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
+ raise Exception('md5 not equal')  # text differs from the md5 in the JSON
+
+ with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file: # noqa
+ text = text_file.read()
+
+ vrt = ''  # NOTE(review): the bare '\n' and '' pieces below add no markup
+ vrt += '\n'
+ vrt += '\n'
+ vrt += '\n'
+ vrt += meta_to_string()
+ vrt += tags_to_string()
+ vrt += annotations_to_string()
+ vrt += '\n'
+ vrt += ''
+
+ with open(args.output, 'w') as vrt_file:  # write the result in one go
+ vrt_file.write(vrt)
+
+
+if __name__ == '__main__':
+ main()