normalize vrt on build

2026-06-26 22:30:29 +00:00 · 2022-04-12 16:11:40 +02:00
parent 99ddd2e3dd
commit 4146e3789b
3 changed files with 140 additions and 123 deletions
@@ -4,6 +4,7 @@ from app.models import User, Corpus, CorpusFile
 from datetime import datetime
 import json
 import os
+import shutil


 def convert(json_db_file, data_dir):
@@ -14,7 +15,7 @@ def convert(json_db_file, data_dir):
        if not json_user['confirmed']:
            current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
            continue
-        user_dir = os.path.join(data_dir, json_user['id'])
+        user_dir = os.path.join(data_dir, str(json_user['id']))
        convert_user(json_user, user_dir)
        db.session.commit()

@@ -42,7 +43,7 @@ def convert_user(json_user, user_dir):
        if not json_corpus['files'].values():
            current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
            continue
-        corpus_dir = os.path.join(user_dir, 'corpora', json_corpus['id'])
+        corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
        convert_corpus(json_corpus, user, corpus_dir)
    current_app.logger.info('Done')

@@ -66,12 +67,11 @@ def convert_corpus(json_corpus, user, corpus_dir):
        db.session.rollback()
        raise Exception('Internal Server Error')
    for json_corpus_file in json_corpus['files'].values():
-        corpus_file_dir = os.path.join(corpus_dir, 'files', json_corpus_file['id'])
-        convert_corpus_file(json_corpus_file, corpus, corpus_file_dir)
+        convert_corpus_file(json_corpus_file, corpus, corpus_dir)
    current_app.logger.info('Done')


-def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
+def convert_corpus_file(json_corpus_file, corpus, corpus_dir):
    current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
    corpus_file = CorpusFile(
        corpus=corpus,
@@ -94,122 +94,15 @@ def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
    db.session.flush(objects=[corpus_file])
    db.session.refresh(corpus_file)
    try:
-        convert_vrt(
-            os.path.join(corpus_file_dir, json_corpus_file['filename']),
+        shutil.copy2(
+            os.path.join(corpus_dir, json_corpus_file['filename']),
            corpus_file.path
        )
-    except OSError as e:
-        current_app.logger.error(e)
-        db.session.rollback()
-        raise Exception('Internal Server Error')
+    except:
+        current_app.logger.warning(
+            'Can not convert corpus file: '
+            f'{os.path.join(corpus_dir, json_corpus_file["filename"])}'
+            ' -> '
+            f'{corpus_file.path}'
+        )
    current_app.logger.info('Done')
-
-
-def convert_vrt(input_file, output_file):
-    def check_pos_attribute_order(vrt_lines):
-        # The following orders are possible:
-        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
-        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
-        # since 27.01.2022: 'word,pos,lemma,simple_pos'
-        # This Function tries to find out which order we have by looking at the
-        # number of attributes and the position of the simple_pos attribute
-        SIMPLE_POS_LABELS = [
-            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
-            'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
-            'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
-            'VERB', 'X'
-        ]
-        for line in vrt_lines:
-            if line.startswith('<'):
-                continue
-            pos_attrs = line.rstrip('\n').split('\t')
-            num_pos_attrs = len(pos_attrs)
-            if num_pos_attrs == 4:
-                if pos_attrs[3] in SIMPLE_POS_LABELS:
-                    return ['word', 'pos', 'lemma', 'simple_pos']
-                continue
-            elif num_pos_attrs == 5:
-                if pos_attrs[2] in SIMPLE_POS_LABELS:
-                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
-                elif pos_attrs[3] in SIMPLE_POS_LABELS:
-                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
-                continue
-        return None
-
-
-    def check_has_ent_as_s_attr(vrt_lines):
-        for line in vrt_lines:
-            if line.startswith('<ent'):
-                return True
-        return False
-
-
-    def pos_attrs_to_string_1(pos_attrs):
-        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'
-
-
-    def pos_attrs_to_string_2(pos_attrs):
-        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'
-
-
-    with open(input_file) as f:
-        input_vrt_lines = f.readlines()
-
-    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
-    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)
-
-    print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
-    print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')
-
-    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
-        pos_attrs_to_string_function = pos_attrs_to_string_1
-    elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']:
-        pos_attrs_to_string_function = pos_attrs_to_string_2
-    elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']:
-        pos_attrs_to_string_function = pos_attrs_to_string_2
-    else:
-        raise Exception('Can not handle format')
-
-    current_ent = None
-    output_vrt = ''
-    for line in input_vrt_lines:
-        if line.strip() == '':
-            continue
-        if line.startswith('<'):
-            if not has_ent_as_s_attr:
-                if current_ent is not None:
-                    output_vrt += '</ent>\n'
-                    current_ent = None
-            if (
-                line.startswith('<corpus')
-                or line.startswith('</corpus')
-                or line.startswith('<nlp')
-            ):
-                continue
-            elif line.startswith('<text'):
-                output_vrt += '<text>\n'
-                continue
-            elif line.startswith('<s'):
-                output_vrt += '<s>\n'
-                continue
-            output_vrt += line
-            continue
-        pos_attrs = line.rstrip('\n').split('\t')
-        if not has_ent_as_s_attr:
-            if pos_attrs[4].lower() in ['null', 'none']:
-                if current_ent:
-                    output_vrt += '</ent>\n'
-                    current_ent = None
-            else:
-                if current_ent is None:
-                    output_vrt += f'<ent type="{pos_attrs[4]}">\n'
-                    current_ent = pos_attrs[4]
-                elif current_ent != pos_attrs[4]:
-                    output_vrt += '</ent>\n'
-                    current_ent = None
-                    output_vrt += f'<ent type="{pos_attrs[4]}">\n'
-                    current_ent = pos_attrs[4]
-        output_vrt += pos_attrs_to_string_function(pos_attrs)
-
-    with open(output_file, 'w') as f:
-        f.write(output_vrt)
@@ -0,0 +1,117 @@
+from flask import current_app
+
+
+def normalize_vrt_file(input_file, output_file):
+    """Rewrite a VRT file into the normalized layout and save it.
+
+    The order of the positional attributes is detected from the token
+    lines and every token is written as
+    ``word<TAB>pos<TAB>lemma<TAB>simple_pos``. Of the structural tags only
+    ``text``, ``s`` and ``ent`` are kept: ``text`` and ``s`` are written
+    without attributes, ``ent`` lines are copied unchanged and every other
+    tag is dropped. If the input has no ``<ent`` tags but carries a ``ner``
+    column, ``<ent type="...">`` regions are rebuilt from that column.
+
+    Args:
+        input_file: Path of the VRT file to read.
+        output_file: Path the normalized VRT is written to.
+
+    Raises:
+        Exception: If the positional attribute order can not be detected.
+        OSError: If ``input_file`` can not be read or ``output_file`` can
+            not be written.
+    """
+    # Labels used to recognise which column holds the simple_pos attribute.
+    simple_pos_labels = frozenset((
+        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
+        'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
+        'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
+        'VERB', 'X',
+    ))
+
+    def check_pos_attribute_order(vrt_lines):
+        # The following orders are possible:
+        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
+        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
+        # since 27.01.2022: 'word,pos,lemma,simple_pos'
+        # The order is guessed from the number of attributes and the
+        # position of the simple_pos attribute in the first token line that
+        # allows a decision. Returns None if no line allows one.
+        for line in vrt_lines:
+            if line.startswith('<'):
+                continue
+            pos_attrs = line.rstrip('\n').split('\t')
+            if len(pos_attrs) == 4:
+                if pos_attrs[3] in simple_pos_labels:
+                    return ['word', 'pos', 'lemma', 'simple_pos']
+            elif len(pos_attrs) == 5:
+                if pos_attrs[2] in simple_pos_labels:
+                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
+                if pos_attrs[3] in simple_pos_labels:
+                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
+        return None
+
+    current_app.logger.info(f'Converting {input_file}...')
+
+    # NOTE(review): assumes VRT files are UTF-8 encoded - confirm. Being
+    # explicit keeps the result independent of the process locale.
+    with open(input_file, encoding='utf-8') as f:
+        input_vrt_lines = f.readlines()
+
+    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
+    if pos_attr_order is None:
+        # Fail with the intended message before the order is joined for
+        # logging below, which would raise a TypeError on None.
+        raise Exception('Can not handle format')
+    has_ent_as_s_attr = any(
+        line.startswith('<ent') for line in input_vrt_lines
+    )
+
+    current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
+    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')
+
+    # Input column indexes in output order (word, pos, lemma, simple_pos).
+    if pos_attr_order[1] == 'lemma':
+        output_columns = (0, 3, 1, 2)
+    else:
+        output_columns = (0, 1, 2, 3)
+    # Entities can only be rebuilt when a ner column actually exists; the
+    # four column layout has none.
+    build_ents_from_ner = not has_ent_as_s_attr and 'ner' in pos_attr_order
+
+    current_ent = None
+    multi_line_tag_definition = False
+    output_vrt_lines = []
+    for line in input_vrt_lines:
+        if line.strip() == '':
+            continue
+        if line.startswith('<'):
+            # A structural tag ends any entity rebuilt from the ner column.
+            if current_ent is not None:
+                output_vrt_lines.append('</ent>\n')
+                current_ent = None
+            # A tag that is not closed on its own line continues on the
+            # following lines.
+            multi_line_tag_definition = not line.rstrip().endswith('>')
+            if line.startswith('<text'):
+                output_vrt_lines.append('<text>\n')
+            elif line.startswith('</text>'):
+                output_vrt_lines.append('</text>\n')
+            elif line.startswith('<s'):
+                output_vrt_lines.append('<s>\n')
+            elif line.startswith('</s>'):
+                output_vrt_lines.append('</s>\n')
+            elif line.startswith(('<ent', '</ent>')):
+                output_vrt_lines.append(line)
+            continue
+        if multi_line_tag_definition:
+            # Skip every continuation line of a multi line tag, not only
+            # the one that closes it; they are not token lines.
+            if line.rstrip().endswith('>'):
+                multi_line_tag_definition = False
+            continue
+        pos_attrs = line.rstrip('\n').split('\t')
+        if build_ents_from_ner:
+            ner = pos_attrs[4]
+            if ner.lower() in ('null', 'none'):
+                if current_ent is not None:
+                    output_vrt_lines.append('</ent>\n')
+                    current_ent = None
+            elif ner != current_ent:
+                if current_ent is not None:
+                    output_vrt_lines.append('</ent>\n')
+                output_vrt_lines.append(f'<ent type="{ner}">\n')
+                current_ent = ner
+        output_vrt_lines.append(
+            '\t'.join(pos_attrs[i] for i in output_columns) + '\n'
+        )
+
+    # Do not leave a rebuilt entity open if the input ends on a token line.
+    if current_ent is not None:
+        output_vrt_lines.append('</ent>\n')
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.writelines(output_vrt_lines)
@@ -1,3 +1,4 @@
+from app.converters.vrt import normalize_vrt_file
 from datetime import datetime, timedelta
 from enum import IntEnum
 from flask import current_app, url_for
@@ -854,7 +855,13 @@ class Corpus(HashidMixin, db.Model):
    def build(self):
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
-            element_tree = ET.parse(corpus_file.path)
+            normalized_vrt_path = os.path.join(self.path, 'cwb', f'{corpus_file.id}.norm.vrt')
+            try:
+                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
+            except:
+                self.status = CorpusStatus.FAILED
+                return
+            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            text_element.set('address', corpus_file.address or 'NULL')
            text_element.set('author', corpus_file.author)