mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-03 20:02:47 +00:00 
			
		
		
		
	normalize vrt on build
This commit is contained in:
		@@ -4,17 +4,18 @@ from app.models import User, Corpus, CorpusFile
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
import shutil
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def convert(json_db_file, data_dir):
    """Import every confirmed user found in a JSON database export.

    Args:
        json_db_file: Path of the JSON file to read. Its top level is
            iterated and every item is expected to provide the keys
            'confirmed', 'username' and 'id'.
        data_dir: Directory that holds one sub directory per user, named
            after the user's id.

    Unconfirmed users are skipped with a log message. The database session
    is committed after each converted user.
    """
    with open(json_db_file, 'r') as f:
        json_db = json.load(f)

    for json_user in json_db:
        if not json_user['confirmed']:
            current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
            continue
        # The id is converted explicitly: os.path.join() raises a TypeError
        # for non-string path components.
        user_dir = os.path.join(data_dir, str(json_user['id']))
        convert_user(json_user, user_dir)
        db.session.commit()
 | 
			
		||||
 | 
			
		||||
@@ -42,7 +43,7 @@ def convert_user(json_user, user_dir):
 | 
			
		||||
        if not json_corpus['files'].values():
 | 
			
		||||
            current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
 | 
			
		||||
            continue
 | 
			
		||||
        corpus_dir = os.path.join(user_dir, 'corpora', json_corpus['id'])
 | 
			
		||||
        corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
 | 
			
		||||
        convert_corpus(json_corpus, user, corpus_dir)
 | 
			
		||||
    current_app.logger.info('Done')
 | 
			
		||||
 | 
			
		||||
@@ -66,12 +67,11 @@ def convert_corpus(json_corpus, user, corpus_dir):
 | 
			
		||||
        db.session.rollback()
 | 
			
		||||
        raise Exception('Internal Server Error')
 | 
			
		||||
    for json_corpus_file in json_corpus['files'].values():
 | 
			
		||||
        corpus_file_dir = os.path.join(corpus_dir, 'files', json_corpus_file['id'])
 | 
			
		||||
        convert_corpus_file(json_corpus_file, corpus, corpus_file_dir)
 | 
			
		||||
        convert_corpus_file(json_corpus_file, corpus, corpus_dir)
 | 
			
		||||
    current_app.logger.info('Done')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
 | 
			
		||||
def convert_corpus_file(json_corpus_file, corpus, corpus_dir):
 | 
			
		||||
    current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
 | 
			
		||||
    corpus_file = CorpusFile(
 | 
			
		||||
        corpus=corpus,
 | 
			
		||||
@@ -94,122 +94,15 @@ def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
 | 
			
		||||
    db.session.flush(objects=[corpus_file])
 | 
			
		||||
    db.session.refresh(corpus_file)
 | 
			
		||||
    try:
 | 
			
		||||
        convert_vrt(
 | 
			
		||||
            os.path.join(corpus_file_dir, json_corpus_file['filename']),
 | 
			
		||||
        shutil.copy2(
 | 
			
		||||
            os.path.join(corpus_dir, json_corpus_file['filename']),
 | 
			
		||||
            corpus_file.path
 | 
			
		||||
        )
 | 
			
		||||
    except OSError as e:
 | 
			
		||||
        current_app.logger.error(e)
 | 
			
		||||
        db.session.rollback()
 | 
			
		||||
        raise Exception('Internal Server Error')
 | 
			
		||||
    except:
 | 
			
		||||
        current_app.logger.warning(
 | 
			
		||||
            'Can not convert corpus file: '
 | 
			
		||||
            f'{os.path.join(corpus_dir, json_corpus_file["filename"])}'
 | 
			
		||||
            ' -> '
 | 
			
		||||
            f'{corpus_file.path}'
 | 
			
		||||
        )
 | 
			
		||||
    current_app.logger.info('Done')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def convert_vrt(input_file, output_file):
    """Rewrite a VRT file into the 'word, pos, lemma, simple_pos' layout.

    The whole input file is read, its positional attribute order is
    detected, and a rewritten copy is stored in ``output_file``:

    * ``<corpus``, ``</corpus`` and ``<nlp`` lines are dropped.
    * ``<text ...>`` and ``<s ...>`` lines lose their attributes.
    * Every other tag line is copied unchanged.
    * Token lines are reordered to word, pos, lemma, simple_pos.
    * If the input carries a ner column and no ``<ent`` tag lines, the ner
      column is turned into ``<ent type="...">`` / ``</ent>`` tag lines.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the rewritten VRT is written to.

    Raises:
        Exception: 'Can not handle format' if the attribute order can not
            be detected.
        OSError: If one of the files can not be opened.
    """
    simple_pos_labels = frozenset((
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
        'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
        'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
        'VERB', 'X'
    ))

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # The order is guessed from the number of attributes and the
        # position of the simple_pos attribute of the first token line that
        # allows a decision.
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            if len(pos_attrs) == 4:
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif len(pos_attrs) == 5:
                if pos_attrs[2] in simple_pos_labels:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    has_ent_as_s_attr = any(
        line.startswith('<ent') for line in input_vrt_lines
    )

    # Fail with the intended message before the order is formatted below;
    # joining None would raise an unrelated TypeError instead.
    if pos_attr_order is None:
        raise Exception('Can not handle format')

    print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    # Input column indexes in output order (word, pos, lemma, simple_pos).
    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        column_indexes = (0, 3, 1, 2)
    else:
        column_indexes = (0, 1, 2, 3)

    # Entities can only be derived from a ner column; the four column
    # layout has none, so indexing it would raise an IndexError.
    derive_ents = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # A derived entity never spans a tag line.
            if current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            if line.startswith(('<corpus', '</corpus', '<nlp')):
                continue
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            else:
                output_parts.append(line)
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ents:
            ner = pos_attrs[4]
            if ner.lower() in ('null', 'none'):
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            elif current_ent != ner:
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                output_parts.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_parts.append(
            '\t'.join(pos_attrs[i] for i in column_indexes) + '\n'
        )

    # Close an entity that is still open when the input ends on a token.
    if current_ent is not None:
        output_parts.append('</ent>\n')

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										117
									
								
								app/converters/vrt.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										117
									
								
								app/converters/vrt.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,117 @@
 | 
			
		||||
from flask import current_app
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def normalize_vrt_file(input_file, output_file):
    """Write a normalized copy of a VRT file.

    The whole input file is read, its positional attribute order is
    detected, and a normalized copy is stored in ``output_file``:

    * Only ``<text``, ``</text>``, ``<s``, ``</s>``, ``<ent`` and ``</ent>``
      tag lines are kept; ``<text ...>`` and ``<s ...>`` lose their
      attributes, every other tag line is dropped.
    * A tag whose definition spans several lines is consumed up to and
      including the line that ends with ``>``.
    * Token lines are reordered to word, pos, lemma, simple_pos.
    * If the input carries a ner column and no ``<ent`` tag lines, the ner
      column is turned into ``<ent type="...">`` / ``</ent>`` tag lines.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the normalized VRT is written to.

    Raises:
        Exception: 'Can not handle format' if the attribute order can not
            be detected.
        OSError: If one of the files can not be opened.
    """
    simple_pos_labels = frozenset((
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
        'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
        'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
        'VERB', 'X'
    ))

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # The order is guessed from the number of attributes and the
        # position of the simple_pos attribute of the first token line that
        # allows a decision.
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            if len(pos_attrs) == 4:
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif len(pos_attrs) == 5:
                if pos_attrs[2] in simple_pos_labels:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    current_app.logger.info(f'Converting {input_file}...')

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    has_ent_as_s_attr = any(
        line.startswith('<ent') for line in input_vrt_lines
    )

    # Fail with the intended message before the order is formatted below;
    # joining None would raise an unrelated TypeError instead.
    if pos_attr_order is None:
        raise Exception('Can not handle format')

    current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    # Input column indexes in output order (word, pos, lemma, simple_pos).
    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        column_indexes = (0, 3, 1, 2)
    else:
        column_indexes = (0, 1, 2, 3)

    # Entities can only be derived from a ner column; the four column
    # layout has none, so indexing it would raise an IndexError.
    derive_ents = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    multi_line_tag_definition = False
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # A derived entity never spans a tag line.
            if current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            # A tag line that is not closed continues on the next line(s);
            # a complete tag line ends any pending multi line state.
            multi_line_tag_definition = not line.rstrip().endswith('>')
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('</text>'):
                output_parts.append('</text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            elif line.startswith('</s>'):
                output_parts.append('</s>\n')
            elif line.startswith(('<ent', '</ent>')):
                output_parts.append(line)
            continue
        if multi_line_tag_definition:
            # Continuation of a tag definition: never a token line. The
            # line that ends with '>' completes the tag.
            if line.rstrip().endswith('>'):
                multi_line_tag_definition = False
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ents:
            ner = pos_attrs[4]
            if ner.lower() in ('null', 'none'):
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            elif current_ent != ner:
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                output_parts.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_parts.append(
            '\t'.join(pos_attrs[i] for i in column_indexes) + '\n'
        )

    # Close an entity that is still open when the input ends on a token.
    if current_ent is not None:
        output_parts.append('</ent>\n')

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))
 | 
			
		||||
@@ -1,3 +1,4 @@
 | 
			
		||||
from app.converters.vrt import normalize_vrt_file
 | 
			
		||||
from datetime import datetime, timedelta
 | 
			
		||||
from enum import IntEnum
 | 
			
		||||
from flask import current_app, url_for
 | 
			
		||||
@@ -854,7 +855,13 @@ class Corpus(HashidMixin, db.Model):
 | 
			
		||||
    def build(self):
 | 
			
		||||
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
 | 
			
		||||
        for corpus_file in self.files:
 | 
			
		||||
            element_tree = ET.parse(corpus_file.path)
 | 
			
		||||
            normalized_vrt_path = os.path.join(self.path, 'cwb', f'{corpus_file.id}.norm.vrt')
 | 
			
		||||
            try:
 | 
			
		||||
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
 | 
			
		||||
            except:
 | 
			
		||||
                self.status = CorpusStatus.FAILED
 | 
			
		||||
                return
 | 
			
		||||
            element_tree = ET.parse(normalized_vrt_path)
 | 
			
		||||
            text_element = element_tree.getroot()
 | 
			
		||||
            text_element.set('address', corpus_file.address or 'NULL')
 | 
			
		||||
            text_element.set('author', corpus_file.author)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user