Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git, synced 2025-11-04 04:12:45 +00:00
	Use new cqi version. No chunking needed anymore
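The commit replaces client-side request chunking with single full-size CQi requests and switches the id-keyed dicts to plain lists whose index doubles as the id. A minimal sketch contrasting the two retrieval patterns; `p_attr` is a cqi positional-attribute handle as used in the hunks below, and `chunk_size` was fixed at 10000 in the old code:

    def freqs_old(p_attr, chunk_size=10000):
        # Old: split the id list into chunks so no single CQi request
        # grew too large; collect an id -> frequency dict.
        ids = list(range(p_attr.lexicon_size))
        chunks = [ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)]
        freqs = {}
        for chunk in chunks:
            freqs.update(dict(zip(chunk, p_attr.freqs_by_ids(chunk))))
        return freqs

    def freqs_new(p_attr):
        # New: the updated cqi version answers one request for the
        # whole lexicon; the list index doubles as the id.
        return p_attr.freqs_by_ids(list(range(p_attr.lexicon_size)))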
@@ -121,10 +121,7 @@ class CQiNamespace(Namespace):
             socketio.sleep(3)
             retry_counter -= 1
         db.session.refresh(db_corpus)
-        cqi_client: CQiClient = CQiClient(
-            f'cqpserver_{db_corpus_id}',
-            timeout=float('inf')
-        )
+        cqi_client: CQiClient = CQiClient(f'cqpserver_{db_corpus_id}')
         session['cqi_over_sio'] = {
             'cqi_client': cqi_client,
             'cqi_client_lock': Lock(),
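With chunking gone, the connection setup above no longer needs to disable the socket timeout; presumably the new cqi version completes even whole-corpus requests within its default timeout. A side-by-side restatement of the change (the hostname is illustrative):

    from cqi import CQiClient

    # Old: chunked exchanges could stall long enough that the socket
    # timeout was disabled outright.
    cqi_client = CQiClient('cqpserver_example', timeout=float('inf'))

    # New: a single full-size request per lookup; the client's default
    # timeout is assumed to be sufficient.
    cqi_client = CQiClient('cqpserver_example')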
@@ -1,6 +1,7 @@
 from collections import Counter
 from cqi import CQiClient
 from cqi.models.corpora import Corpus as CQiCorpus
+from cqi.models.subcorpora import Subcorpus as CQiSubcorpus
 from cqi.models.attributes import (
     PositionalAttribute as CQiPositionalAttribute,
     StructuralAttribute as CQiStructuralAttribute
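The newly imported CQiSubcorpus types the subcorpus that materializes structural-attribute boundaries in the hunk below. Also used below is CWB-style sub-attribute naming of the form '<parent>_<key>', whose parent prefix is stripped to obtain the bare value name; a self-contained sketch of that slice, where 'text_title' is a hypothetical sub-attribute of the structural attribute 'text':

    s_attr_name = 'text'
    sub_s_attr_name = 'text_title'
    # Drop the parent name plus the '_' separator, keeping the bare key.
    value_name = sub_s_attr_name[(len(s_attr_name) + 1):]
    assert value_name == 'title'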
@@ -40,161 +41,132 @@ def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
 def ext_corpus_static_data(corpus: str) -> Dict:
     db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
     db_corpus: Corpus = Corpus.query.get(db_corpus_id)
-    cache_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
-    if os.path.exists(cache_file_path):
-        with open(cache_file_path, 'rb') as f:
+
+    static_data_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
+    if os.path.exists(static_data_file_path):
+        with open(static_data_file_path, 'rb') as f:
             return f.read()
 
     cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
     cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
-    cqi_p_attrs: Dict[str, CQiPositionalAttribute] = {
-        p_attr.name: p_attr
-        for p_attr in cqi_corpus.positional_attributes.list()
-    }
-    cqi_s_attrs: Dict[str, CQiStructuralAttribute] = {
-        s_attr.name: s_attr
-        for s_attr in cqi_corpus.structural_attributes.list()
-    }
-    static_corpus_data = {
+    cqi_p_attrs: List[CQiPositionalAttribute] = cqi_corpus.positional_attributes.list()
+    cqi_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list()
+
+    static_data = {
         'corpus': {
             'bounds': [0, cqi_corpus.size - 1],
             'counts': {
                 'token': cqi_corpus.size
             },
             'freqs': {}
         },
         'p_attrs': {},
         's_attrs': {},
         'values': {'p_attrs': {}, 's_attrs': {}}
     }
-    for p_attr in cqi_p_attrs.values():
-        static_corpus_data['corpus']['freqs'][p_attr.name] = {}
-        chunk_size = 10000
-        p_attr_id_list = list(range(p_attr.lexicon_size))
-        chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
-        for chunk in chunks:
-            # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['corpus']['freqs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
-            )
-        del chunks
-        static_corpus_data['p_attrs'][p_attr.name] = {}
-        cpos_list = list(range(cqi_corpus.size))
-        chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
-        for chunk in chunks:
-            # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['p_attrs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
-            )
-        del chunks
-        static_corpus_data['values']['p_attrs'][p_attr.name] = {}
-        p_attr_id_list = list(range(p_attr.lexicon_size))
-        chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
-        for chunk in chunks:
-            # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['values']['p_attrs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.values_by_ids(chunk)))
-            )
-        del chunks
-    for s_attr in cqi_s_attrs.values():
+
+    for p_attr in cqi_p_attrs:
+        print(f'corpus.freqs.{p_attr.name}')
+        static_data['corpus']['freqs'][p_attr.name] = []
+        p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
+        static_data['corpus']['freqs'][p_attr.name].extend(p_attr.freqs_by_ids(p_attr_id_list))
+        del p_attr_id_list
+
+        print(f'p_attrs.{p_attr.name}')
+        static_data['p_attrs'][p_attr.name] = []
+        cpos_list: List[int] = list(range(cqi_corpus.size))
+        static_data['p_attrs'][p_attr.name].extend(p_attr.ids_by_cpos(cpos_list))
+        del cpos_list
+
+        print(f'values.p_attrs.{p_attr.name}')
+        static_data['values']['p_attrs'][p_attr.name] = []
+        p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
+        static_data['values']['p_attrs'][p_attr.name].extend(p_attr.values_by_ids(p_attr_id_list))
+        del p_attr_id_list
+
+    for s_attr in cqi_s_attrs:
         if s_attr.has_values:
             continue
-        static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
-        static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
-        static_corpus_data['values']['s_attrs'][s_attr.name] = {}
-        ##########################################################################
-        # A faster way to get cpos boundaries for smaller s_attrs                #
-        ##########################################################################
-        # if s_attr.name in ['s', 'ent']:
-        #     cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
-        #     cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
-        #     first_match = 0
-        #     last_match = cqi_subcorpus.size - 1
-        #     match_boundaries = zip(
-        #         range(first_match, last_match + 1),
-        #         cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
-        #         cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
-        #     )
-        #     for id, lbound, rbound in match_boundaries:
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
-        #     cqi_subcorpus.drop()
+
+        static_data['s_attrs'][s_attr.name] = {'lexicon': [], 'values': None}
+
+        if s_attr.name in ['s', 'ent']:
+            ##############################################################
+            # A faster way to get cpos boundaries for smaller s_attrs    #
+            # Note: Needs more testing, don't use it in production       #
+            ##############################################################
+            cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
+            cqi_subcorpus: CQiSubcorpus = cqi_corpus.subcorpora.get('Last')
+            first_match: int = 0
+            last_match: int = cqi_subcorpus.size - 1
+            match_boundaries = zip(
+                range(first_match, last_match + 1),
+                cqi_subcorpus.dump(
+                    cqi_subcorpus.fields['match'],
+                    first_match,
+                    last_match
+                ),
+                cqi_subcorpus.dump(
+                    cqi_subcorpus.fields['matchend'],
+                    first_match,
+                    last_match
+                )
+            )
+            cqi_subcorpus.drop()
+            del cqi_subcorpus, first_match, last_match
+            for id, lbound, rbound in match_boundaries:
+                static_data['s_attrs'][s_attr.name]['lexicon'].append({})
+                print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+                static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+            del match_boundaries
+
+        if s_attr.name != 'text':
+            continue
+
         for id in range(0, s_attr.size):
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
-                'bounds': None,
-                'counts': None,
-                'freqs': None
-            }
-            if s_attr.name != 'text':
-                continue
+            static_data['s_attrs'][s_attr.name]['lexicon'].append({})
             # This is a very slow operation, thats why we only use it for
             # the text attribute
             lbound, rbound = s_attr.cpos_by_id(id)
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
-            cpos_list = list(range(lbound, rbound + 1))
-            chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
-            del cpos_list
-            ent_ids = set()
-            for chunk in chunks:
-                # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
-            del ent_ids
-            s_ids = set()
-            for chunk in chunks:
-                # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
-            del s_ids
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
-            for p_attr in cqi_p_attrs.values():
-                p_attr_ids = []
-                for chunk in chunks:
-                    # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                    p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
-            del chunks
-        sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
-        s_attr_value_names: List[str] = [
+            print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+            static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+            static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
+            cpos_list: List[int] = list(range(lbound, rbound + 1))
+            for p_attr in cqi_p_attrs:
+                p_attr_ids: List[int] = []
+                p_attr_ids.extend(p_attr.ids_by_cpos(cpos_list))
+                print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs.{p_attr.name}')
+                static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
+                del p_attr_ids
+            del cpos_list
+
+        sub_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
+        print(f's_attrs.{s_attr.name}.values')
+        static_data['s_attrs'][s_attr.name]['values'] = [
             sub_s_attr.name[(len(s_attr.name) + 1):]
             for sub_s_attr in sub_s_attrs
         ]
-        s_attr_id_list = list(range(s_attr.size))
-        chunks = [s_attr_id_list[i:i+chunk_size] for i in range(0, len(s_attr_id_list), chunk_size)]
-        del s_attr_id_list
-        sub_s_attr_values = []
+        s_attr_id_list: List[int] = list(range(s_attr.size))
+        sub_s_attr_values: List[str] = []
         for sub_s_attr in sub_s_attrs:
             tmp = []
-            for chunk in chunks:
-                tmp.extend(sub_s_attr.values_by_ids(chunk))
+            tmp.extend(sub_s_attr.values_by_ids(s_attr_id_list))
             sub_s_attr_values.append(tmp)
             del tmp
-        del chunks
-        # print(f's_attrs.{s_attr.name}.values')
-        static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
-        # print(f'values.s_attrs.{s_attr.name}')
-        static_corpus_data['values']['s_attrs'][s_attr.name] = {
-            s_attr_id: {
-                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
+        del s_attr_id_list
+        print(f'values.s_attrs.{s_attr.name}')
+        static_data['values']['s_attrs'][s_attr.name] = [
+            {
+                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id]
                 for s_attr_value_name_idx, s_attr_value_name in enumerate(
-                    static_corpus_data['s_attrs'][s_attr.name]['values']
+                    static_data['s_attrs'][s_attr.name]['values']
                 )
-            } for s_attr_id_idx, s_attr_id in enumerate(range(0, s_attr.size))
-        }
+            } for s_attr_id in range(0, s_attr.size)
+        ]
         del sub_s_attr_values
-    with gzip.open(cache_file_path, 'wt') as f:
-        json.dump(static_corpus_data, f)
-    del static_corpus_data
-    with open(cache_file_path, 'rb') as f:
+
+    print('Saving static data to file')
+    with gzip.open(static_data_file_path, 'wt') as f:
+        json.dump(static_data, f)
+    del static_data
+    print('Sending static data to client')
+    with open(static_data_file_path, 'rb') as f:
        return f.read()
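On a cache hit or after a rebuild, the function returns the gzip-compressed JSON bytes straight from disk, leaving decompression to the receiver. A stdlib-only sketch of decoding such a payload (file name as in the diff; the keys follow the static_data skeleton above):

    import gzip
    import json

    with open('static.json.gz', 'rb') as f:
        raw = f.read()  # compressed bytes, exactly as the function returns them
    static_data = json.loads(gzip.decompress(raw))
    print(static_data['corpus']['counts']['token'])  # total token count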