Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
	remove debug messages and increase chunk size in cqi
@@ -12,7 +12,6 @@ import gzip
 import json
 import math
 import os
-import shutil
 from app import db
 from app.models import Corpus
 from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
@@ -67,12 +66,12 @@ def ext_corpus_static_data(corpus: str) -> Dict:
     }
     for p_attr in cqi_p_attrs.values():
         static_corpus_data['corpus']['freqs'][p_attr.name] = {}
-        chunk_size = 5000
+        chunk_size = 10000
         p_attr_id_list = list(range(p_attr.lexicon_size))
         chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
         del p_attr_id_list
         for chunk in chunks:
-            print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+            # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
             static_corpus_data['corpus']['freqs'][p_attr.name].update(
                 dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
             )
@@ -82,7 +81,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
         chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
         del cpos_list
         for chunk in chunks:
-            print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+            # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
             static_corpus_data['p_attrs'][p_attr.name].update(
                 dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
             )
@@ -92,7 +91,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
         chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
         del p_attr_id_list
         for chunk in chunks:
-            print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+            # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
             static_corpus_data['values']['p_attrs'][p_attr.name].update(
                 dict(zip(chunk, p_attr.values_by_ids(chunk)))
             )
@@ -123,7 +122,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
         #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
         #     cqi_subcorpus.drop()
         for id in range(0, s_attr.size):
-            print(f's_attrs.{s_attr.name}.lexicon.{id}')
+            # print(f's_attrs.{s_attr.name}.lexicon.{id}')
             static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
                 'bounds': None,
                 'counts': None,
@@ -132,9 +131,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
             if s_attr.name != 'text':
                 continue
             lbound, rbound = s_attr.cpos_by_id(id)
-            print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+            # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
             static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-            print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
+            # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
             static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
             static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
             cpos_list = list(range(lbound, rbound + 1))
@@ -142,22 +141,22 @@ def ext_corpus_static_data(corpus: str) -> Dict:
             del cpos_list
             ent_ids = set()
             for chunk in chunks:
-                print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
+                # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
                 ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
             static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
             del ent_ids
             s_ids = set()
             for chunk in chunks:
-                print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
+                # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
                 s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
             static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
             del s_ids
-            print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
+            # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
             static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
             for p_attr in cqi_p_attrs.values():
                 p_attr_ids = []
                 for chunk in chunks:
-                    print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
+                    # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
                     p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
                 static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
                 del p_attr_ids
@@ -178,9 +177,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
             sub_s_attr_values.append(tmp)
             del tmp
         del chunks
-        print(f's_attrs.{s_attr.name}.values')
+        # print(f's_attrs.{s_attr.name}.values')
         static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
-        print(f'values.s_attrs.{s_attr.name}')
+        # print(f'values.s_attrs.{s_attr.name}')
         static_corpus_data['values']['s_attrs'][s_attr.name] = {
             s_attr_id: {
                 s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
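
The change itself is small: the debug prints are commented out and chunk_size goes from 5000 to 10000, so each attribute is fetched in fewer, larger CQi requests. For illustration only, below is a minimal, runnable sketch of the chunk-and-merge pattern that chunk_size controls in ext_corpus_static_data; fake_freqs_by_ids is a hypothetical stand-in for the CQi client's freqs_by_ids call and is not part of nopaque.

# Minimal sketch (assumption: stands in for the real CQi round trip).
def fake_freqs_by_ids(ids):
    # Deterministic dummy values so the sketch runs without a CQi server.
    return [id_ * 2 for id_ in ids]

def chunked_freqs(lexicon_size, chunk_size=10000):
    p_attr_id_list = list(range(lexicon_size))
    # Same slicing idiom as in the diff: split the id list into fixed-size chunks.
    chunks = [
        p_attr_id_list[i:i + chunk_size]
        for i in range(0, len(p_attr_id_list), chunk_size)
    ]
    freqs = {}
    for chunk in chunks:
        # One request per chunk; a larger chunk_size means fewer requests
        # at the cost of larger individual responses.
        freqs.update(dict(zip(chunk, fake_freqs_by_ids(chunk))))
    return freqs

if __name__ == '__main__':
    result = chunked_freqs(25000, chunk_size=10000)
    print(len(result))  # 25000 entries, fetched in 3 chunks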