mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-04 04:12:45 +00:00 
			
		
		
		
	Implement fast boundary computation for the `ent` and `s` s_attrs
This commit is contained in:
		@@ -99,12 +99,29 @@ def ext_corpus_static_data(corpus: str) -> Dict:
 | 
				
			|||||||
        static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
 | 
					        static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
 | 
				
			||||||
        static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
 | 
					        static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
 | 
				
			||||||
        static_corpus_data['values']['s_attrs'][s_attr.name] = {}
 | 
					        static_corpus_data['values']['s_attrs'][s_attr.name] = {}
 | 
				
			||||||
 | 
					        if s_attr.name in ['s', 'ent']:
 | 
				
			||||||
 | 
					            cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
 | 
				
			||||||
 | 
					            cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
 | 
				
			||||||
 | 
					            first_match = 0
 | 
				
			||||||
 | 
					            last_match = cqi_subcorpus.size - 1
 | 
				
			||||||
 | 
					            match_boundaries = zip(
 | 
				
			||||||
 | 
					                range(first_match, last_match + 1),
 | 
				
			||||||
 | 
					                cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
 | 
				
			||||||
 | 
					                cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            for id, lbound, rbound in match_boundaries:
 | 
				
			||||||
 | 
					                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
 | 
				
			||||||
 | 
					                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
 | 
				
			||||||
 | 
					                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
 | 
				
			||||||
 | 
					                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
 | 
				
			||||||
 | 
					            cqi_subcorpus.drop()
 | 
				
			||||||
        for id in range(0, s_attr.size):
 | 
					        for id in range(0, s_attr.size):
 | 
				
			||||||
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
 | 
					            if s_attr.name not in ['s', 'ent']:
 | 
				
			||||||
            lbound, rbound = s_attr.cpos_by_id(id)
 | 
					                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
 | 
				
			||||||
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
 | 
					                lbound, rbound = s_attr.cpos_by_id(id)
 | 
				
			||||||
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
 | 
					                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
 | 
				
			||||||
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
 | 
					                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
 | 
				
			||||||
 | 
					                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
 | 
				
			||||||
            if s_attr.name not in ['text', 's']:
 | 
					            if s_attr.name not in ['text', 's']:
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            cpos_range = range(lbound, rbound + 1)
 | 
					            cpos_range = range(lbound, rbound + 1)
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user