From 413b6111df712972a09e8a8c87dc4b1436793a7c Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Mon, 3 Jul 2023 15:31:28 +0200 Subject: [PATCH] Implement fast boundary computation for ent and s s_attrs --- app/corpora/cqi_over_sio/extensions.py | 27 +++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/app/corpora/cqi_over_sio/extensions.py b/app/corpora/cqi_over_sio/extensions.py index d92b6d5e..0ff166bf 100644 --- a/app/corpora/cqi_over_sio/extensions.py +++ b/app/corpora/cqi_over_sio/extensions.py @@ -99,12 +99,29 @@ def ext_corpus_static_data(corpus: str) -> Dict: static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None} static_corpus_data['values']['s_attrs'][s_attr.name] = {} + if s_attr.name in ['s', 'ent']: + cqi_corpus.query('Last', f'<{s_attr.name}> []* ;') + cqi_subcorpus = cqi_corpus.subcorpora.get('Last') + first_match = 0 + last_match = cqi_subcorpus.size - 1 + match_boundaries = zip( + range(first_match, last_match + 1), + cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match), + cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match) + ) + for id, lbound, rbound in match_boundaries: + static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {} + static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound] + static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {} + static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1 + cqi_subcorpus.drop() for id in range(0, s_attr.size): - static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {} - lbound, rbound = s_attr.cpos_by_id(id) - static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound] - static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {} - static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1 + if s_attr.name not in ['s', 'ent']: + static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {} + lbound, rbound = s_attr.cpos_by_id(id) + static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound] + static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {} + static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1 if s_attr.name not in ['text', 's']: continue cpos_range = range(lbound, rbound + 1)