mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-04 04:12:45 +00:00 
			
		
		
		
	Get results with wrapper 3.0
This commit is contained in:
		@@ -1,6 +1,6 @@
 | 
			
		||||
from .CQiClient import CQiClient
 | 
			
		||||
from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
 | 
			
		||||
import re
 | 
			
		||||
import time
 | 
			
		||||
from app import logger  # only works if imported into opaque web app
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -94,6 +94,7 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
                                 + result_subcorpus_name)
 | 
			
		||||
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
 | 
			
		||||
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
 | 
			
		||||
        print('Nr of all matches is:', self.nr_matches)
 | 
			
		||||
        # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
 | 
			
		||||
 | 
			
		||||
    def show_subcorpora(self):
 | 
			
		||||
@@ -104,7 +105,8 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
 | 
			
		||||
    def show_query_results(self,
 | 
			
		||||
                           context_len=10,
 | 
			
		||||
                           result_len=1000):
 | 
			
		||||
                           result_len=1000,
 | 
			
		||||
                           result_offset=0):
 | 
			
		||||
        """
 | 
			
		||||
        Show query results
 | 
			
		||||
 | 
			
		||||
@@ -131,14 +133,16 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
            # match_boundaries shows the start and end cpos of one match as a
 | 
			
		||||
            # pair of cpositions
 | 
			
		||||
            # [(1355, 1357), (1477, 1479)] Example for two boundary pairs
 | 
			
		||||
            offset_start = 0 + (result_offset + 1) if result_offset != 0 else result_offset
 | 
			
		||||
            offset_end = self.nr_matches + result_offset
 | 
			
		||||
            match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
 | 
			
		||||
                                                           CONST_FIELD_MATCH,
 | 
			
		||||
                                                           0,
 | 
			
		||||
                                                           self.nr_matches - 1),
 | 
			
		||||
                                                           offset_start,
 | 
			
		||||
                                                           offset_end),
 | 
			
		||||
                                   self.cqp_dump_subcorpus(self.result_subcorpus,
 | 
			
		||||
                                                           CONST_FIELD_MATCHEND,
 | 
			
		||||
                                                           0,
 | 
			
		||||
                                                           self.nr_matches - 1))
 | 
			
		||||
                                                           offset_start,
 | 
			
		||||
                                                           offset_end))
 | 
			
		||||
 | 
			
		||||
        # Generate all cpos between match boundaries including start and end boundaries.
 | 
			
		||||
        # Also generate cpos for left and right context.
 | 
			
		||||
@@ -152,7 +156,7 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
            lc = {'lc': lc_cpos}
 | 
			
		||||
            match_cpos = list(range(start, end + 1))
 | 
			
		||||
            match = {'hit': match_cpos}
 | 
			
		||||
            rc_cpos = list(range(end + 1, min([self.corpus_max_len, end + self.context_len + 1])))
 | 
			
		||||
            rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len])))
 | 
			
		||||
            rc = {'rc': rc_cpos}
 | 
			
		||||
            lc.update(match)
 | 
			
		||||
            lc.update(rc)
 | 
			
		||||
@@ -161,81 +165,87 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
        # print(all_matches)
 | 
			
		||||
        # print(all_cpos)
 | 
			
		||||
 | 
			
		||||
        # Get all sentence IDs for all cpos collected above in all_cpos
 | 
			
		||||
        s_ids = self.cl_cpos2struc('CORPUS.s', all_cpos) # CHANGE to CORPUS.s will always be like this in nopaque
 | 
			
		||||
        # Get all cpos for all sentence boundaries
 | 
			
		||||
        s_lookup = {}
 | 
			
		||||
        for s_id in set(s_ids):
 | 
			
		||||
            s_start, s_end = self.cl_struc2cpos('CORPUS.s', s_id)  # CHANGE to CORPUS.s will always be like this in nopaque
 | 
			
		||||
            # print(s_start, s_end)
 | 
			
		||||
            s_cpos = range(s_start, s_end)
 | 
			
		||||
            s_lookup.update({s_id: list(s_cpos)})
 | 
			
		||||
            # print(list(s_cpos))
 | 
			
		||||
            all_cpos.extend(s_cpos)
 | 
			
		||||
        # s_lookup = {}
 | 
			
		||||
        # for s_id in set(s_ids):
 | 
			
		||||
        #     s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
 | 
			
		||||
        #     # CHANGE to UTOPIEN.s will always be like this in nopaque
 | 
			
		||||
        #     s_cpos = range(s_start, s_end)
 | 
			
		||||
        #     s_lookup.update({s_id: list(s_cpos)})
 | 
			
		||||
        #     # print(list(s_cpos))
 | 
			
		||||
        #     all_cpos.extend(s_cpos)
 | 
			
		||||
        t0 = time.time()
 | 
			
		||||
        all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
 | 
			
		||||
        t1 = time.time()
 | 
			
		||||
        t_total = t1 - t0
 | 
			
		||||
        print('TIME FOR ALL CPOS:', t_total)
 | 
			
		||||
        print('CPOS SUM:', len(all_cpos))
 | 
			
		||||
 | 
			
		||||
        # Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
 | 
			
		||||
        # all cpos entries in all_cpos_list
 | 
			
		||||
        # Also saves this information into self.results dict
 | 
			
		||||
        t6 = time.time()
 | 
			
		||||
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
 | 
			
		||||
        t7 = time.time()
 | 
			
		||||
        t_final = t7 - t6
 | 
			
		||||
        print('GOT ALL RESULTS IN:', t_final)
 | 
			
		||||
 | 
			
		||||
        self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
 | 
			
		||||
                        's_lookup': s_lookup, 'text_lookup': text_lookup}
 | 
			
		||||
                        'text_lookup': text_lookup}
 | 
			
		||||
        return self.results
 | 
			
		||||
        # print(self.results)
 | 
			
		||||
 | 
			
		||||
    def get_cpos_infos(self, all_cpos):
 | 
			
		||||
        '''
 | 
			
		||||
        Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
 | 
			
		||||
        all cpos entries specified in the parameter all_cpos.
 | 
			
		||||
        '''
 | 
			
		||||
        # Get all positional attribute information
 | 
			
		||||
        cpos_infos = {}
 | 
			
		||||
        for p_attr_key in self.attr_strings['positional_attrs'].keys():
 | 
			
		||||
            match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)
 | 
			
		||||
            cpos_infos[p_attr_key] = match_strs
 | 
			
		||||
 | 
			
		||||
        tmp_s_info = []
 | 
			
		||||
        tmp_text_info = []
 | 
			
		||||
        text_lookup = {}
 | 
			
		||||
        tmp_dict = {}
 | 
			
		||||
        # Get all structural attribute information
 | 
			
		||||
        tmp_info = {}
 | 
			
		||||
        structs_to_check = []
 | 
			
		||||
        for struct_attr_key in self.attr_strings['struct_attrs'].keys():
 | 
			
		||||
            check = self.attr_strings['struct_attrs'][struct_attr_key]
 | 
			
		||||
            if check == 'CORPUS.s':
 | 
			
		||||
                struct_ids = self.cl_cpos2struc(check, all_cpos)
 | 
			
		||||
            key = self.attr_strings['struct_attrs'][struct_attr_key]
 | 
			
		||||
            has_value = self.corpus_structural_attribute_has_values(key)
 | 
			
		||||
            struct_ids = self.cl_cpos2struc(key, all_cpos)
 | 
			
		||||
            if has_value is False:  # Get IDs of strucutural elements without values (this means get IDs of XML tags. Struct elements only have values if they are XML attributes)
 | 
			
		||||
                tmp_info[struct_attr_key] = []
 | 
			
		||||
                for id in struct_ids:
 | 
			
		||||
                    tmp_s_info.append({struct_attr_key: id})
 | 
			
		||||
            elif check == 'CORPUS.text':
 | 
			
		||||
                struct_ids = self.cl_cpos2struc(check, all_cpos)
 | 
			
		||||
                for id in struct_ids:
 | 
			
		||||
                    tmp_text_info.append({struct_attr_key: id})
 | 
			
		||||
                    tmp_info[struct_attr_key].append(id)
 | 
			
		||||
            else:
 | 
			
		||||
                struct_ids = struct_ids = self.cl_cpos2struc(check, all_cpos)
 | 
			
		||||
                struct_values = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_ids)
 | 
			
		||||
                for value in struct_values:
 | 
			
		||||
                    for id in struct_ids:
 | 
			
		||||
                        tmp_dict.update({id: {struct_attr_key: value}})
 | 
			
		||||
        print(tmp_dict)
 | 
			
		||||
        print(text_lookup)
 | 
			
		||||
                structs_to_check.append({key: struct_attr_key})
 | 
			
		||||
        struct_attr_values = list(tmp_info.values())
 | 
			
		||||
        struct_attr_keys = list(tmp_info.keys())
 | 
			
		||||
 | 
			
		||||
            # struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos)
 | 
			
		||||
            # has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key])
 | 
			
		||||
            # if has_value:
 | 
			
		||||
            #     match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry)
 | 
			
		||||
            # elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s':
 | 
			
		||||
            #     pass
 | 
			
		||||
            # else:
 | 
			
		||||
            #     match_strs = [None for i in struct_entry]
 | 
			
		||||
            # cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
 | 
			
		||||
        tmp_list = []
 | 
			
		||||
        attr_key_list = []
 | 
			
		||||
        # Build textlookup dictionary
 | 
			
		||||
        text_lookup_ids = list(set(struct_attr_values[0]))  # First is always one text
 | 
			
		||||
        text_lookup = {}
 | 
			
		||||
        for d in structs_to_check:
 | 
			
		||||
            s_key, s_value = zip(*d.items())
 | 
			
		||||
            s_value = s_value[0].split('_')[1]
 | 
			
		||||
            struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
 | 
			
		||||
            zipped = dict(zip(text_lookup_ids, struct_values))
 | 
			
		||||
            for zip_key, zip_value in zipped.items():
 | 
			
		||||
                check = text_lookup.get(zip_key)
 | 
			
		||||
                if check is None:
 | 
			
		||||
                    text_lookup[zip_key] = {s_value: zip_value}
 | 
			
		||||
                else:
 | 
			
		||||
                    text_lookup[zip_key].update({s_value: zip_value})
 | 
			
		||||
 | 
			
		||||
        # zip keys and values together
 | 
			
		||||
        attr_values_list = []
 | 
			
		||||
        attr_keys_list = []
 | 
			
		||||
        for key in cpos_infos.keys():
 | 
			
		||||
            tmp_list.append(cpos_infos[key])
 | 
			
		||||
            attr_key_list.append(key)
 | 
			
		||||
        joined_cpos_infos = zip(all_cpos, *tmp_list)
 | 
			
		||||
            attr_values_list.append(cpos_infos[key])
 | 
			
		||||
            attr_keys_list.append(key)
 | 
			
		||||
        attr_keys_list.extend(struct_attr_keys)
 | 
			
		||||
        attr_values_list.extend(struct_attr_values)
 | 
			
		||||
        joined_cpos_infos = zip(all_cpos, *attr_values_list)
 | 
			
		||||
        dict_cpos_infos = {}
 | 
			
		||||
        for info in joined_cpos_infos:
 | 
			
		||||
            dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
 | 
			
		||||
        for key, s_id, text_id in zip(dict_cpos_infos.keys(), tmp_s_info, tmp_text_info):
 | 
			
		||||
            dict_cpos_infos[key].update(s_id)
 | 
			
		||||
            dict_cpos_infos[key].update(text_id)
 | 
			
		||||
            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
 | 
			
		||||
        return dict_cpos_infos, text_lookup
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user