-
Couldn't load subscription status.
- Fork 21
sort-on multiple indexes is broken#81 #82
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 8 commits
697719c
6eaef16
8a9608c
78fd575
bab34d7
88de79d
85611a5
5686c7a
1ff6b5f
87f0149
9141f94
a17c1f7
cedeca3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -819,7 +819,7 @@ def _sort_nbest(self, actual_result_count, result, rs, | |
| sort_index, sort_index_length, sort_spec, | ||
| second_indexes_key_map): | ||
| # Limit / sort results using N-Best algorithm | ||
| # This is faster for large sets then a full sort | ||
| # This is faster for large sets than a full sort | ||
| # And uses far less memory | ||
| index_key_map = sort_index.documentToKeyMap() | ||
| keys = [] | ||
|
|
@@ -845,27 +845,16 @@ def _sort_nbest(self, actual_result_count, result, rs, | |
| worst = keys[0] | ||
| result.reverse() | ||
| else: | ||
| for did in rs: | ||
| try: | ||
| key = index_key_map[did] | ||
| full_key = (key, ) | ||
| for km in second_indexes_key_map: | ||
| full_key += (km[did], ) | ||
| except KeyError: | ||
| # This document is not in the sort key index, skip it. | ||
| actual_result_count -= 1 | ||
| else: | ||
| if n >= limit and key <= worst: | ||
| continue | ||
| i = bisect(keys, key) | ||
| keys.insert(i, key) | ||
| result.insert(i, (full_key, did, self.__getitem__)) | ||
| if n == limit: | ||
| del keys[0], result[0] | ||
| else: | ||
| n += 1 | ||
| worst = keys[0] | ||
| result = multisort(result, sort_spec) | ||
| # we have multi index sorting | ||
| result = self._multi_index_nbest( | ||
| actual_result_count, | ||
| result, | ||
| rs, | ||
| limit, | ||
| sort_index, | ||
| sort_spec, | ||
| second_indexes_key_map, | ||
| reverse=True) | ||
|
|
||
| return (actual_result_count, 0, result) | ||
|
|
||
|
|
@@ -874,11 +863,11 @@ def _sort_nbest_reverse(self, actual_result_count, result, rs, | |
| sort_index, sort_index_length, sort_spec, | ||
| second_indexes_key_map): | ||
| # Limit / sort results using N-Best algorithm in reverse (N-Worst?) | ||
| index_key_map = sort_index.documentToKeyMap() | ||
| keys = [] | ||
| n = 0 | ||
| best = None | ||
| if sort_index_length == 1: | ||
| index_key_map = sort_index.documentToKeyMap() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You have refactored the multi-index sorting into a single method. Ideally, I would go a step further and merge |
||
| keys = [] | ||
| n = 0 | ||
| best = None | ||
| for did in rs: | ||
| try: | ||
| key = index_key_map[did] | ||
|
|
@@ -897,29 +886,83 @@ def _sort_nbest_reverse(self, actual_result_count, result, rs, | |
| n += 1 | ||
| best = keys[-1] | ||
| else: | ||
| for did in rs: | ||
| # we have multi index sorting | ||
| result = self._multi_index_nbest( | ||
| actual_result_count, | ||
| result, | ||
| rs, | ||
| limit, | ||
| sort_index, | ||
| sort_spec, | ||
| second_indexes_key_map, | ||
| reverse=False | ||
| ) | ||
|
|
||
| return (actual_result_count, 0, result) | ||
|
|
||
| def _multi_index_nbest(self, actual_result_count, result, | ||
| rs, limit, sort_index, sort_spec, | ||
| second_indexes_key_map, reverse=True): | ||
| """ | ||
| For multiple indexes. | ||
| 1) Categorize documents as lists by the first index values in the | ||
| did_by_index_value dict. | ||
| 2) Sort the index_values | ||
| 3) Collect from the did_by_index_value dict by the sorted | ||
| index_values till limit is exceeded. | ||
| """ | ||
| # A dict of lists categorize the documents after the first sort | ||
| # index value. | ||
| did_by_index_value = {} | ||
| # get the index' keymap | ||
| index_key_map = sort_index.documentToKeyMap() | ||
| # for all documents | ||
| for did in rs: | ||
| # get the index value of the current document id | ||
| try: | ||
| index_value = index_key_map[did] | ||
| except KeyError: | ||
| # This document is not in the sort key index, skip it. | ||
| # ToDo: Is this the correct/intended behavior??? | ||
| actual_result_count -= 1 | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As |
||
| else: | ||
| # do we already have a list for this index_value? If not | ||
| # create one. | ||
| if index_value not in did_by_index_value: | ||
| did_by_index_value[index_value] = [] | ||
| # add document id for the index value | ||
| did_by_index_value[index_value].append(did) | ||
| # All documents are now categorized after the first sort index values. | ||
| # Sort the sort index_values | ||
| sorted_index_values = sorted( | ||
| did_by_index_value.keys(), | ||
| reverse=reverse) | ||
|
||
| # How many documents do we have | ||
| result_count = 0 | ||
| # for all index_values in sorted order | ||
| for index_value in sorted_index_values: | ||
| # for all documents for this index_value | ||
| for did in did_by_index_value[index_value]: | ||
| # get additional index metadata for the other sort indexes | ||
| try: | ||
| key = index_key_map[did] | ||
| full_key = (key, ) | ||
| full_key = (index_value,) | ||
| for km in second_indexes_key_map: | ||
| full_key += (km[did], ) | ||
| full_key += (km[did],) | ||
| except KeyError: | ||
| # This document is not in the sort key index, skip it. | ||
| # ToDo: Is this the correct/intended behavior??? | ||
| actual_result_count -= 1 | ||
| else: | ||
| if n >= limit and key >= best: | ||
| continue | ||
| i = bisect(keys, key) | ||
| keys.insert(i, key) | ||
| result.insert(i, (full_key, did, self.__getitem__)) | ||
| if n == limit: | ||
| del keys[-1], result[-1] | ||
| else: | ||
| n += 1 | ||
| best = keys[-1] | ||
| result = multisort(result, sort_spec) | ||
| # Add the document to the result set | ||
| result.append((full_key, did, self.__getitem__)) | ||
| result_count += 1 | ||
| # Check if we have enough datasets to fulfill the limit | ||
| if result_count >= limit: | ||
| break | ||
|
|
||
| return (actual_result_count, 0, result) | ||
| # Sort after the secondary indexes. | ||
| result = multisort(result, sort_spec) | ||
| return result | ||
|
|
||
| def sortResults(self, rs, sort_index, | ||
| reverse=False, limit=None, merge=True, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moving the sorting code into a separate method (which is in general a good idea) prevents
`actual_result_count` from being updated. You must either wrap the (int) `actual_result_count` in a "mutable" object (i.e. updatable in place) or return the updated value as part of the return value (and reassign). `ZCatalog` filters out hits for which at least one of the sort indexes lacks a value - a test examining the correct `actual_result_count` thus would ensure that some hits lack a sort value and verify the correct result size.
actual_result_countto be updated. You must either wrap (int)actual_result_countin a "mutable" object (i.e. updatable in place) or return the updated value as part of the return value (and reassign).ZCatalogfilters out hits for which at least one of the sort indexes lacks a value - a test examining the correctactual_result_countthus would ensure that some hits lack a sort value and verify the correct result size.