Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 85 additions & 42 deletions src/Products/ZCatalog/Catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,7 +819,7 @@ def _sort_nbest(self, actual_result_count, result, rs,
sort_index, sort_index_length, sort_spec,
second_indexes_key_map):
# Limit / sort results using N-Best algorithm
# This is faster for large sets then a full sort
# This is faster for large sets than a full sort
# And uses far less memory
index_key_map = sort_index.documentToKeyMap()
keys = []
Expand All @@ -845,27 +845,16 @@ def _sort_nbest(self, actual_result_count, result, rs,
worst = keys[0]
result.reverse()
else:
for did in rs:
try:
key = index_key_map[did]
full_key = (key, )
for km in second_indexes_key_map:
full_key += (km[did], )
except KeyError:
# This document is not in the sort key index, skip it.
actual_result_count -= 1
else:
if n >= limit and key <= worst:
continue
i = bisect(keys, key)
keys.insert(i, key)
result.insert(i, (full_key, did, self.__getitem__))
if n == limit:
del keys[0], result[0]
else:
n += 1
worst = keys[0]
result = multisort(result, sort_spec)
# we have multi index sorting
result = self._multi_index_nbest(
actual_result_count,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moving the sorting code into a separate method (which is in general a good idea) prevents actual_result_count from being updated. You must either wrap the (int) actual_result_count in a "mutable" object (i.e. updatable in place) or return the updated value as part of the return value (and reassign). ZCatalog filters out hits for which at least one of the sort indexes lacks a value - a test examining the correct actual_result_count thus would ensure that some hits lack a sort value and verify the correct result size.

result,
rs,
limit,
sort_index,
sort_spec,
second_indexes_key_map,
reverse=True)

return (actual_result_count, 0, result)

Expand All @@ -874,11 +863,11 @@ def _sort_nbest_reverse(self, actual_result_count, result, rs,
sort_index, sort_index_length, sort_spec,
second_indexes_key_map):
# Limit / sort results using N-Best algorithm in reverse (N-Worst?)
index_key_map = sort_index.documentToKeyMap()
keys = []
n = 0
best = None
if sort_index_length == 1:
index_key_map = sort_index.documentToKeyMap()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, sort_nbest and sort_nbest_reverse would be symmetric. Your change above seems to have reduced the symmetry.

You have refactored the multi-index sorting into a single method. I would go a step further and merge sort_nbest and sort_nbest_reverse into a single function (in my view those methods are also named unintuitively; the heapq module contains similar functions named nsmallest and nlargest, respectively - which would be much better names). I would use functools.partial to instantiate the proper sorting function from it.

keys = []
n = 0
best = None
for did in rs:
try:
key = index_key_map[did]
Expand All @@ -897,29 +886,83 @@ def _sort_nbest_reverse(self, actual_result_count, result, rs,
n += 1
best = keys[-1]
else:
for did in rs:
# we have multi index sorting
result = self._multi_index_nbest(
actual_result_count,
result,
rs,
limit,
sort_index,
sort_spec,
second_indexes_key_map,
reverse=False
)

return (actual_result_count, 0, result)

def _multi_index_nbest(self, actual_result_count, result,
                       rs, limit, sort_index, sort_spec,
                       second_indexes_key_map, reverse=True):
    """Limit / sort results on multiple sort indexes.

    Algorithm:
      1) Bucket the document ids of ``rs`` by their value in the
         primary ``sort_index`` (``did_by_index_value``).
      2) Sort the primary index values (descending when ``reverse``).
      3) Collect whole buckets, in sorted-value order, until the
         ``limit`` is exceeded.  Complete buckets are kept so that the
         secondary sort can still order ties correctly.
      4) Sort the collected entries on the secondary indexes via
         ``multisort``.

    :param actual_result_count: number of hits before filtering; hits
        missing a value in any sort index are discounted locally.
    :param result: list to which ``(full_key, did, getitem)`` triples
        are appended; also the basis of the return value.
    :param rs: iterable of candidate document ids.
    :param limit: minimum number of documents to collect before
        stopping (the final truncation is left to ``multisort``).
    :param sort_index: primary sort index (provides documentToKeyMap).
    :param sort_spec: secondary sort specification for ``multisort``.
    :param second_indexes_key_map: key maps of the secondary indexes.
    :param reverse: sort the primary values descending when True.
    :returns: the sorted (and limited) result list.

    NOTE(review): ``actual_result_count`` is an int and therefore
    immutable — the decrements below are lost to the caller because
    only ``result`` is returned.  To propagate the corrected count,
    it must become part of the return value (and callers must
    reassign it); see the reviewer comments on this change.
    """
    # Buckets of document ids keyed by their primary sort value.
    did_by_index_value = {}
    # Key map of the primary sort index: did -> sort value.
    index_key_map = sort_index.documentToKeyMap()
    for did in rs:
        try:
            index_value = index_key_map[did]
        except KeyError:
            # Document has no value in the primary sort index: it
            # cannot be sorted, so it is dropped from the result set.
            actual_result_count -= 1
        else:
            # Group the document id under its primary sort value.
            did_by_index_value.setdefault(index_value, []).append(did)

    # All documents are now bucketed by primary sort value; walk the
    # values in the requested direction and collect whole buckets
    # until at least ``limit`` documents have been gathered.
    result_count = 0
    for index_value in sorted(did_by_index_value, reverse=reverse):
        for did in did_by_index_value[index_value]:
            try:
                # Build the composite sort key: primary value first,
                # then one entry per secondary sort index.
                full_key = (index_value,)
                for km in second_indexes_key_map:
                    full_key += (km[did],)
            except KeyError:
                # Document lacks a value in a secondary sort index:
                # drop it, mirroring the primary-index handling above.
                actual_result_count -= 1
            else:
                result.append((full_key, did, self.__getitem__))
                result_count += 1
        # Stop once enough documents are collected; the bucket that
        # crossed the limit is kept complete for the secondary sort.
        if result_count >= limit:
            break

    # Sort on the secondary indexes.
    result = multisort(result, sort_spec)
    return result

def sortResults(self, rs, sort_index,
reverse=False, limit=None, merge=True,
Expand Down
Loading