in src/annoylib.h [1344:1445]
S _make_tree(const vector<S>& indices, bool is_root, Random& _random, ThreadedBuildPolicy& threaded_build_policy) {
// The basic rule is that if we have <= _K items, then it's a leaf node, otherwise it's a split node.
// There's some regrettable complications caused by the problem that root nodes have to be "special":
// 1. We identify root nodes by the arguable logic that _n_items == n->n_descendants, regardless of how many descendants they actually have
// 2. Root nodes with only 1 child need to be a "dummy" parent
// 3. Due to the _n_items "hack", we need to be careful with the cases where _n_items <= _K or _n_items > _K
if (indices.size() == 1 && !is_root)
return indices[0];
if (indices.size() <= (size_t)_K && (!is_root || (size_t)_n_items <= (size_t)_K || indices.size() == 1)) {
threaded_build_policy.lock_n_nodes();
_allocate_size(_n_nodes + 1, threaded_build_policy);
S item = _n_nodes++;
threaded_build_policy.unlock_n_nodes();
threaded_build_policy.lock_shared_nodes();
Node* m = _get(item);
m->n_descendants = is_root ? _n_items : (S)indices.size();
// Using std::copy instead of a loop seems to resolve issues #3 and #13,
// probably because gcc 4.8 goes overboard with optimizations.
// Using memcpy instead of std::copy for MSVC compatibility. #235
// Only copy when necessary to avoid crash in MSVC 9. #293
if (!indices.empty())
memcpy(m->children, &indices[0], indices.size() * sizeof(S));
threaded_build_policy.unlock_shared_nodes();
return item;
}
threaded_build_policy.lock_shared_nodes();
vector<Node*> children;
for (size_t i = 0; i < indices.size(); i++) {
S j = indices[i];
Node* n = _get(j);
if (n)
children.push_back(n);
}
vector<S> children_indices[2];
Node* m = (Node*)alloca(_s);
for (int attempt = 0; attempt < 3; attempt++) {
children_indices[0].clear();
children_indices[1].clear();
D::create_split(children, _f, _s, _random, m);
for (size_t i = 0; i < indices.size(); i++) {
S j = indices[i];
Node* n = _get(j);
if (n) {
bool side = D::side(m, n, _f, _random);
children_indices[side].push_back(j);
} else {
annoylib_showUpdate("No node for index %d?\n", j);
}
}
if (_split_imbalance(children_indices[0], children_indices[1]) < 0.95)
break;
}
threaded_build_policy.unlock_shared_nodes();
// If we didn't find a hyperplane, just randomize sides as a last option
while (_split_imbalance(children_indices[0], children_indices[1]) > 0.99) {
if (_verbose)
annoylib_showUpdate("\tNo hyperplane found (left has %zu children, right has %zu children)\n",
children_indices[0].size(), children_indices[1].size());
children_indices[0].clear();
children_indices[1].clear();
// Set the vector to 0.0
for (int z = 0; z < _f; z++)
m->v[z] = 0;
for (size_t i = 0; i < indices.size(); i++) {
S j = indices[i];
// Just randomize...
children_indices[_random.flip()].push_back(j);
}
}
int flip = (children_indices[0].size() > children_indices[1].size());
m->n_descendants = is_root ? _n_items : (S)indices.size();
for (int side = 0; side < 2; side++) {
// run _make_tree for the smallest child first (for cache locality)
m->children[side^flip] = _make_tree(children_indices[side^flip], false, _random, threaded_build_policy);
}
threaded_build_policy.lock_n_nodes();
_allocate_size(_n_nodes + 1, threaded_build_policy);
S item = _n_nodes++;
threaded_build_policy.unlock_n_nodes();
threaded_build_policy.lock_shared_nodes();
memcpy(_get(item), m, _s);
threaded_build_policy.unlock_shared_nodes();
return item;
}