void TopNSimMatcher::_findTopN()

in api/c/bingo-nosql/src/bingo_matcher.cpp [984:1254]


void TopNSimMatcher::_findTopN()
{
    QS_DEF(Array<float>, thrs);
    QS_DEF(Array<int>, nhits_per_block);
    QS_DEF(Array<int>, blocks);
    QS_DEF(Array<int>, cells);

    thrs.clear();
    nhits_per_block.clear();
    blocks.clear();
    cells.clear();

    SimStorage& sim_storage = _index.getSimStorage();

    float thr_low_limit = _query_data->getMin();
    int hits_limit = _limit;

    _initModelDistribution(thrs, nhits_per_block);

    blocks.clear_resize(thrs.size());
    cells.clear_resize(thrs.size());
    _current_results.clear();

    bool already_found = false;
    bool too_many = false;
    float thr_proc = 0.0f;
    float cur_thr = 0.0f;
    float thr_too_many = 0.0f;
    int nhits = 0;
    int start_iter = thrs.size();

    int cont_count = 0;
    int cells_count = 0;
    int min_cell = 0;
    int max_cell = 0;
    int cur_cell = 0;

    int i;
    int cnt = 0;

    for (i = 0; i < thrs.size(); i++)
    {
        if (too_many)
        {
            if (thr_proc == 0.0f)
                cur_thr = thr_too_many + (1.0f - thr_too_many) / 2;
            else
                cur_thr = thr_too_many + (thr_proc - thr_too_many) / 2;
        }
        else
            cur_thr = thrs[i];

        resetThresholdLimit(cur_thr);

        cont_count = containersCount();
        blocks[i] = cont_count;
        cells_count = cellsCount();
        cells[i] = cells_count;
        min_cell = minCell();
        max_cell = maxCell();

        if ((cont_count > 0) && (nhits == 0))
        {
            cnt = 0;
            _current_results.clear();

            SimResult* res = 0;

            while (BaseSimilarityMatcher::next())
            {
                cnt++;
                res = &_current_results.push();
                res->id = _current_id;
                res->sim_value = _current_sim_value;
                cur_cell = currentCell();
                if ((cnt > hits_limit * 2) && ((max_cell - cur_cell) > cells_count / 2))
                {
                    thr_too_many = cur_thr;
                    too_many = true;
                    break;
                }
                else
                    too_many = false;
            }

            if (too_many)
                continue;

            if ((cnt >= hits_limit) || (cur_thr <= thr_low_limit))
            {
                already_found = true;
                break;
            }

            nhits = (cnt / cont_count);

            if ((nhits == 0) && (cnt > 0))
                nhits = 1;

            thr_proc = cur_thr;
            nhits_per_block[i] = nhits;
            if (i < thrs.size() - 2)
                start_iter = i + 1;
            else
            {
                already_found = true;
                break;
            }

            if (thr_too_many > 0.0)
            {
                thrs[i] = thr_proc;
                thrs[start_iter] = thr_proc - (thr_proc - thr_too_many) / 2;
                if (start_iter < thrs.size() - 2)
                    thrs[start_iter + 1] = thr_too_many;
            }
        }
        else if ((cont_count == 0) && sim_storage.isSmallBase())
        {
            cnt = 0;
            _current_results.clear();

            SimResult* res = 0;

            while (BaseSimilarityMatcher::next())
            {
                cnt++;
                res = &_current_results.push();
                res->id = _current_id;
                res->sim_value = _current_sim_value;

                if (cnt > hits_limit * 2)
                {
                    thr_too_many = cur_thr;
                    too_many = true;
                    break;
                }
                else
                    too_many = false;
            }

            if (too_many)
                continue;

            if ((cnt >= hits_limit) || (cur_thr <= thr_low_limit))
            {
                already_found = true;
                break;
            }
        }
        else if (cont_count == 0)
            continue;
        else
        {
            nhits_per_block[i] = nhits_per_block[i - 1] * 2;
        }
    }

    if (!already_found)
    {
        bool adjusted = false;
        float next_thr = 0.0f;
        float prev_thr = 0.0f;

        i = start_iter;

        while (i < thrs.size())
        {
            if ((blocks[i] * nhits_per_block[i] < hits_limit) && (i < (thrs.size() - 1)) && (!adjusted))
            {
                i++;
                continue;
            }

            cur_thr = thrs[i];

            if ((blocks[i] * nhits_per_block[i] > 2 * hits_limit) && (!adjusted))
            {
                next_thr = cur_thr + (thrs[i - 1] - cur_thr) / 2;
                prev_thr = cur_thr;
                cur_thr = next_thr;
                adjusted = true;
            }

            too_many = false;

            resetThresholdLimit(cur_thr);

            cont_count = containersCount();
            cells_count = cellsCount();
            cells[i] = cells_count;
            min_cell = minCell();
            max_cell = maxCell();

            if (cont_count > 0)
            {
                cnt = 0;
                _current_results.clear();

                SimResult* res = 0;

                while (BaseSimilarityMatcher::next())
                {
                    cnt++;
                    res = &_current_results.push();
                    res->id = _current_id;
                    res->sim_value = _current_sim_value;
                    cur_cell = currentCell();
                    if ((cnt > hits_limit * 2) && (max_cell - cur_cell) > cells_count / 2)
                    {
                        next_thr = cur_thr + (thr_proc - cur_thr) / 2;
                        thr_too_many = cur_thr;
                        thrs[i] = next_thr;
                        too_many = true;
                        adjusted = true;
                        break;
                    }
                }

                if (((cnt >= hits_limit) || (cur_thr <= thr_low_limit)) && (!too_many))
                    break;
            }
            else
                i++;

            if (!too_many)
            {
                thr_proc = cur_thr;
                nhits_per_block[i] = cnt / cont_count;
                if (i < (thrs.size() - 2))
                    nhits_per_block[i + 1] = nhits_per_block[i] * 2;

                if (thr_too_many > 0.0)
                {
                    next_thr = cur_thr - (cur_thr - thr_too_many) / 2;
                    thrs[i] = next_thr;
                }
                else if (i < (thrs.size() - 2))
                {
                    if ((hits_limit - cnt) < (nhits_per_block[i + 1] * (blocks[i + 1] - cont_count)))
                    {
                        next_thr = cur_thr - (cur_thr - thrs[i + 1]) / 2;
                        thrs[i] = next_thr;
                        adjusted = true;
                    }
                    else
                    {
                        i++;
                        next_thr = thrs[i];
                    }
                }
            }

            cur_thr = next_thr;
        }
    }

    if (_current_results.size() > 0)
    {
        _current_results.qsort(_cmp_sim_res, 0);

        for (i = 0; i < _current_results.size(); i++)
        {
            _result_ids.push(_current_results[i].id);
            _result_sims.push(_current_results[i].sim_value);

            if (i == (hits_limit - 1))
                break;
        }
    }
}