void CvDTreeTrainData::set

void CvDTreeTrainData::set_data()

in apps/traincascade/old_ml_tree.cpp [145:686]
440 lines of code
110 McCabe index (conditional complexity)

void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
    const CvMat* _responses, const CvMat* _var_idx, const CvMat* _sample_idx,
    const CvMat* _var_type, const CvMat* _missing_mask, const CvDTreeParams& _params,
    bool _shared, bool _add_labels, bool _update_data )
{
    CvMat* sample_indices = 0;
    CvMat* var_type0 = 0;
    CvMat* tmp_map = 0;
    int** int_ptr = 0;
    CvPair16u32s* pair16u32s_ptr = 0;
    CvDTreeTrainData* data = 0;
    float *_fdst = 0;
    int *_idst = 0;
    unsigned short* udst = 0;
    int* idst = 0;

    CV_FUNCNAME( "CvDTreeTrainData::set_data" );

    __BEGIN__;

    int sample_all = 0, r_type, cv_n;
    int total_c_count = 0;
    int tree_block_size, temp_block_size, max_split_size, nv_size, cv_size = 0;
    int ds_step, dv_step, ms_step = 0, mv_step = 0; // {data|mask}{sample|var}_step
    int vi, i, size;
    char err[100];
    const int *sidx = 0, *vidx = 0;

    uint64 effective_buf_size = 0;
    int effective_buf_height = 0, effective_buf_width = 0;

    if( _update_data && data_root )
    {
        data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, _shared, _add_labels );

        // compare new and old train data
        if( !(data->var_count == var_count &&
            cvNorm( data->var_type, var_type, CV_C ) < FLT_EPSILON &&
            cvNorm( data->cat_count, cat_count, CV_C ) < FLT_EPSILON &&
            cvNorm( data->cat_map, cat_map, CV_C ) < FLT_EPSILON) )
            CV_ERROR( CV_StsBadArg,
            "The new training data must have the same types and the input and output variables "
            "and the same categories for categorical variables" );

        cvReleaseMat( &priors );
        cvReleaseMat( &priors_mult );
        cvReleaseMat( &buf );
        cvReleaseMat( &direction );
        cvReleaseMat( &split_buf );
        cvReleaseMemStorage( &temp_storage );

        priors = data->priors; data->priors = 0;
        priors_mult = data->priors_mult; data->priors_mult = 0;
        buf = data->buf; data->buf = 0;
        buf_count = data->buf_count; buf_size = data->buf_size;
        sample_count = data->sample_count;

        direction = data->direction; data->direction = 0;
        split_buf = data->split_buf; data->split_buf = 0;
        temp_storage = data->temp_storage; data->temp_storage = 0;
        nv_heap = data->nv_heap; cv_heap = data->cv_heap;

        data_root = new_node( 0, sample_count, 0, 0 );
        EXIT;
    }

    clear();

    var_all = 0;
    rng = &cv::theRNG();

    CV_CALL( set_params( _params ));

    // check parameter types and sizes
    CV_CALL( cvCheckTrainData( _train_data, _tflag, _missing_mask, &var_all, &sample_all ));

    train_data = _train_data;
    responses = _responses;

    if( _tflag == CV_ROW_SAMPLE )
    {
        ds_step = _train_data->step/CV_ELEM_SIZE(_train_data->type);
        dv_step = 1;
        if( _missing_mask )
            ms_step = _missing_mask->step, mv_step = 1;
    }
    else
    {
        dv_step = _train_data->step/CV_ELEM_SIZE(_train_data->type);
        ds_step = 1;
        if( _missing_mask )
            mv_step = _missing_mask->step, ms_step = 1;
    }
    tflag = _tflag;

    sample_count = sample_all;
    var_count = var_all;

    if( _sample_idx )
    {
        CV_CALL( sample_indices = cvPreprocessIndexArray( _sample_idx, sample_all ));
        sidx = sample_indices->data.i;
        sample_count = sample_indices->rows + sample_indices->cols - 1;
    }

    if( _var_idx )
    {
        CV_CALL( var_idx = cvPreprocessIndexArray( _var_idx, var_all ));
        vidx = var_idx->data.i;
        var_count = var_idx->rows + var_idx->cols - 1;
    }

    is_buf_16u = false;
    if ( sample_count < 65536 )
        is_buf_16u = true;

    if( !CV_IS_MAT(_responses) ||
        (CV_MAT_TYPE(_responses->type) != CV_32SC1 &&
         CV_MAT_TYPE(_responses->type) != CV_32FC1) ||
        (_responses->rows != 1 && _responses->cols != 1) ||
        _responses->rows + _responses->cols - 1 != sample_all )
        CV_ERROR( CV_StsBadArg, "The array of _responses must be an integer or "
                  "floating-point vector containing as many elements as "
                  "the total number of samples in the training data matrix" );

    r_type = CV_VAR_CATEGORICAL;
    if( _var_type )
        CV_CALL( var_type0 = cvPreprocessVarType( _var_type, var_idx, var_count, &r_type ));

    CV_CALL( var_type = cvCreateMat( 1, var_count+2, CV_32SC1 ));

    cat_var_count = 0;
    ord_var_count = -1;

    is_classifier = r_type == CV_VAR_CATEGORICAL;

    // step 0. calc the number of categorical vars
    for( vi = 0; vi < var_count; vi++ )
    {
        char vt = var_type0 ? var_type0->data.ptr[vi] : CV_VAR_ORDERED;
        var_type->data.i[vi] = vt == CV_VAR_CATEGORICAL ? cat_var_count++ : ord_var_count--;
    }

    ord_var_count = ~ord_var_count;
    cv_n = params.cv_folds;
    // set the two last elements of var_type array to be able
    // to locate responses and cross-validation labels using
    // the corresponding get_* functions.
    var_type->data.i[var_count] = cat_var_count;
    var_type->data.i[var_count+1] = cat_var_count+1;

    // in case of single ordered predictor we need dummy cv_labels
    // for safe split_node_data() operation
    have_labels = cv_n > 0 || (ord_var_count == 1 && cat_var_count == 0) || _add_labels;

    work_var_count = var_count + (is_classifier ? 1 : 0) // for responses class_labels
                               + (have_labels ? 1 : 0); // for cv_labels

    shared = _shared;
    buf_count = shared ? 2 : 1;

    buf_size = -1; // the member buf_size is obsolete

    effective_buf_size = (uint64)(work_var_count + 1)*(uint64)sample_count * buf_count; // this is the total size of "CvMat buf" to be allocated
    effective_buf_width = sample_count;
    effective_buf_height = work_var_count+1;

    if (effective_buf_width >= effective_buf_height)
        effective_buf_height *= buf_count;
    else
        effective_buf_width *= buf_count;

    if ((uint64)effective_buf_width * (uint64)effective_buf_height != effective_buf_size)
    {
        CV_Error(CV_StsBadArg, "The memory buffer cannot be allocated since its size exceeds integer fields limit");
    }



    if ( is_buf_16u )
    {
        CV_CALL( buf = cvCreateMat( effective_buf_height, effective_buf_width, CV_16UC1 ));
        CV_CALL( pair16u32s_ptr = (CvPair16u32s*)cvAlloc( sample_count*sizeof(pair16u32s_ptr[0]) ));
    }
    else
    {
        CV_CALL( buf = cvCreateMat( effective_buf_height, effective_buf_width, CV_32SC1 ));
        CV_CALL( int_ptr = (int**)cvAlloc( sample_count*sizeof(int_ptr[0]) ));
    }

    size = is_classifier ? (cat_var_count+1) : cat_var_count;
    size = !size ? 1 : size;
    CV_CALL( cat_count = cvCreateMat( 1, size, CV_32SC1 ));
    CV_CALL( cat_ofs = cvCreateMat( 1, size, CV_32SC1 ));

    size = is_classifier ? (cat_var_count + 1)*params.max_categories : cat_var_count*params.max_categories;
    size = !size ? 1 : size;
    CV_CALL( cat_map = cvCreateMat( 1, size, CV_32SC1 ));

    // now calculate the maximum size of split,
    // create memory storage that will keep nodes and splits of the decision tree
    // allocate root node and the buffer for the whole training data
    max_split_size = cvAlign(sizeof(CvDTreeSplit) +
        (MAX(0,sample_count - 33)/32)*sizeof(int),sizeof(void*));
    tree_block_size = MAX((int)sizeof(CvDTreeNode)*8, max_split_size);
    tree_block_size = MAX(tree_block_size + block_size_delta, min_block_size);
    CV_CALL( tree_storage = cvCreateMemStorage( tree_block_size ));
    CV_CALL( node_heap = cvCreateSet( 0, sizeof(*node_heap), sizeof(CvDTreeNode), tree_storage ));

    nv_size = var_count*sizeof(int);
    nv_size = cvAlign(MAX( nv_size, (int)sizeof(CvSetElem) ), sizeof(void*));

    temp_block_size = nv_size;

    if( cv_n )
    {
        if( sample_count < cv_n*MAX(params.min_sample_count,10) )
            CV_ERROR( CV_StsOutOfRange,
                "The many folds in cross-validation for such a small dataset" );

        cv_size = cvAlign( cv_n*(sizeof(int) + sizeof(double)*2), sizeof(double) );
        temp_block_size = MAX(temp_block_size, cv_size);
    }

    temp_block_size = MAX( temp_block_size + block_size_delta, min_block_size );
    CV_CALL( temp_storage = cvCreateMemStorage( temp_block_size ));
    CV_CALL( nv_heap = cvCreateSet( 0, sizeof(*nv_heap), nv_size, temp_storage ));
    if( cv_size )
        CV_CALL( cv_heap = cvCreateSet( 0, sizeof(*cv_heap), cv_size, temp_storage ));

    CV_CALL( data_root = new_node( 0, sample_count, 0, 0 ));

    max_c_count = 1;

    _fdst = 0;
    _idst = 0;
    if (ord_var_count)
        _fdst = (float*)cvAlloc(sample_count*sizeof(_fdst[0]));
    if (is_buf_16u && (cat_var_count || is_classifier))
        _idst = (int*)cvAlloc(sample_count*sizeof(_idst[0]));

    // transform the training data to convenient representation
    for( vi = 0; vi <= var_count; vi++ )
    {
        int ci;
        const uchar* mask = 0;
        int64 m_step = 0, step;
        const int* idata = 0;
        const float* fdata = 0;
        int num_valid = 0;

        if( vi < var_count ) // analyze i-th input variable
        {
            int vi0 = vidx ? vidx[vi] : vi;
            ci = get_var_type(vi);
            step = ds_step; m_step = ms_step;
            if( CV_MAT_TYPE(_train_data->type) == CV_32SC1 )
                idata = _train_data->data.i + vi0*dv_step;
            else
                fdata = _train_data->data.fl + vi0*dv_step;
            if( _missing_mask )
                mask = _missing_mask->data.ptr + vi0*mv_step;
        }
        else // analyze _responses
        {
            ci = cat_var_count;
            step = CV_IS_MAT_CONT(_responses->type) ?
                1 : _responses->step / CV_ELEM_SIZE(_responses->type);
            if( CV_MAT_TYPE(_responses->type) == CV_32SC1 )
                idata = _responses->data.i;
            else
                fdata = _responses->data.fl;
        }

        if( (vi < var_count && ci>=0) ||
            (vi == var_count && is_classifier) ) // process categorical variable or response
        {
            int c_count, prev_label;
            int* c_map;

            if (is_buf_16u)
                udst = (unsigned short*)(buf->data.s + (size_t)vi*sample_count);
            else
                idst = buf->data.i + (size_t)vi*sample_count;

            // copy data
            for( i = 0; i < sample_count; i++ )
            {
                int val = INT_MAX, si = sidx ? sidx[i] : i;
                if( !mask || !mask[(size_t)si*m_step] )
                {
                    if( idata )
                        val = idata[(size_t)si*step];
                    else
                    {
                        float t = fdata[(size_t)si*step];
                        val = cvRound(t);
                        if( fabs(t - val) > FLT_EPSILON )
                        {
                            sprintf( err, "%d-th value of %d-th (categorical) "
                                "variable is not an integer", i, vi );
                            CV_ERROR( CV_StsBadArg, err );
                        }
                    }

                    if( val == INT_MAX )
                    {
                        sprintf( err, "%d-th value of %d-th (categorical) "
                            "variable is too large", i, vi );
                        CV_ERROR( CV_StsBadArg, err );
                    }
                    num_valid++;
                }
                if (is_buf_16u)
                {
                    _idst[i] = val;
                    pair16u32s_ptr[i].u = udst + i;
                    pair16u32s_ptr[i].i = _idst + i;
                }
                else
                {
                    idst[i] = val;
                    int_ptr[i] = idst + i;
                }
            }

            c_count = num_valid > 0;
            if (is_buf_16u)
            {
                std::sort(pair16u32s_ptr, pair16u32s_ptr + sample_count, LessThanPairs());
                // count the categories
                for( i = 1; i < num_valid; i++ )
                    if (*pair16u32s_ptr[i].i != *pair16u32s_ptr[i-1].i)
                        c_count ++ ;
            }
            else
            {
                std::sort(int_ptr, int_ptr + sample_count, LessThanPtr<int>());
                // count the categories
                for( i = 1; i < num_valid; i++ )
                    c_count += *int_ptr[i] != *int_ptr[i-1];
            }

            if( vi > 0 )
                max_c_count = MAX( max_c_count, c_count );
            cat_count->data.i[ci] = c_count;
            cat_ofs->data.i[ci] = total_c_count;

            // resize cat_map, if need
            if( cat_map->cols < total_c_count + c_count )
            {
                tmp_map = cat_map;
                CV_CALL( cat_map = cvCreateMat( 1,
                    MAX(cat_map->cols*3/2,total_c_count+c_count), CV_32SC1 ));
                for( i = 0; i < total_c_count; i++ )
                    cat_map->data.i[i] = tmp_map->data.i[i];
                cvReleaseMat( &tmp_map );
            }

            c_map = cat_map->data.i + total_c_count;
            total_c_count += c_count;

            c_count = -1;
            if (is_buf_16u)
            {
                // compact the class indices and build the map
                prev_label = ~*pair16u32s_ptr[0].i;
                for( i = 0; i < num_valid; i++ )
                {
                    int cur_label = *pair16u32s_ptr[i].i;
                    if( cur_label != prev_label )
                        c_map[++c_count] = prev_label = cur_label;
                    *pair16u32s_ptr[i].u = (unsigned short)c_count;
                }
                // replace labels for missing values with -1
                for( ; i < sample_count; i++ )
                    *pair16u32s_ptr[i].u = 65535;
            }
            else
            {
                // compact the class indices and build the map
                prev_label = ~*int_ptr[0];
                for( i = 0; i < num_valid; i++ )
                {
                    int cur_label = *int_ptr[i];
                    if( cur_label != prev_label )
                        c_map[++c_count] = prev_label = cur_label;
                    *int_ptr[i] = c_count;
                }
                // replace labels for missing values with -1
                for( ; i < sample_count; i++ )
                    *int_ptr[i] = -1;
            }
        }
        else if( ci < 0 ) // process ordered variable
        {
            if (is_buf_16u)
                udst = (unsigned short*)(buf->data.s + (size_t)vi*sample_count);
            else
                idst = buf->data.i + (size_t)vi*sample_count;

            for( i = 0; i < sample_count; i++ )
            {
                float val = ord_nan;
                int si = sidx ? sidx[i] : i;
                if( !mask || !mask[(size_t)si*m_step] )
                {
                    if( idata )
                        val = (float)idata[(size_t)si*step];
                    else
                        val = fdata[(size_t)si*step];

                    if( fabs(val) >= ord_nan )
                    {
                        sprintf( err, "%d-th value of %d-th (ordered) "
                            "variable (=%g) is too large", i, vi, val );
                        CV_ERROR( CV_StsBadArg, err );
                    }
                    num_valid++;
                }

                if (is_buf_16u)
                    udst[i] = (unsigned short)i; // TODO: memory corruption may be here
                else
                    idst[i] = i;
                _fdst[i] = val;

            }
            if (is_buf_16u)
                std::sort(udst, udst + sample_count, LessThanIdx<float, unsigned short>(_fdst));
            else
                std::sort(idst, idst + sample_count, LessThanIdx<float, int>(_fdst));
        }

        if( vi < var_count )
            data_root->set_num_valid(vi, num_valid);
    }

    // set sample labels
    if (is_buf_16u)
        udst = (unsigned short*)(buf->data.s + (size_t)work_var_count*sample_count);
    else
        idst = buf->data.i + (size_t)work_var_count*sample_count;

    for (i = 0; i < sample_count; i++)
    {
        if (udst)
            udst[i] = sidx ? (unsigned short)sidx[i] : (unsigned short)i;
        else
            idst[i] = sidx ? sidx[i] : i;
    }

    if( cv_n )
    {
        unsigned short* usdst = 0;
        int* idst2 = 0;

        if (is_buf_16u)
        {
            usdst = (unsigned short*)(buf->data.s + (size_t)(get_work_var_count()-1)*sample_count);
            for( i = vi = 0; i < sample_count; i++ )
            {
                usdst[i] = (unsigned short)vi++;
                vi &= vi < cv_n ? -1 : 0;
            }

            for( i = 0; i < sample_count; i++ )
            {
                int a = (*rng)(sample_count);
                int b = (*rng)(sample_count);
                unsigned short unsh = (unsigned short)vi;
                CV_SWAP( usdst[a], usdst[b], unsh );
            }
        }
        else
        {
            idst2 = buf->data.i + (size_t)(get_work_var_count()-1)*sample_count;
            for( i = vi = 0; i < sample_count; i++ )
            {
                idst2[i] = vi++;
                vi &= vi < cv_n ? -1 : 0;
            }

            for( i = 0; i < sample_count; i++ )
            {
                int a = (*rng)(sample_count);
                int b = (*rng)(sample_count);
                CV_SWAP( idst2[a], idst2[b], vi );
            }
        }
    }

    if ( cat_map )
        cat_map->cols = MAX( total_c_count, 1 );

    max_split_size = cvAlign(sizeof(CvDTreeSplit) +
        (MAX(0,max_c_count - 33)/32)*sizeof(int),sizeof(void*));
    CV_CALL( split_heap = cvCreateSet( 0, sizeof(*split_heap), max_split_size, tree_storage ));

    have_priors = is_classifier && params.priors;
    if( is_classifier )
    {
        int m = get_num_classes();
        double sum = 0;
        CV_CALL( priors = cvCreateMat( 1, m, CV_64F ));
        for( i = 0; i < m; i++ )
        {
            double val = have_priors ? params.priors[i] : 1.;
            if( val <= 0 )
                CV_ERROR( CV_StsOutOfRange, "Every class weight should be positive" );
            priors->data.db[i] = val;
            sum += val;
        }

        // normalize weights
        if( have_priors )
            cvScale( priors, priors, 1./sum );

        CV_CALL( priors_mult = cvCloneMat( priors ));
        CV_CALL( counts = cvCreateMat( 1, m, CV_32SC1 ));
    }


    CV_CALL( direction = cvCreateMat( 1, sample_count, CV_8UC1 ));
    CV_CALL( split_buf = cvCreateMat( 1, sample_count, CV_32SC1 ));

    __END__;

    if( data )
        delete data;

    if (_fdst)
        cvFree( &_fdst );
    if (_idst)
        cvFree( &_idst );
    cvFree( &int_ptr );
    cvFree( &pair16u32s_ptr);
    cvReleaseMat( &var_type0 );
    cvReleaseMat( &sample_indices );
    cvReleaseMat( &tmp_map );
}