dlib C++ Library - random_forest

// Copyright (C) 2018  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_RANdOM_FOREST_REGRESSION_H_
#define DLIB_RANdOM_FOREST_REGRESSION_H_

#include "random_forest_regression_abstract.h"
#include <vector>
#include "../matrix.h"
#include <algorithm>
#include "../threads.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    class dense_feature_extractor
    {

    public:
        typedef uint32_t feature;
        typedef matrix<double,0,1> sample_type;

        dense_feature_extractor(
        ) = default;

        void setup (
            const std::vector<sample_type>& x,
            const std::vector<double>& y 
        ) 
        {
            DLIB_CASSERT(x.size() > 0);
            DLIB_CASSERT(x.size() == y.size());
            for (auto& el : x)
                DLIB_CASSERT(el.size() == x[0].size(), "All the vectors in a training set have to have the same dimensionality.");

            DLIB_CASSERT(x[0].size() != 0, "The vectors can't be empty.");

            num_feats = x[0].size();
        }


        void get_random_features (
            dlib::rand& rnd,
            size_t num,
            std::vector<feature>& feats
        ) const
        {
            DLIB_ASSERT(max_num_feats() != 0);
            num = std::min(num, num_feats);

            feats.clear();
            for (size_t i = 0; i < num_feats; ++i)
                feats.push_back(i);

            // now pick num features at random
            for (size_t i = 0; i < num; ++i)
            {
                auto idx = rnd.get_integer_in_range(i,num_feats);
                std::swap(feats[i], feats[idx]);
            }
            feats.resize(num);
        }

        double extract_feature_value (
            const sample_type& item,
            const feature& f
        ) const
        {
            DLIB_ASSERT(max_num_feats() != 0);
            return item(f);
        }

        size_t max_num_feats (
        ) const
        {
            return num_feats;
        }

        friend void serialize(const dense_feature_extractor& item, std::ostream& out)
        {
            serialize("dense_feature_extractor", out);
            serialize(item.num_feats, out);
        }

        friend void deserialize(dense_feature_extractor& item, std::istream& in)
        {
            check_serialized_version("dense_feature_extractor", in);
            deserialize(item.num_feats, in);
        }

    private:
        size_t num_feats = 0;
    };

// ----------------------------------------------------------------------------------------

    template <
        typename feature_extractor
        >
    struct internal_tree_node
    {
        uint32_t left;
        uint32_t right;
        float split_threshold;
        typename feature_extractor::feature split_feature;
    };

    template <typename feature_extractor>
    void serialize(const internal_tree_node<feature_extractor>& item, std::ostream& out)
    {
        serialize(item.left, out);
        serialize(item.right, out);
        serialize(item.split_threshold, out);
        serialize(item.split_feature, out);
    }

    template <typename feature_extractor>
    void deserialize(internal_tree_node<feature_extractor>& item, std::istream& in)
    {
        deserialize(item.left, in);
        deserialize(item.right, in);
        deserialize(item.split_threshold, in);
        deserialize(item.split_feature, in);
    }

// ----------------------------------------------------------------------------------------

    template <
        typename feature_extractor = dense_feature_extractor
        >
    class random_forest_regression_function
    {

    public:

        typedef feature_extractor feature_extractor_type;
        typedef typename feature_extractor::sample_type sample_type;

        random_forest_regression_function(
        ) = default;

        random_forest_regression_function (
            feature_extractor_type&& fe_,
            std::vector<std::vector<internal_tree_node<feature_extractor>>>&& trees_,
            std::vector<std::vector<float>>&& leaves_
        ) :
            fe(std::move(fe_)),
            trees(std::move(trees_)),
            leaves(std::move(leaves_))
        {
            DLIB_ASSERT(trees.size() > 0);
            DLIB_ASSERT(trees.size() == leaves.size(), "Every set of tree nodes has to have leaves");
#ifdef ENABLE_ASSERTS
            for (size_t i = 0; i < trees.size(); ++i)
            {
                DLIB_ASSERT(trees[i].size() > 0, "A tree can't have 0 leaves.");
                for (auto& node : trees[i])
                {
                    DLIB_ASSERT(trees[i].size()+leaves[i].size() > node.left, "left node index in tree is too big. There is no associated tree node or leaf.");
                    DLIB_ASSERT(trees[i].size()+leaves[i].size() > node.right, "right node index in tree is too big. There is no associated tree node or leaf.");
                }
            }
#endif
        }

        size_t get_num_trees(
        ) const
        {
            return trees.size();
        }

        const std::vector<std::vector<internal_tree_node<feature_extractor>>>& get_internal_tree_nodes (
        ) const { return trees; }

        const std::vector<std::vector<float>>& get_tree_leaves (
        ) const { return leaves; }

        const feature_extractor_type& get_feature_extractor (
        ) const { return fe; }

        double operator() (
            const sample_type& x
        ) const
        {
            DLIB_ASSERT(get_num_trees() > 0);

            double accum = 0;

            for (size_t i = 0; i < trees.size(); ++i)
            {
                auto& tree = trees[i];
                // walk the tree to the leaf
                uint32_t idx = 0;
                while(idx < tree.size())
                {
                    auto feature_value = fe.extract_feature_value(x, tree[idx].split_feature);
                    if (feature_value < tree[idx].split_threshold)
                        idx = tree[idx].left;
                    else
                        idx = tree[idx].right;
                }
                // compute leaf index 
                accum += leaves[i][idx-tree.size()];
            }

            return accum/trees.size();
        }

        friend void serialize(const random_forest_regression_function& item, std::ostream& out)
        {
            serialize("random_forest_regression_function", out);
            serialize(item.fe, out);
            serialize(item.trees, out);
            serialize(item.leaves, out);
        }

        friend void deserialize(random_forest_regression_function& item, std::istream& in)
        {
            check_serialized_version("random_forest_regression_function", in);
            deserialize(item.fe, in);
            deserialize(item.trees, in);
            deserialize(item.leaves, in);
        }

    private:

        /*!
            CONVENTION
                - trees.size() == leaves.size()
                - Any .left or .right index in trees that is larger than the number of
                  nodes in the tree references a leaf. Moreover, the index of the leaf is
                  computed by subtracting the number of nodes in the tree.
        !*/

        feature_extractor_type fe;

        // internal nodes of trees
        std::vector<std::vector<internal_tree_node<feature_extractor>>> trees;
        // leaves of trees
        std::vector<std::vector<float>> leaves;

    };

// ----------------------------------------------------------------------------------------

    template <
        typename feature_extractor = dense_feature_extractor
        >
    class random_forest_regression_trainer
    {
    public:
        typedef feature_extractor feature_extractor_type;
        typedef random_forest_regression_function<feature_extractor> trained_function_type;
        typedef typename feature_extractor::sample_type sample_type;


        random_forest_regression_trainer (
        ) = default;

        const feature_extractor_type& get_feature_extractor (
        ) const
        {
            return fe_;
        }

        void set_feature_extractor (
            const feature_extractor_type& feat_extractor
        )
        {
            fe_ = feat_extractor;
        }

        void set_seed (
            const std::string& seed
        )
        {
            random_seed = seed;
        }

        const std::string& get_random_seed (
        ) const
        {
            return random_seed;
        }

        size_t get_num_trees (
        ) const
        {
            return num_trees;
        }

        void set_num_trees (
            size_t num
        )
        {
            DLIB_CASSERT(num > 0);
            num_trees = num;
        }

        void set_feature_subsampling_fraction (
            double frac
        )
        {
            DLIB_CASSERT(0 < frac && frac <= 1);
            feature_subsampling_frac = frac;
        }

        double get_feature_subsampling_frac(
        ) const
        {
            return feature_subsampling_frac;
        }

        void set_min_samples_per_leaf (
            size_t num
        )
        {
            DLIB_ASSERT(num > 0);
            min_samples_per_leaf = num;
        }

        size_t get_min_samples_per_leaf(
        ) const
        {
            return min_samples_per_leaf;
        }

        void be_verbose (
        )
        {
            verbose = true;
        }

        void be_quiet (
        )
        {
            verbose = false;
        }

        trained_function_type train (
            const std::vector<sample_type>& x,
            const std::vector<double>& y 
        ) const
        {
            std::vector<double> junk; 
            return do_train(x,y,junk,false);
        }

        trained_function_type train (
            const std::vector<sample_type>& x,
            const std::vector<double>& y,
            std::vector<double>& oob_values 
        ) const
        {
            return do_train(x,y,oob_values,true);
        }

    private:

        trained_function_type do_train (
            const std::vector<sample_type>& x,
            const std::vector<double>& y,
            std::vector<double>& oob_values, 
            bool compute_oob_values
        ) const 
        {
            DLIB_CASSERT(x.size() == y.size());
            DLIB_CASSERT(x.size() > 0);

            feature_extractor_type fe = fe_;
            fe.setup(x,y);

            DLIB_CASSERT(fe.max_num_feats() != 0);

            std::vector<std::vector<internal_tree_node<feature_extractor>>> all_trees(num_trees);
            std::vector<std::vector<float>> all_leaves(num_trees);

            const size_t feats_per_node = std::max(1.0,std::round(fe.max_num_feats()*feature_subsampling_frac));

            // Each tree couldn't have more than this many interior nodes.  It might
            // end up having less though. We need to know this value because the way
            // we mark a left or right pointer in a tree as pointing to a leaf is by
            // making its index larger than the number of interior nodes in the tree.
            // But we don't know the tree's size before we finish building it.  So we
            // will use max_num_nodes as a proxy during tree construction and then go
            // back and fix it once a tree's size is known.
            const uint32_t max_num_nodes = y.size(); 

            std::vector<uint32_t> oob_hits;

            if (compute_oob_values)
            {
                oob_values.resize(y.size());
                oob_hits.resize(y.size());
            }


            std::mutex m;

            // Calling build_tree(i) creates the ith tree and stores the results in
            // all_trees and all_leaves.
            auto build_tree = [&](long i)
            {
                dlib::rand rnd(random_seed + std::to_string(i));
                auto& tree = all_trees[i];
                auto& leaves = all_leaves[i];

                // Check if there are fewer than min_samples_per_leaf and if so then
                // don't make any tree.  Just average the things and be done. 
                if (y.size() <= min_samples_per_leaf)
                {
                    leaves.push_back(mean(mat(y)));
                    return;
                }


                double sumy = 0;
                // pick a random bootstrap of the data.
                std::vector<std::pair<float,uint32_t>> idxs(y.size());
                for (auto& idx : idxs) {
                    idx = std::make_pair(0.0f, static_cast<uint32_t>(rnd.get_integer(y.size())));
                    sumy += y[idx.second];
                }

                // We are going to use ranges_to_process as a stack that tracks which
                // range of samples we are going to split next.
                std::vector<range_t> ranges_to_process;
                // start with the root of the tree, i.e. the entire range of training
                // samples.
                ranges_to_process.emplace_back(sumy, 0, static_cast<uint32_t>(y.size()));
                // push an unpopulated root node into the tree.  We will populate it
                // when we process its corresponding range. 
                tree.emplace_back();

                std::vector<typename feature_extractor::feature> feats;

                while(ranges_to_process.size() > 0)
                {
                    // Grab the next range/node to process.
                    const auto range = ranges_to_process.back();
                    ranges_to_process.pop_back();


                    // Get the split features we will consider at this node.
                    fe.get_random_features(rnd, feats_per_node, feats);
                    // Then find the best split
                    auto best_split = find_best_split_among_feats(fe, range, feats, x, y, idxs); 

                    range_t left_split(best_split.left_sum, range.begin, best_split.split_idx);
                    range_t right_split(best_split.right_sum, best_split.split_idx, range.end);

                    DLIB_ASSERT(left_split.begin < left_split.end);
                    DLIB_ASSERT(right_split.begin < right_split.end);

                    // Now that we know the split we can populate the parent node we popped
                    // from ranges_to_process.
                    tree[range.tree_idx].split_threshold = best_split.split_threshold; 
                    tree[range.tree_idx].split_feature = best_split.split_feature; 

                    // If the left split is big enough to make a new interior leaf
                    // node. We also stop splitting if all the samples went into this node.
                    // This could happen if the features are all uniform so there just
                    // isn't any way to split them anymore.
                    if (left_split.size() > min_samples_per_leaf && right_split.size() != 0)
                    {
                        // allocate an interior leaf node for it.
                        left_split.tree_idx = tree.size();
                        tree.emplace_back(); 
                        // set the pointer in the parent node to the newly allocated
                        // node.
                        tree[range.tree_idx].left  = left_split.tree_idx;

                        ranges_to_process.emplace_back(left_split);
                    }
                    else
                    {
                        // Add to leaves.  Don't forget to set the pointer in the
                        // parent node to the newly allocated leaf node.
                        tree[range.tree_idx].left = leaves.size() + max_num_nodes;
                        leaves.emplace_back(static_cast<float>(left_split.avg()));
                    }


                    // If the right split is big enough to make a new interior leaf
                    // node. We also stop splitting if all the samples went into this node.
                    // This could happen if the features are all uniform so there just
                    // isn't any way to split them anymore.
                    if (right_split.size() > min_samples_per_leaf && left_split.size() != 0)
                    {
                        // allocate an interior leaf node for it.
                        right_split.tree_idx = tree.size();
                        tree.emplace_back(); 
                        // set the pointer in the parent node to the newly allocated
                        // node.
                        tree[range.tree_idx].right  = right_split.tree_idx;

                        ranges_to_process.emplace_back(right_split);
                    }
                    else
                    {
                        // Add to leaves.  Don't forget to set the pointer in the
                        // parent node to the newly allocated leaf node.
                        tree[range.tree_idx].right = leaves.size() + max_num_nodes;
                        leaves.emplace_back(static_cast<float>(right_split.avg()));
                    }
                } // end while (still building tree)

                // Fix the leaf pointers in the tree now that we know the correct
                // tree.size() value.
                DLIB_CASSERT(max_num_nodes >= tree.size()); 
                const auto offset = max_num_nodes - tree.size();
                for (auto& n : tree)
                {
                    if (n.left >= max_num_nodes)
                        n.left -= offset;
                    if (n.right >= max_num_nodes)
                        n.right -= offset;
                }
                

                if (compute_oob_values)
                {
                    std::sort(idxs.begin(), idxs.end(), 
                        [](const std::pair<float,uint32_t>& a, const std::pair<float,uint32_t>& b) {return a.second<b.second; });

                    std::lock_guard<std::mutex> lock(m);

                    size_t j = 0;
                    for (size_t i = 0; i < oob_values.size(); ++i)
                    {
                        // check if i is in idxs
                        while(j < idxs.size() && i > idxs[j].second)
                            ++j;

                        // i isn't in idxs so it's an oob sample and we should process it.
                        if (j == idxs.size() || idxs[j].second != i)
                        {
                            oob_hits[i]++;

                            // walk the tree to find the leaf value for this oob sample
                            uint32_t idx = 0;
                            while(idx < tree.size())
                            {
                                auto feature_value = fe.extract_feature_value(x[i], tree[idx].split_feature);
                                if (feature_value < tree[idx].split_threshold)
                                    idx = tree[idx].left;
                                else
                                    idx = tree[idx].right;
                            }
                            oob_values[i] += leaves[idx-tree.size()];
                        }
                    }
                }
            };

            if (verbose)
                parallel_for_verbose(0, num_trees, build_tree);
            else
                parallel_for(0, num_trees, build_tree);


            if (compute_oob_values)
            {
                double meanval = 0;
                double cnt = 0;
                for (size_t i = 0; i < oob_values.size(); ++i)
                {
                    if (oob_hits[i] != 0)
                    {
                        oob_values[i] /= oob_hits[i];
                        meanval += oob_values[i];
                        ++cnt;
                    }
                }

                // If there are some elements that didn't get hits, we set their oob values
                // to the mean oob value.
                if (cnt != 0)
                {
                    const double typical_value = meanval/cnt;
                    for (size_t i = 0; i < oob_values.size(); ++i)
                    {
                        if (oob_hits[i] == 0)
                            oob_values[i] = typical_value;
                    }
                }
            }

            return trained_function_type(std::move(fe), std::move(all_trees), std::move(all_leaves));
        }

        struct range_t 
        {
            range_t(
                double sumy,
                uint32_t begin,
                uint32_t end
            ) : sumy(sumy), begin(begin), end(end), tree_idx(0) {}

            double sumy;
            uint32_t begin;
            uint32_t end;

            // Every range object corresponds to an entry in a tree. This tells you the
            // tree node that owns the range.
            uint32_t tree_idx; 

            uint32_t size() const { return end-begin; }
            double avg() const { return sumy/size(); }
        };

        struct best_split_details
        {
            double score = -std::numeric_limits<double>::infinity();
            double left_sum;
            double right_sum;
            uint32_t split_idx;
            double split_threshold;
            typename feature_extractor::feature split_feature;

            bool operator < (const best_split_details& rhs) const
            {
                return score < rhs.score;
            }
        };

        static best_split_details find_best_split (
            const range_t& range,
            const std::vector<double>& y,
            const std::vector<std::pair<float,uint32_t>>& idxs
        )
        /*!
            requires
                - max(mat(idxs)) < y.size()
                - range.sumy == sum of y[idxs[j].second] for all valid j in range [range.begin, range.end). 
            ensures
                - finds a threshold T such that there exists an i satisfying the following:
                    - y[idxs[j].second] < T for all j <= i
                    - y[idxs[j].second] > T for all j > i
                  Therefore, the threshold T partitions the contents of y into two groups,
                  relative to the ordering established by idxs.  Moreover the partitioning
                  of y values into two groups has the additional requirement that it is
                  optimal in the sense that the sum of the squared deviations from each
                  partition's mean is minimized.
        !*/
        {

            size_t best_i = range.begin;
            double best_score = -1;
            double left_sum = 0;
            double best_left_sum = y[idxs[range.begin].second];
            const auto size = range.size();
            size_t left_size = 0;
            for (size_t i = range.begin; i+1 < range.end; ++i)
            {
                ++left_size;
                left_sum += y[idxs[i].second];

                // Don't split here because the next element has the same feature value so
                // we can't *really* split here.
                if (idxs[i].first==idxs[i+1].first)
                    continue;

                const double right_sum = range.sumy-left_sum;

                const double score = left_sum*left_sum/left_size + right_sum*right_sum/(size-left_size);

                if (score > best_score)
                {
                    best_score = score;
                    best_i = i;
                    best_left_sum = left_sum;
                }
            }

            best_split_details result;
            result.score = best_score;
            result.left_sum = best_left_sum;
            result.right_sum = range.sumy-best_left_sum;
            result.split_idx = best_i+1; // one past the end of the left range
            result.split_threshold = (idxs[best_i].first+idxs[best_i+1].first)/2;

            return result;
        }


        static best_split_details find_best_split_among_feats(
            const feature_extractor& fe,
            const range_t& range, 
            const std::vector<typename feature_extractor::feature>& feats, 
            const std::vector<sample_type>& x,
            const std::vector<double>& y,
            std::vector<std::pair<float,uint32_t>>& idxs
        )
        {
            auto compare_first = [](const std::pair<float,uint32_t>& a, const std::pair<float,uint32_t>& b) { return a.first<b.first; };
            best_split_details best;
            for (auto& feat : feats)
            {
                // Extract feature values for this feature and sort the indexes based on
                // that feature so we can then find the best split.
                for (auto i = range.begin; i < range.end; ++i)
                    idxs[i].first = fe.extract_feature_value(x[idxs[i].second], feat);

                std::stable_sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first);

                auto split = find_best_split(range, y, idxs);

                if (best < split)
                {
                    best = split;
                    best.split_feature = feat;
                }
            }

            // resort idxs based on winning feat
            for (auto i = range.begin; i < range.end; ++i)
                idxs[i].first = fe.extract_feature_value(x[idxs[i].second], best.split_feature);
            std::stable_sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first);

            return best;
        }

        std::string random_seed;
        size_t num_trees = 1000;
        double feature_subsampling_frac = 1.0/3.0;
        size_t min_samples_per_leaf = 5;
        feature_extractor_type fe_;
        bool verbose = false;
    };

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_RANdOM_FOREST_REGRESSION_H_