// Copyright (C) 2018 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. #ifndef DLIB_RANdOM_FOREST_REGRESSION_H_ #define DLIB_RANdOM_FOREST_REGRESSION_H_ #include "random_forest_regression_abstract.h" #include <vector> #include "../matrix.h" #include <algorithm> #include "../threads.h" namespace dlib { // ---------------------------------------------------------------------------------------- class dense_feature_extractor { public: typedef uint32_t feature; typedef matrix<double,0,1> sample_type; dense_feature_extractor( ) = default; void setup ( const std::vector<sample_type>& x, const std::vector<double>& y ) { DLIB_CASSERT(x.size() > 0); DLIB_CASSERT(x.size() == y.size()); for (auto& el : x) DLIB_CASSERT(el.size() == x[0].size(), "All the vectors in a training set have to have the same dimensionality."); DLIB_CASSERT(x[0].size() != 0, "The vectors can't be empty."); num_feats = x[0].size(); } void get_random_features ( dlib::rand& rnd, size_t num, std::vector<feature>& feats ) const { DLIB_ASSERT(max_num_feats() != 0); num = std::min(num, num_feats); feats.clear(); for (size_t i = 0; i < num_feats; ++i) feats.push_back(i); // now pick num features at random for (size_t i = 0; i < num; ++i) { auto idx = rnd.get_integer_in_range(i,num_feats); std::swap(feats[i], feats[idx]); } feats.resize(num); } double extract_feature_value ( const sample_type& item, const feature& f ) const { DLIB_ASSERT(max_num_feats() != 0); return item(f); } size_t max_num_feats ( ) const { return num_feats; } friend void serialize(const dense_feature_extractor& item, std::ostream& out) { serialize("dense_feature_extractor", out); serialize(item.num_feats, out); } friend void deserialize(dense_feature_extractor& item, std::istream& in) { check_serialized_version("dense_feature_extractor", in); deserialize(item.num_feats, in); } private: size_t num_feats = 0; }; // ---------------------------------------------------------------------------------------- template < typename feature_extractor > struct internal_tree_node { uint32_t left; uint32_t right; float split_threshold; typename feature_extractor::feature split_feature; }; template <typename feature_extractor> void serialize(const internal_tree_node<feature_extractor>& item, std::ostream& out) { serialize(item.left, out); serialize(item.right, out); serialize(item.split_threshold, out); serialize(item.split_feature, out); } template <typename feature_extractor> void deserialize(internal_tree_node<feature_extractor>& item, std::istream& in) { deserialize(item.left, in); deserialize(item.right, in); deserialize(item.split_threshold, in); deserialize(item.split_feature, in); } // ---------------------------------------------------------------------------------------- template < typename feature_extractor = dense_feature_extractor > class random_forest_regression_function { public: typedef feature_extractor feature_extractor_type; typedef typename feature_extractor::sample_type sample_type; random_forest_regression_function( ) = default; random_forest_regression_function ( feature_extractor_type&& fe_, std::vector<std::vector<internal_tree_node<feature_extractor>>>&& trees_, std::vector<std::vector<float>>&& leaves_ ) : fe(std::move(fe_)), trees(std::move(trees_)), leaves(std::move(leaves_)) { DLIB_ASSERT(trees.size() > 0); DLIB_ASSERT(trees.size() == leaves.size(), "Every set of tree nodes has to have leaves"); #ifdef ENABLE_ASSERTS for (size_t i = 0; i < trees.size(); ++i) { DLIB_ASSERT(trees[i].size() > 0, "A tree can't have 0 leaves."); for (auto& node : trees[i]) { DLIB_ASSERT(trees[i].size()+leaves[i].size() > node.left, "left node index in tree is too big. There is no associated tree node or leaf."); DLIB_ASSERT(trees[i].size()+leaves[i].size() > node.right, "right node index in tree is too big. There is no associated tree node or leaf."); } } #endif } size_t get_num_trees( ) const { return trees.size(); } const std::vector<std::vector<internal_tree_node<feature_extractor>>>& get_internal_tree_nodes ( ) const { return trees; } const std::vector<std::vector<float>>& get_tree_leaves ( ) const { return leaves; } const feature_extractor_type& get_feature_extractor ( ) const { return fe; } double operator() ( const sample_type& x ) const { DLIB_ASSERT(get_num_trees() > 0); double accum = 0; for (size_t i = 0; i < trees.size(); ++i) { auto& tree = trees[i]; // walk the tree to the leaf uint32_t idx = 0; while(idx < tree.size()) { auto feature_value = fe.extract_feature_value(x, tree[idx].split_feature); if (feature_value < tree[idx].split_threshold) idx = tree[idx].left; else idx = tree[idx].right; } // compute leaf index accum += leaves[i][idx-tree.size()]; } return accum/trees.size(); } friend void serialize(const random_forest_regression_function& item, std::ostream& out) { serialize("random_forest_regression_function", out); serialize(item.fe, out); serialize(item.trees, out); serialize(item.leaves, out); } friend void deserialize(random_forest_regression_function& item, std::istream& in) { check_serialized_version("random_forest_regression_function", in); deserialize(item.fe, in); deserialize(item.trees, in); deserialize(item.leaves, in); } private: /*! CONVENTION - trees.size() == leaves.size() - Any .left or .right index in trees that is larger than the number of nodes in the tree references a leaf. Moreover, the index of the leaf is computed by subtracting the number of nodes in the tree. !*/ feature_extractor_type fe; // internal nodes of trees std::vector<std::vector<internal_tree_node<feature_extractor>>> trees; // leaves of trees std::vector<std::vector<float>> leaves; }; // ---------------------------------------------------------------------------------------- template < typename feature_extractor = dense_feature_extractor > class random_forest_regression_trainer { public: typedef feature_extractor feature_extractor_type; typedef random_forest_regression_function<feature_extractor> trained_function_type; typedef typename feature_extractor::sample_type sample_type; random_forest_regression_trainer ( ) = default; const feature_extractor_type& get_feature_extractor ( ) const { return fe_; } void set_feature_extractor ( const feature_extractor_type& feat_extractor ) { fe_ = feat_extractor; } void set_seed ( const std::string& seed ) { random_seed = seed; } const std::string& get_random_seed ( ) const { return random_seed; } size_t get_num_trees ( ) const { return num_trees; } void set_num_trees ( size_t num ) { DLIB_CASSERT(num > 0); num_trees = num; } void set_feature_subsampling_fraction ( double frac ) { DLIB_CASSERT(0 < frac && frac <= 1); feature_subsampling_frac = frac; } double get_feature_subsampling_frac( ) const { return feature_subsampling_frac; } void set_min_samples_per_leaf ( size_t num ) { DLIB_ASSERT(num > 0); min_samples_per_leaf = num; } size_t get_min_samples_per_leaf( ) const { return min_samples_per_leaf; } void be_verbose ( ) { verbose = true; } void be_quiet ( ) { verbose = false; } trained_function_type train ( const std::vector<sample_type>& x, const std::vector<double>& y ) const { std::vector<double> junk; return do_train(x,y,junk,false); } trained_function_type train ( const std::vector<sample_type>& x, const std::vector<double>& y, std::vector<double>& oob_values ) const { return do_train(x,y,oob_values,true); } private: trained_function_type do_train ( const std::vector<sample_type>& x, const std::vector<double>& y, std::vector<double>& oob_values, bool compute_oob_values ) const { DLIB_CASSERT(x.size() == y.size()); DLIB_CASSERT(x.size() > 0); feature_extractor_type fe = fe_; fe.setup(x,y); DLIB_CASSERT(fe.max_num_feats() != 0); std::vector<std::vector<internal_tree_node<feature_extractor>>> all_trees(num_trees); std::vector<std::vector<float>> all_leaves(num_trees); const size_t feats_per_node = std::max(1.0,std::round(fe.max_num_feats()*feature_subsampling_frac)); // Each tree couldn't have more than this many interior nodes. It might // end up having less though. We need to know this value because the way // we mark a left or right pointer in a tree as pointing to a leaf is by // making its index larger than the number of interior nodes in the tree. // But we don't know the tree's size before we finish building it. So we // will use max_num_nodes as a proxy during tree construction and then go // back and fix it once a tree's size is known. const uint32_t max_num_nodes = y.size(); std::vector<uint32_t> oob_hits; if (compute_oob_values) { oob_values.resize(y.size()); oob_hits.resize(y.size()); } std::mutex m; // Calling build_tree(i) creates the ith tree and stores the results in // all_trees and all_leaves. auto build_tree = [&](long i) { dlib::rand rnd(random_seed + std::to_string(i)); auto& tree = all_trees[i]; auto& leaves = all_leaves[i]; // Check if there are fewer than min_samples_per_leaf and if so then // don't make any tree. Just average the things and be done. if (y.size() <= min_samples_per_leaf) { leaves.push_back(mean(mat(y))); return; } double sumy = 0; // pick a random bootstrap of the data. std::vector<std::pair<float,uint32_t>> idxs(y.size()); for (auto& idx : idxs) { idx = std::make_pair(0.0f, static_cast<uint32_t>(rnd.get_integer(y.size()))); sumy += y[idx.second]; } // We are going to use ranges_to_process as a stack that tracks which // range of samples we are going to split next. std::vector<range_t> ranges_to_process; // start with the root of the tree, i.e. the entire range of training // samples. ranges_to_process.emplace_back(sumy, 0, static_cast<uint32_t>(y.size())); // push an unpopulated root node into the tree. We will populate it // when we process its corresponding range. tree.emplace_back(); std::vector<typename feature_extractor::feature> feats; while(ranges_to_process.size() > 0) { // Grab the next range/node to process. const auto range = ranges_to_process.back(); ranges_to_process.pop_back(); // Get the split features we will consider at this node. fe.get_random_features(rnd, feats_per_node, feats); // Then find the best split auto best_split = find_best_split_among_feats(fe, range, feats, x, y, idxs); range_t left_split(best_split.left_sum, range.begin, best_split.split_idx); range_t right_split(best_split.right_sum, best_split.split_idx, range.end); DLIB_ASSERT(left_split.begin < left_split.end); DLIB_ASSERT(right_split.begin < right_split.end); // Now that we know the split we can populate the parent node we popped // from ranges_to_process. tree[range.tree_idx].split_threshold = best_split.split_threshold; tree[range.tree_idx].split_feature = best_split.split_feature; // If the left split is big enough to make a new interior leaf // node. We also stop splitting if all the samples went into this node. // This could happen if the features are all uniform so there just // isn't any way to split them anymore. if (left_split.size() > min_samples_per_leaf && right_split.size() != 0) { // allocate an interior leaf node for it. left_split.tree_idx = tree.size(); tree.emplace_back(); // set the pointer in the parent node to the newly allocated // node. tree[range.tree_idx].left = left_split.tree_idx; ranges_to_process.emplace_back(left_split); } else { // Add to leaves. Don't forget to set the pointer in the // parent node to the newly allocated leaf node. tree[range.tree_idx].left = leaves.size() + max_num_nodes; leaves.emplace_back(static_cast<float>(left_split.avg())); } // If the right split is big enough to make a new interior leaf // node. We also stop splitting if all the samples went into this node. // This could happen if the features are all uniform so there just // isn't any way to split them anymore. if (right_split.size() > min_samples_per_leaf && left_split.size() != 0) { // allocate an interior leaf node for it. right_split.tree_idx = tree.size(); tree.emplace_back(); // set the pointer in the parent node to the newly allocated // node. tree[range.tree_idx].right = right_split.tree_idx; ranges_to_process.emplace_back(right_split); } else { // Add to leaves. Don't forget to set the pointer in the // parent node to the newly allocated leaf node. tree[range.tree_idx].right = leaves.size() + max_num_nodes; leaves.emplace_back(static_cast<float>(right_split.avg())); } } // end while (still building tree) // Fix the leaf pointers in the tree now that we know the correct // tree.size() value. DLIB_CASSERT(max_num_nodes >= tree.size()); const auto offset = max_num_nodes - tree.size(); for (auto& n : tree) { if (n.left >= max_num_nodes) n.left -= offset; if (n.right >= max_num_nodes) n.right -= offset; } if (compute_oob_values) { std::sort(idxs.begin(), idxs.end(), [](const std::pair<float,uint32_t>& a, const std::pair<float,uint32_t>& b) {return a.second<b.second; }); std::lock_guard<std::mutex> lock(m); size_t j = 0; for (size_t i = 0; i < oob_values.size(); ++i) { // check if i is in idxs while(j < idxs.size() && i > idxs[j].second) ++j; // i isn't in idxs so it's an oob sample and we should process it. if (j == idxs.size() || idxs[j].second != i) { oob_hits[i]++; // walk the tree to find the leaf value for this oob sample uint32_t idx = 0; while(idx < tree.size()) { auto feature_value = fe.extract_feature_value(x[i], tree[idx].split_feature); if (feature_value < tree[idx].split_threshold) idx = tree[idx].left; else idx = tree[idx].right; } oob_values[i] += leaves[idx-tree.size()]; } } } }; if (verbose) parallel_for_verbose(0, num_trees, build_tree); else parallel_for(0, num_trees, build_tree); if (compute_oob_values) { double meanval = 0; double cnt = 0; for (size_t i = 0; i < oob_values.size(); ++i) { if (oob_hits[i] != 0) { oob_values[i] /= oob_hits[i]; meanval += oob_values[i]; ++cnt; } } // If there are some elements that didn't get hits, we set their oob values // to the mean oob value. if (cnt != 0) { const double typical_value = meanval/cnt; for (size_t i = 0; i < oob_values.size(); ++i) { if (oob_hits[i] == 0) oob_values[i] = typical_value; } } } return trained_function_type(std::move(fe), std::move(all_trees), std::move(all_leaves)); } struct range_t { range_t( double sumy, uint32_t begin, uint32_t end ) : sumy(sumy), begin(begin), end(end), tree_idx(0) {} double sumy; uint32_t begin; uint32_t end; // Every range object corresponds to an entry in a tree. This tells you the // tree node that owns the range. uint32_t tree_idx; uint32_t size() const { return end-begin; } double avg() const { return sumy/size(); } }; struct best_split_details { double score = -std::numeric_limits<double>::infinity(); double left_sum; double right_sum; uint32_t split_idx; double split_threshold; typename feature_extractor::feature split_feature; bool operator < (const best_split_details& rhs) const { return score < rhs.score; } }; static best_split_details find_best_split ( const range_t& range, const std::vector<double>& y, const std::vector<std::pair<float,uint32_t>>& idxs ) /*! requires - max(mat(idxs)) < y.size() - range.sumy == sum of y[idxs[j].second] for all valid j in range [range.begin, range.end). ensures - finds a threshold T such that there exists an i satisfying the following: - y[idxs[j].second] < T for all j <= i - y[idxs[j].second] > T for all j > i Therefore, the threshold T partitions the contents of y into two groups, relative to the ordering established by idxs. Moreover the partitioning of y values into two groups has the additional requirement that it is optimal in the sense that the sum of the squared deviations from each partition's mean is minimized. !*/ { size_t best_i = range.begin; double best_score = -1; double left_sum = 0; double best_left_sum = y[idxs[range.begin].second]; const auto size = range.size(); size_t left_size = 0; for (size_t i = range.begin; i+1 < range.end; ++i) { ++left_size; left_sum += y[idxs[i].second]; // Don't split here because the next element has the same feature value so // we can't *really* split here. if (idxs[i].first==idxs[i+1].first) continue; const double right_sum = range.sumy-left_sum; const double score = left_sum*left_sum/left_size + right_sum*right_sum/(size-left_size); if (score > best_score) { best_score = score; best_i = i; best_left_sum = left_sum; } } best_split_details result; result.score = best_score; result.left_sum = best_left_sum; result.right_sum = range.sumy-best_left_sum; result.split_idx = best_i+1; // one past the end of the left range result.split_threshold = (idxs[best_i].first+idxs[best_i+1].first)/2; return result; } static best_split_details find_best_split_among_feats( const feature_extractor& fe, const range_t& range, const std::vector<typename feature_extractor::feature>& feats, const std::vector<sample_type>& x, const std::vector<double>& y, std::vector<std::pair<float,uint32_t>>& idxs ) { auto compare_first = [](const std::pair<float,uint32_t>& a, const std::pair<float,uint32_t>& b) { return a.first<b.first; }; best_split_details best; for (auto& feat : feats) { // Extract feature values for this feature and sort the indexes based on // that feature so we can then find the best split. for (auto i = range.begin; i < range.end; ++i) idxs[i].first = fe.extract_feature_value(x[idxs[i].second], feat); std::stable_sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first); auto split = find_best_split(range, y, idxs); if (best < split) { best = split; best.split_feature = feat; } } // resort idxs based on winning feat for (auto i = range.begin; i < range.end; ++i) idxs[i].first = fe.extract_feature_value(x[idxs[i].second], best.split_feature); std::stable_sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first); return best; } std::string random_seed; size_t num_trees = 1000; double feature_subsampling_frac = 1.0/3.0; size_t min_samples_per_leaf = 5; feature_extractor_type fe_; bool verbose = false; }; // ---------------------------------------------------------------------------------------- } #endif // DLIB_RANdOM_FOREST_REGRESSION_H_