dlib C++ Library - bottom_up

// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_
#define DLIB_BOTTOM_uP_CLUSTER_Hh_

#include <queue>
#include <map>

#include "bottom_up_cluster_abstract.h"
#include "../algs.h"
#include "../matrix.h"
#include "../disjoint_subsets.h"
#include "../graph_utils.h"


namespace dlib
{

// ----------------------------------------------------------------------------------------

    namespace buc_impl
    {
        inline void merge_sets (
            matrix<double>& dists,
            unsigned long dest,
            unsigned long src
        )
        {
            for (long r = 0; r < dists.nr(); ++r)
                dists(dest,r) = dists(r,dest) = std::max(dists(r,dest), dists(r,src));
        }

        struct compare_dist
        {
            bool operator() (
                const sample_pair& a,
                const sample_pair& b
            ) const
            {
                return a.distance() > b.distance();
            }
        };
    }

// ----------------------------------------------------------------------------------------

    template <
        typename EXP
        >
    unsigned long bottom_up_cluster (
        const matrix_exp<EXP>& dists_,
        std::vector<unsigned long>& labels,
        unsigned long min_num_clusters,
        double max_dist = std::numeric_limits<double>::infinity()
    )
    {
        matrix<double> dists = matrix_cast<double>(dists_);
        // make sure requires clause is not broken
        DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0, 
            "\t unsigned long bottom_up_cluster()"
            << "\n\t Invalid inputs were given to this function."
            << "\n\t dists.nr(): " << dists.nr() 
            << "\n\t dists.nc(): " << dists.nc() 
            << "\n\t min_num_clusters: " << min_num_clusters 
            );

        using namespace buc_impl;

        labels.resize(dists.nr());
        disjoint_subsets sets;
        sets.set_size(dists.nr());
        if (labels.size() == 0)
            return 0;

        // push all the edges in the graph into a priority queue so the best edges to merge
        // come first.
        std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que;
        for (long r = 0; r < dists.nr(); ++r)
            for (long c = r+1; c < dists.nc(); ++c)
                que.push(sample_pair(r,c,dists(r,c)));

        // Now start merging nodes.
        for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter)
        {
            // find the next best thing to merge.
            double best_dist = que.top().distance();
            unsigned long a = sets.find_set(que.top().index1());
            unsigned long b = sets.find_set(que.top().index2());
            que.pop();
            // we have been merging and modifying the distances, so make sure this distance
            // is still valid and these guys haven't been merged already.
            while(a == b || best_dist < dists(a,b))
            {
                // Haven't merged it yet, so put it back in with updated distance for
                // reconsideration later.
                if (a != b)
                    que.push(sample_pair(a, b, dists(a, b)));

                best_dist = que.top().distance();
                a = sets.find_set(que.top().index1());
                b = sets.find_set(que.top().index2());
                que.pop();
            }


            // now merge these sets if the best distance is small enough
            if (best_dist > max_dist)
                break;
            unsigned long news = sets.merge_sets(a,b);
            unsigned long olds = (news==a)?b:a;
            merge_sets(dists, news, olds);
        }

        // figure out which cluster each element is in.  Also make sure the labels are
        // contiguous.
        std::map<unsigned long, unsigned long> relabel;
        for (unsigned long r = 0; r < labels.size(); ++r)
        {
            unsigned long l = sets.find_set(r);
            // relabel to make contiguous
            if (relabel.count(l) == 0)
            {
                unsigned long next = relabel.size();
                relabel[l] = next;
            }
            labels[r] = relabel[l];
        }


        return relabel.size();
    }

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    struct snl_range
    {
        snl_range() = default;
        snl_range(double val) : lower(val), upper(val) {}
        snl_range(double l, double u) : lower(l), upper(u) { DLIB_ASSERT(lower <= upper)}

        double lower = 0;
        double upper = 0;

        double width() const { return upper-lower; }
        bool operator<(const snl_range& item) const { return lower < item.lower; }
    };

    inline snl_range merge(const snl_range& a, const snl_range& b)
    {
        return snl_range(std::min(a.lower, b.lower), std::max(a.upper, b.upper));
    }

    inline double distance (const snl_range& a, const snl_range& b)
    {
        return std::max(a.lower,b.lower) - std::min(a.upper,b.upper);
    }

    inline std::ostream& operator<< (std::ostream& out, const snl_range& item )
    {
        out << "["<<item.lower<<","<<item.upper<<"]";
        return out;
    }

// ----------------------------------------------------------------------------------------

    inline std::vector<snl_range> segment_number_line (
        const std::vector<double>& x,
        const double max_range_width
    )
    {
        DLIB_CASSERT(max_range_width >= 0);

        // create initial ranges, one for each value in x.  So initially, all the ranges have
        // width of 0.
        std::vector<snl_range> ranges;
        for (auto v : x)
            ranges.push_back(v);
        std::sort(ranges.begin(), ranges.end());

        std::vector<snl_range> greedy_final_ranges;
        if (ranges.size() == 0)
            return greedy_final_ranges;
        // We will try two different clustering strategies.  One that does a simple greedy left
        // to right sweep and another that does a bottom up agglomerative clustering.  This
        // first loop runs the greedy left to right sweep.  Then at the end of this routine we
        // will return the results that produced the tightest clustering.
        greedy_final_ranges.push_back(ranges[0]);
        for (size_t i = 1; i < ranges.size(); ++i)
        {
            auto m = merge(greedy_final_ranges.back(), ranges[i]);
            if (m.width() <= max_range_width)
                greedy_final_ranges.back() = m;
            else
                greedy_final_ranges.push_back(ranges[i]);
        }


        // Here we do the bottom up clustering.  So compute the edges connecting our ranges.
        // We will simply say there are edges between ranges if and only if they are
        // immediately adjacent on the number line.
        std::vector<sample_pair> edges;
        for (size_t i = 1; i < ranges.size(); ++i)
            edges.push_back(sample_pair(i-1,i, distance(ranges[i-1],ranges[i])));
        std::sort(edges.begin(), edges.end(), order_by_distance<sample_pair>);

        disjoint_subsets sets;
        sets.set_size(ranges.size());

        // Now start merging nodes.
        for (auto edge : edges)
        {
            // find the next best thing to merge.
            unsigned long a = sets.find_set(edge.index1());
            unsigned long b = sets.find_set(edge.index2());

            // merge it if it doesn't result in an interval that's too big.
            auto m = merge(ranges[a], ranges[b]);
            if (m.width() <= max_range_width)
            {
                unsigned long news = sets.merge_sets(a,b);
                ranges[news] = m;
            }
        }

        // Now create a list of the final ranges.  We will do this by keeping track of which
        // range we already added to final_ranges.
        std::vector<snl_range> final_ranges;
        std::vector<bool> already_output(ranges.size(), false);
        for (unsigned long i = 0; i < sets.size(); ++i)
        {
            auto s = sets.find_set(i);
            if (!already_output[s])
            {
                final_ranges.push_back(ranges[s]);
                already_output[s] = true;
            }
        }

        // only use the greedy clusters if they found a clustering with fewer clusters.
        // Otherwise, the bottom up clustering probably produced a more sensible clustering.
        if (final_ranges.size() <= greedy_final_ranges.size())
            return final_ranges;
        else
            return greedy_final_ranges;
    }

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_