```// Copyright (C) 2012  Davis E. King (davis@dlib.net)
#ifndef DLIB_MODULARITY_ClUSTERING__H__
#define DLIB_MODULARITY_ClUSTERING__H__

#include "modularity_clustering_abstract.h"
#include "../sparse_vector.h"
#include "../graph_utils/edge_list_graphs.h"
#include "../matrix.h"
#include "../rand.h"

namespace dlib
{

// -----------------------------------------------------------------------------------------

namespace impl
{
inline double newman_cluster_split (
dlib::rand& rnd,
const std::vector<ordered_sample_pair>& edges,
const matrix<double,0,1>& node_degrees, // k from the Newman paper
const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
const double& edge_sum,                 // m from the Newman paper
matrix<double,0,1>& labels,
const double eps,
const unsigned long max_iterations
)
/*!
requires
- node_degrees.size() == max_index_plus_one(edges)
- Bdiag.size() == max_index_plus_one(edges)
- edges must be sorted according to order_by_index()
ensures
- This routine splits a graph into two subgraphs using the Newman
clustering method.
- returns the modularity obtained when the graph is split according
to the contents of #labels.
- #labels.size() == node_degrees.size()
- for all valid i: #labels(i) == -1 or +1
- if (this function returns 0) then
- all the labels are equal, i.e. the graph is not split.
!*/
{
// Scale epsilon so that it is relative to the expected value of an element of a
// unit vector of length node_degrees.size().
const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());

// Make a random unit vector and put in labels.
labels.set_size(node_degrees.size());
for (long i = 0; i < labels.size(); ++i)
labels(i) = rnd.get_random_gaussian();
labels /= length(labels);

matrix<double,0,1> Bv, Bv_unit;

// Do the power iteration for a while.
double eig = -1;
double offset = 0;
while (eig < 0)
{

// any number larger than power_iter_eps
double iteration_change = power_iter_eps*2+1;
for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i)
{
sparse_matrix_vector_multiply(edges, labels, Bv);
Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

if (offset != 0)
{
Bv -= offset*labels;
}

const double len = length(Bv);
if (len != 0)
{
Bv_unit = Bv/len;
iteration_change = max(abs(labels-Bv_unit));
labels.swap(Bv_unit);
}
else
{
// Had a bad time, pick another random vector and try it with the
// power iteration.
for (long i = 0; i < labels.size(); ++i)
labels(i) = rnd.get_random_gaussian();
}
}

eig = dot(Bv,labels);
// we will repeat this loop if the largest eigenvalue is negative
offset = eig;
}

for (long i = 0; i < labels.size(); ++i)
{
if (labels(i) > 0)
labels(i) = 1;
else
labels(i) = -1;
}

// compute B*labels, store result in Bv.
sparse_matrix_vector_multiply(edges, labels, Bv);
Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

// Do some label refinement.  In this step we swap labels if it
// improves the modularity score.
bool flipped_label = true;
while(flipped_label)
{
flipped_label = false;
unsigned long idx = 0;
for (long i = 0; i < labels.size(); ++i)
{
const double val = -2*labels(i);
const double increase = 4*Bdiag(i) + 2*val*Bv(i);

// if there is an increase in modularity for swapping this label
if (increase > 0)
{
labels(i) *= -1;
while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
{
const long j = edges[idx].index2();
Bv(j) += val*edges[idx].distance();
++idx;
}

Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees;

flipped_label = true;
}
else
{
while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
{
++idx;
}
}
}
}

const double modularity = dot(Bv, labels)/(4*edge_sum);

return modularity;
}

// -------------------------------------------------------------------------------------

inline unsigned long newman_cluster_helper (
dlib::rand& rnd,
const std::vector<ordered_sample_pair>& edges,
const matrix<double,0,1>& node_degrees, // k from the Newman paper
const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
const double& edge_sum,                 // m from the Newman paper
std::vector<unsigned long>& labels,
double modularity_threshold,
const double eps,
const unsigned long max_iterations
)
/*!
ensures
- returns the number of clusters the data was split into
!*/
{
matrix<double,0,1> l;
const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations);

// We need to collapse the node index values down to contiguous values.  So
// we use the following two vectors to contain the mappings from input index
// values to their corresponding index values in each split.
std::vector<unsigned long> left_idx_map(node_degrees.size());
std::vector<unsigned long> right_idx_map(node_degrees.size());

// figure out how many nodes went into each side of the split.
unsigned long num_left_split = 0;
unsigned long num_right_split = 0;
for (long i = 0; i < l.size(); ++i)
{
if (l(i) > 0)
{
left_idx_map[i] = num_left_split;
++num_left_split;
}
else
{
right_idx_map[i] = num_right_split;
++num_right_split;
}
}

// do a recursive split if it will improve the modularity.
if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0)
{

// split the node_degrees and Bdiag matrices into left and right split parts
matrix<double,0,1> left_node_degrees(num_left_split);
matrix<double,0,1> right_node_degrees(num_right_split);
matrix<double,0,1> left_Bdiag(num_left_split);
matrix<double,0,1> right_Bdiag(num_right_split);
for (long i = 0; i < l.size(); ++i)
{
if (l(i) > 0)
{
left_node_degrees(left_idx_map[i]) = node_degrees(i);
left_Bdiag(left_idx_map[i]) = Bdiag(i);
}
else
{
right_node_degrees(right_idx_map[i]) = node_degrees(i);
right_Bdiag(right_idx_map[i]) = Bdiag(i);
}
}

// put the edges from one side of the split into split_edges
std::vector<ordered_sample_pair> split_edges;
modularity_threshold = 0;
for (unsigned long k = 0; k < edges.size(); ++k)
{
const unsigned long i = edges[k].index1();
const unsigned long j = edges[k].index2();
const double d = edges[k].distance();
if (l(i) > 0 && l(j) > 0)
{
split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d));
modularity_threshold += d;
}
}
modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum);
modularity_threshold /= 4*edge_sum;

unsigned long num_left_clusters;
std::vector<unsigned long> left_labels;
num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag,
edge_sum,left_labels,modularity_threshold,
eps, max_iterations);

// now load the other side into split_edges and cluster it as well
split_edges.clear();
modularity_threshold = 0;
for (unsigned long k = 0; k < edges.size(); ++k)
{
const unsigned long i = edges[k].index1();
const unsigned long j = edges[k].index2();
const double d = edges[k].distance();
if (l(i) < 0 && l(j) < 0)
{
split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d));
modularity_threshold += d;
}
}
modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum);
modularity_threshold /= 4*edge_sum;

unsigned long num_right_clusters;
std::vector<unsigned long> right_labels;
num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag,
edge_sum,right_labels,modularity_threshold,
eps, max_iterations);

// Now merge the labels from the two splits.
labels.resize(node_degrees.size());
for (unsigned long i = 0; i < labels.size(); ++i)
{
// if this node was in the left split
if (l(i) > 0)
{
labels[i] = left_labels[left_idx_map[i]];
}
else // if this node was in the right split
{
labels[i] = right_labels[right_idx_map[i]] + num_left_clusters;
}
}

return num_left_clusters + num_right_clusters;
}
else
{
labels.assign(node_degrees.size(),0);
return 1;
}

}
}

// ----------------------------------------------------------------------------------------

inline unsigned long newman_cluster (
const std::vector<ordered_sample_pair>& edges,
std::vector<unsigned long>& labels,
const double eps = 1e-4,
const unsigned long max_iterations = 2000
)
{
// make sure requires clause is not broken
DLIB_ASSERT(is_ordered_by_index(edges),
"\t unsigned long newman_cluster()"
<< "\n\t Invalid inputs were given to this function"
);

labels.clear();
if (edges.size() == 0)
return 0;

const unsigned long num_nodes = max_index_plus_one(edges);

// compute the node_degrees vector, edge_sum value, and diag(B).
matrix<double,0,1> node_degrees(num_nodes);
matrix<double,0,1> Bdiag(num_nodes);
Bdiag = 0;
double edge_sum = 0;
node_degrees = 0;
for (unsigned long i = 0; i < edges.size(); ++i)
{
node_degrees(edges[i].index1()) += edges[i].distance();
edge_sum += edges[i].distance();
if (edges[i].index1() == edges[i].index2())
Bdiag(edges[i].index1()) += edges[i].distance();
}
edge_sum /= 2;
Bdiag -= squared(node_degrees)/(2*edge_sum);

dlib::rand rnd;
return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations);
}

// ----------------------------------------------------------------------------------------

inline unsigned long newman_cluster (
const std::vector<sample_pair>& edges,
std::vector<unsigned long>& labels,
const double eps = 1e-4,
const unsigned long max_iterations = 2000
)
{
std::vector<ordered_sample_pair> oedges;
convert_unordered_to_ordered(edges, oedges);
std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);

return newman_cluster(oedges, labels, eps, max_iterations);
}

// ----------------------------------------------------------------------------------------

namespace impl
{
inline std::vector<unsigned long> remap_labels (
const std::vector<unsigned long>& labels,
unsigned long& num_labels
)
/*!
ensures
- This function takes labels and produces a mapping which maps elements of
labels into the most compact range in [0, max] as possible.  In particular,
there won't be any unused integers in the mapped range.
- #num_labels == the number of distinct values in labels.
- returns a vector V such that:
- V.size() == labels.size()
- max(mat(V))+1 == num_labels.
- for all valid i,j:
- if (labels[i] == labels[j]) then
- V[i] == V[j]
- else
- V[i] != V[j]
!*/
{
std::map<unsigned long, unsigned long> temp;
for (unsigned long i = 0; i < labels.size(); ++i)
{
if (temp.count(labels[i]) == 0)
{
const unsigned long next = temp.size();
temp[labels[i]] = next;
}
}

num_labels = temp.size();

std::vector<unsigned long> result(labels.size());
for (unsigned long i = 0; i < labels.size(); ++i)
{
result[i] = temp[labels[i]];
}
return result;
}
}

// ----------------------------------------------------------------------------------------

inline double modularity (
const std::vector<sample_pair>& edges,
const std::vector<unsigned long>& labels
)
{
const unsigned long num_nodes = max_index_plus_one(edges);
// make sure requires clause is not broken
DLIB_ASSERT(labels.size() == num_nodes,
"\t double modularity()"
<< "\n\t Invalid inputs were given to this function"
);

unsigned long num_labels;
const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);

std::vector<double> cluster_sums(num_labels,0);
std::vector<double> k(num_nodes,0);

double Q = 0;
double m = 0;
for (unsigned long i = 0; i < edges.size(); ++i)
{
const unsigned long n1 = edges[i].index1();
const unsigned long n2 = edges[i].index2();
k[n1] += edges[i].distance();
if (n1 != n2)
k[n2] += edges[i].distance();

if (n1 != n2)
m += edges[i].distance();
else
m += edges[i].distance()/2;

if (labels_[n1] == labels_[n2])
{
if (n1 != n2)
Q += 2*edges[i].distance();
else
Q += edges[i].distance();
}
}

if (m == 0)
return 0;

for (unsigned long i = 0; i < labels_.size(); ++i)
{
cluster_sums[labels_[i]] += k[i];
}

for (unsigned long i = 0; i < labels_.size(); ++i)
{
Q -= k[i]*cluster_sums[labels_[i]]/(2*m);
}

return 1.0/(2*m)*Q;
}

// ----------------------------------------------------------------------------------------

inline double modularity (
const std::vector<ordered_sample_pair>& edges,
const std::vector<unsigned long>& labels
)
{
const unsigned long num_nodes = max_index_plus_one(edges);
// make sure requires clause is not broken
DLIB_ASSERT(labels.size() == num_nodes,
"\t double modularity()"
<< "\n\t Invalid inputs were given to this function"
);

unsigned long num_labels;
const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);

std::vector<double> cluster_sums(num_labels,0);
std::vector<double> k(num_nodes,0);

double Q = 0;
double m = 0;
for (unsigned long i = 0; i < edges.size(); ++i)
{
const unsigned long n1 = edges[i].index1();
const unsigned long n2 = edges[i].index2();
k[n1] += edges[i].distance();
m += edges[i].distance();
if (labels_[n1] == labels_[n2])
{
Q += edges[i].distance();
}
}

if (m == 0)
return 0;

for (unsigned long i = 0; i < labels_.size(); ++i)
{
cluster_sums[labels_[i]] += k[i];
}

for (unsigned long i = 0; i < labels_.size(); ++i)
{
Q -= k[i]*cluster_sums[labels_[i]]/m;
}

return 1.0/m*Q;
}

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_MODULARITY_ClUSTERING__H__

```