// dlib C++ Library - find_k_nearest_neighbors_lsh

// Copyright (C) 2013  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_FIND_K_NEAREST_NEIGHBOrS_LSH_ABSTRACT_Hh_
#ifdef DLIB_FIND_K_NEAREST_NEIGHBOrS_LSH_ABSTRACT_Hh_

#include "../lsh/hashes_abstract.h"
#include "sample_pair_abstract.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    template <
        typename vector_type,
        typename hash_function_type
        >
    void hash_samples (
        const vector_type& samples,
        const hash_function_type& hash_funct,
        const unsigned long num_threads,
        std::vector<typename hash_function_type::result_type>& hashes
    );
    /*!
        requires
            - hash_funct() is threadsafe.  This means that it must be safe for multiple
              threads to invoke the member functions of hash_funct() at the same time.
            - vector_type is any container that looks like a std::vector or dlib::array.
            - hash_funct must be a function object with an interface compatible with the
              objects defined in dlib/lsh/hashes_abstract.h.  In particular, hash_funct
              must be capable of hashing the elements in the samples vector.
        ensures
            - This function hashes all the elements in samples and stores the results in
              hashes.  It will also use num_threads concurrent threads to do this.  You
              should set this value equal to the number of processing cores on your
              computer for maximum speed.
            - #hashes.size() == samples.size()
              (i.e. there is exactly one output hash per input sample)
            - for all valid i:
                - #hashes[i] == hash_funct(samples[i])
                  (i.e. #hashes[i] will contain the hash of samples[i])
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename vector_type,
        typename distance_function_type,
        typename hash_function_type,
        typename alloc
        >
    void find_k_nearest_neighbors_lsh (
        const vector_type& samples,
        const distance_function_type& dist_funct,
        const hash_function_type& hash_funct,
        const unsigned long k,
        const unsigned long num_threads,
        std::vector<sample_pair, alloc>& edges,
        const unsigned long k_oversample = 20 
    );
    /*!
        Builds an approximate k nearest neighbors graph over samples, using a locality
        sensitive hash to select candidate neighbors and dist_funct to score them.

        requires
            - hash_funct and dist_funct are threadsafe.  This means that it must be safe
              for multiple threads to invoke the member functions of these objects at the
              same time.
            - k > 0
            - k_oversample > 0
            - dist_funct(samples[i], samples[j]) must be a valid expression that evaluates
              to a floating point number
            - vector_type is any container that looks like a std::vector or dlib::array.
            - hash_funct must be a function object with an interface compatible with the
              objects defined in dlib/lsh/hashes_abstract.h.  In particular, hash_funct
              must be capable of hashing the elements in the samples vector.
        ensures
            - This function computes an approximate form of a k nearest neighbors graph of
              the elements in samples.  In particular, the way it works is that it first
              hashes all elements in samples using the provided locality sensitive hash
              function hash_funct().  Then it performs an exact k nearest neighbors on the
              hashes which can be done very quickly.  For each of these neighbors we
              compute the true distance using dist_funct() and the k nearest neighbors for
              each sample are stored into #edges.
            - Note that samples with an infinite distance between them are considered to be
              not connected at all.  Therefore, we exclude edges with such distances from
              being output.
            - for all valid i:
                - #edges[i].distance() == dist_funct(samples[#edges[i].index1()], samples[#edges[i].index2()])
                - #edges[i].distance() < std::numeric_limits<double>::infinity()
            - contains_duplicate_pairs(#edges) == false
            - This function will use num_threads concurrent threads of processing.  You
              should set this value equal to the number of processing cores on your
              computer for maximum speed.
            - The hash based k nearest neighbor step is approximate; however, you can
              improve the output accuracy by using a larger k value for this first step.
              Therefore, this function finds k*k_oversample nearest neighbors during the
              first hashing based step.  (k_oversample defaults to 20 when not supplied.)
    !*/

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_FIND_K_NEAREST_NEIGHBOrS_LSH_ABSTRACT_Hh_