// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_
#ifdef DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_

#include <vector>
#include "../rand/rand_kernel_abstract.h"
#include "../algs.h"
#include "../string.h"

namespace dlib
{
    template <
        typename T,
        typename Rand_type = dlib::rand
        >
    class random_subset_selector
    {
        /*!
            REQUIREMENTS ON T
                T must be a copyable type

            REQUIREMENTS ON Rand_type
                must be an implementation of dlib/rand/rand_kernel_abstract.h

            INITIAL VALUE
                - size() == 0
                - max_size() == 0
                - next_add_accepts() == false

            WHAT THIS OBJECT REPRESENTS
                This object is a tool to help you select a random subset of a large body of data.  
                In particular, it is useful when the body of data is too large to fit into memory.

                So for example, suppose you have 1000000 data samples and you want to select a
                random subset of size 1000.   Then you could do that as follows:
                    
                    random_subset_selector<sample_type> rand_subset;
                    rand_subset.set_max_size(1000)
                    for (int i = 0; i < 1000000; ++i)
                        rand_subset.add( get_next_data_sample());

              
                At the end of the for loop you will have your random subset of 1000 samples.  And by
                random I mean that each of the 1000000 data samples has an equal chance of ending
                up in the rand_subset object.


                Note that the above example calls get_next_data_sample() for each data sample.  This 
                may be inefficient since most of the data samples are just ignored.  An alternative 
                method that doesn't require you to load each sample can also be used.  Consider the 
                following:

                    random_subset_selector<sample_type> rand_subset;
                    rand_subset.set_max_size(1000)
                    for (int i = 0; i < 1000000; ++i)
                        if (rand_subset.next_add_accepts())
                            rand_subset.add(get_data_sample(i));
                        else
                            rand_subset.add() 

                In the above example we only actually fetch the data sample into memory if we
                know that the rand_subset would include it into the random subset.  Otherwise,
                we can just call the empty add().
                

                Finally, note that the random_subset_selector uses a deterministic pseudo-random
                number generator under the hood.  Moreover, the default constructor always seeds
                the random number generator in the same way.  So unless you call set_seed() 
                each instance of the random_subset_selector will function identically.
        !*/
    public:
        typedef T type;
        typedef T value_type;
        typedef default_memory_manager mem_manager_type;
        typedef Rand_type rand_type;

        typedef typename std::vector<T>::iterator iterator;
        typedef typename std::vector<T>::const_iterator const_iterator;

        random_subset_selector (
        );
        /*!
            ensures
                - this object is properly initialized
        !*/

        void set_seed(
            const std::string& value
        );
        /*!
            ensures
                - sets the seed of the random number generator that is embedded in
                  this object to the given value.
        !*/

        void make_empty (
        );
        /*!
            ensures
                - #size() == 0
        !*/

        size_t size (
        ) const;
        /*!
            ensures
                - returns the number of items of type T currently contained in this object
        !*/

        void set_max_size (
            unsigned long new_max_size
        );
        /*!
            ensures
                - #max_size() == new_max_size
                - #size() == 0
        !*/

        unsigned long max_size (
        ) const;
        /*!
            ensures
                - returns the maximum allowable size for this object
        !*/

        T& operator[] (
            unsigned long idx
        );
        /*!
            requires
                - idx < size()
            ensures
                - returns a non-const reference to the idx'th element of this object
        !*/

        const T& operator[] (
            unsigned long idx
        ) const;
        /*!
            requires
                - idx < size()
            ensures
                - returns a const reference to the idx'th element of this object
        !*/

        bool next_add_accepts (
        ) const;
        /*!
            ensures
                - if (the next call to add(item) will result in item being included
                  into *this) then
                    - returns true
                    - Note that the next item will always be accepted if size() < max_size().
                - else
                    - returns false
                    - Note that the next item will never be accepted if max_size() == 0.
        !*/

        void add (
            const T& new_item
        );
        /*!
            ensures
                - if (next_add_accepts()) then
                    - places new_item into *this object at a random location
                    - if (size() < max_size()) then
                        - #size() == size() + 1
                - #next_add_accepts() == The updated information about the acceptance
                  of the next call to add()
        !*/

        void add (
        );
        /*!
            requires
                - next_add_accepts() == false
            ensures
                - This function does nothing but update the value of #next_add_accepts()
        !*/

        iterator begin(
        );
        /*!
            ensures
                - if (size() > 0) then
                    - returns an iterator referring to the first element in 
                      this container.
                - else
                    - returns end()
        !*/
        
        const_iterator begin(
        ) const;
        /*!
            ensures
                - if (size() > 0) then
                    - returns a const_iterator referring to the first element in 
                      this container.
                - else
                    - returns end()
        !*/

        iterator end(
        ); 
        /*!
            ensures
                - returns an iterator that represents one past the end of
                  this container
        !*/

        const_iterator end(
        ) const;
        /*!
            ensures
                - returns an iterator that represents one past the end of
                  this container
        !*/

        const std::vector<T>& to_std_vector(
        ) const;
        /*!
            ensures
                - returns a const reference to the underlying std::vector<T> that contains
                  all elements in this object.  That is, this function returns a vector, V,
                  which has the following properties:  
                    - V.size()  == this->size()
                    - V.begin() == this->begin()
                    - V.end()   == this->end()
        !*/

        void swap (
            random_subset_selector& item
        );
        /*!
            ensures
                - swaps *this and item
        !*/

    };

    template <
        typename T,
        typename rand_type 
        >
    void swap (
        random_subset_selector<T,rand_type>& a,
        random_subset_selector<T,rand_type>& b
    ) { a.swap(b); }
    /*!
        provides global swap support
    !*/

    template <
        typename T,
        typename rand_type 
        >
    void serialize (
        const random_subset_selector<T,rand_type>& item,
        std::ostream& out 
    );   
    /*!
        provides serialization support 
    !*/

    template <
        typename T,
        typename rand_type 
        >
    void deserialize (
        random_subset_selector<T,rand_type>& item,
        std::istream& in
    );   
    /*!
        provides deserialization support 
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename T,
        typename alloc
        >
    random_subset_selector<T> randomly_subsample (
        const std::vector<T,alloc>& samples,
        unsigned long num
    );
    /*!
        ensures
            - returns a random subset R such that:
                - R contains a random subset of the given samples
                - R.size() == min(num, samples.size())
                - R.max_size() == num
            - The random number generator used by this function will always be
              initialized in the same way.  I.e. this function will always pick
              the same random subsample if called multiple times.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename T,
        typename alloc,
        typename U
        >
    random_subset_selector<T> randomly_subsample (
        const std::vector<T,alloc>& samples,
        unsigned long num,
        const U& random_seed
    );
    /*!
        requires
            - random_seed must be convertible to a string by dlib::cast_to_string()
        ensures
            - returns a random subset R such that:
                - R contains a random subset of the given samples
                - R.size() == min(num, samples.size())
                - R.max_size() == num
            - The given random_seed will be used to initialize the random number
              generator used by this function.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename T
        >
    random_subset_selector<T> randomly_subsample (
        const random_subset_selector<T>& samples,
        unsigned long num
    );
    /*!
        ensures
            - returns a random subset R such that:
                - R contains a random subset of the given samples
                - R.size() == min(num, samples.size())
                - R.max_size() == num
            - The random number generator used by this function will always be
              initialized in the same way.  I.e. this function will always pick
              the same random subsample if called multiple times.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename T,
        typename U
        >
    random_subset_selector<T> randomly_subsample (
        const random_subset_selector<T>& samples,
        unsigned long num,
        const U& random_seed
    );
    /*!
        requires
            - random_seed must be convertible to a string by dlib::cast_to_string()
        ensures
            - returns a random subset R such that:
                - R contains a random subset of the given samples
                - R.size() == min(num, samples.size())
                - R.max_size() == num
            - The given random_seed will be used to initialize the random number
              generator used by this function.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename T
        >
    const matrix_exp mat (
        const random_subset_selector<T>& m 
    );
    /*!
        ensures
            - returns a matrix R such that:
                - is_col_vector(R) == true 
                - R.size() == m.size()
                - for all valid r:
                  R(r) == m[r]
    !*/

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_