// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.


#include <sstream>
#include <string>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <random>
#include <numeric>
#include "../dnn.h"

#include "tester.h"

#ifndef __INTELLISENSE__

namespace
{

    using namespace test;
    using namespace dlib;
    using namespace std;

    logger dlog("test.dnn");

// ----------------------------------------------------------------------------------------

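    // Returns the maximum absolute difference between the analytically computed gradient
    // stored in the tensor t and the numerically estimated gradient given by grad(i).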
    template <typename T>
    float compare_gradients (
        const tensor& t,
        T grad
    )
    {
        float max_error = 0;
        auto p = t.host();
        for (size_t i = 0; i < t.size(); ++i)
        {
            max_error = std::max(max_error, std::abs(p[i]-grad(i)));
        }
        return max_error;
    }

// ----------------------------------------------------------------------------------------

    void test_tanh()
    {
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor src, dest, gradient_input;
        src = matrix_cast<float>(gaussian_randm(5,5, 0));
        dest = matrix_cast<float>(gaussian_randm(5,5, 1));
        gradient_input = matrix_cast<float>(gaussian_randm(5,5, 2));



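        // grad_src(idx) numerically estimates d(dot(gradient_input, tanh(src)))/d(src[idx])
        // with a central difference so it can be checked against tanh_gradient() below.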
        auto grad_src = [&](long idx) {
            auto f = [&](float eps) {
                const float old = src.host()[idx];
                src.host()[idx] += eps;
                tanh(dest, src);
                float result = dot(gradient_input, dest);
                src.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };

        resizable_tensor src_grad;
        src_grad.copy_size(src);
        src_grad = 0;

        tanh(dest, src);
        tanh_gradient(src_grad, dest, gradient_input);

        auto grad_error = compare_gradients(src_grad, grad_src);
        dlog << LINFO << "src error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);
    }

    void test_sigmoid()
    {
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor src, dest, gradient_input;
        src = matrix_cast<float>(gaussian_randm(5,5, 0));
        dest = matrix_cast<float>(gaussian_randm(5,5, 1));
        gradient_input = matrix_cast<float>(gaussian_randm(5,5, 2));



        auto grad_src = [&](long idx) {
            auto f = [&](float eps) {
                const float old = src.host()[idx];
                src.host()[idx] += eps;
                sigmoid(dest, src);
                float result = dot(gradient_input, dest);
                src.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };

        resizable_tensor src_grad;
        src_grad.copy_size(src);
        src_grad = 0;

        sigmoid(dest, src);
        sigmoid_gradient(src_grad, dest, gradient_input);

        auto grad_error = compare_gradients(src_grad, grad_src);
        dlog << LINFO << "src error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);
    }

    void test_softmax()
    {
        using namespace dlib::tt;
        print_spinner();
        const long nr = 3;
        const long nc = 3;
        resizable_tensor src(5,5,nr,nc), dest(5,5,nr,nc), gradient_input(5,5,nr,nc);
        tt::tensor_rand rnd;
        rnd.fill_uniform(src);
        rnd.fill_uniform(dest);
        // fill like this as a test of the assignment operator.
        gradient_input = matrix_cast<float>(gaussian_randm(5,5*nr*nc, 2));



        auto grad_src = [&](long idx) {
            auto f = [&](float eps) {
                const float old = src.host()[idx];
                src.host()[idx] += eps;
                tt::softmax(dest, src);
                float result = dot(gradient_input, dest);
                src.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };

        resizable_tensor src_grad;
        src_grad.copy_size(src);
        src_grad = 0;

        tt::softmax(dest, src);
        softmax_gradient(src_grad, dest, gradient_input);

        auto grad_error = compare_gradients(src_grad, grad_src);
        dlog << LINFO << "src error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);

#ifdef DLIB_USE_CUDA
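        // also make sure the CUDA and CPU softmax implementations produce the same results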
        resizable_tensor src1 = src;
        resizable_tensor src2 = src;
        resizable_tensor dest1, dest2;
        dest1.copy_size(src);
        dest2.copy_size(src);
        cuda::softmax(dest1, src1);
        cpu::softmax(dest2, src2);
        DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2))));
#endif
    }

    void test_softmax_all()
    {
        using namespace dlib::tt;
        print_spinner();
        const long nr = 3;
        const long nc = 3;
        resizable_tensor src(5,5,nr,nc), dest(5,5,nr,nc), gradient_input(5,5,nr,nc);
        tt::tensor_rand rnd;
        rnd.fill_uniform(src);
        rnd.fill_uniform(dest);
        // fill like this as a test of the assignment operator.
        gradient_input = matrix_cast<float>(gaussian_randm(5,5*nr*nc, 2));



        auto grad_src = [&](long idx) {
            auto f = [&](float eps) {
                const float old = src.host()[idx];
                src.host()[idx] += eps;
                tt::softmax_all(dest, src);
                float result = dot(gradient_input, dest);
                src.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };

        resizable_tensor src_grad;
        src_grad.copy_size(src);
        src_grad = 0;

        tt::softmax_all(dest, src);
        softmax_all_gradient(src_grad, dest, gradient_input);

        auto grad_error = compare_gradients(src_grad, grad_src);
        dlog << LINFO << "src error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);

#ifdef DLIB_USE_CUDA
        resizable_tensor src1 = src;
        resizable_tensor src2 = src;
        resizable_tensor dest1, dest2;
        dest1.copy_size(src);
        dest2.copy_size(src);
        cuda::softmax_all(dest1, src1);
        cpu::softmax_all(dest2, src2);
        DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2))));
#endif
    }

    void test_mish()
    {
#ifdef DLIB_USE_CUDA
        // make sure that cuda::mish and cpu::mish return the same results
        using namespace dlib::tt;
        print_spinner();
        const long n = 4;
        const long k = 5;
        const long nr = 3;
        const long nc = 3;
        resizable_tensor src(n,k,nr,nc);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(src);

        resizable_tensor dest1, dest2;
        dest1.copy_size(src);
        dest2.copy_size(src);
        // initialize to different values in order to make sure the output is actually changed
        dest1 = 1;
        dest2 = 2;
        cuda::mish(dest1, src);
        cpu::mish(dest2, src);
        DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - mat(dest2))));
#endif // DLIB_USE_CUDA
    }

    void test_leaky_relu()
    {
#ifdef DLIB_USE_CUDA
        using namespace dlib::tt;
        print_spinner();
        const long n = 4;
        const long k = 5;
        const long nr = 3;
        const long nc = 3;
        const float alpha = 0.01;
        resizable_tensor src(n, k, nr, nc);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(src);
        resizable_tensor dest_cuda, dest_cpu;
        dest_cuda.copy_size(src);
        dest_cpu.copy_size(src);
        // initialize to different values in order to make sure the output is actually changed
        dest_cuda = 1;
        dest_cpu = 2;
        cuda::leaky_relu(dest_cuda, src, alpha);
        cpu::leaky_relu(dest_cpu, src, alpha);

        DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu))));
#endif // DLIB_USE_CUDA
    }

    void test_clipped_relu()
    {
#ifdef DLIB_USE_CUDA
        using namespace dlib::tt;
        print_spinner();
        const long n = 4;
        const long k = 5;
        const long nr = 3;
        const long nc = 3;
        const float ceiling = 6.0f;
        resizable_tensor src(n, k, nr, nc);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(src, 0, 3);
        resizable_tensor dest_cuda, dest_cpu;
        dest_cuda.copy_size(src);
        dest_cpu.copy_size(src);
        // initialize to different values in order to make sure the output is actually changed
        dest_cuda = 1;
        dest_cpu = 2;
        cuda::clipped_relu(dest_cuda, src, ceiling);
        cpu::clipped_relu(dest_cpu, src, ceiling);
        auto error = max(abs(mat(dest_cuda) - mat(dest_cpu)));
        DLIB_TEST_MSG(error < 1e-7, "error: " << error);

        // test gradients
        resizable_tensor grad_cuda, grad_cpu, grad_input;
        grad_cuda.copy_size(src);
        grad_cpu.copy_size(src);
        grad_input.copy_size(src);
        rnd.fill_uniform(grad_input);
        grad_cuda = 0;
        grad_cpu = 0;
        cuda::clipped_relu_gradient(grad_cuda, dest_cuda, grad_input, ceiling);
        cpu::clipped_relu_gradient(grad_cpu, dest_cpu, grad_input, ceiling);
        error = max(abs(mat(grad_cuda) - mat(grad_cpu)));
        DLIB_TEST_MSG(error < 1e-7, "error: " << error);
#endif // DLIB_USE_CUDA
    }

    void test_elu()
    {
#ifdef DLIB_USE_CUDA
        using namespace dlib::tt;
        print_spinner();
        const long n = 4;
        const long k = 5;
        const long nr = 3;
        const long nc = 3;
        const float alpha = 1.0f;
        resizable_tensor src(n, k, nr, nc);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(src);
        resizable_tensor dest_cuda, dest_cpu;
        dest_cuda.copy_size(src);
        dest_cpu.copy_size(src);
        // initialize to different values in order to make sure the output is actually changed
        dest_cuda = 1;
        dest_cpu = 2;
        cuda::elu(dest_cuda, src, alpha);
        cpu::elu(dest_cpu, src, alpha);
        auto error = max(abs(mat(dest_cuda) - mat(dest_cpu)));
        DLIB_TEST_MSG(error < 1e-7, "error: " << error);
        // test gradients
        resizable_tensor grad_cuda, grad_cpu, grad_input;
        grad_cuda.copy_size(src);
        grad_cpu.copy_size(src);
        grad_input.copy_size(src);
        rnd.fill_gaussian(grad_input);
        grad_cuda = 0;
        grad_cpu = 0;
        cuda::elu_gradient(grad_cuda, dest_cuda, grad_input, alpha);
        cpu::elu_gradient(grad_cpu, dest_cpu, grad_input, alpha);
        error = max(abs(mat(grad_cuda) - mat(grad_cpu)));
        DLIB_TEST_MSG(error < 1e-6, "error: " << error);
#endif // DLIB_USE_CUDA
    }

    void test_gelu()
    {
#ifdef DLIB_USE_CUDA
        // make sure that cuda::gelu and cpu::gelu return the same results
        using namespace dlib::tt;
        print_spinner();
        const long n = 4;
        const long k = 5;
        const long nr = 3;
        const long nc = 3;
        resizable_tensor src(n,k,nr,nc);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(src);

        resizable_tensor dest1, dest2;
        dest1.copy_size(src);
        dest2.copy_size(src);
        // initialize to different values in order to make sure the output is actually changed
        dest1 = 1;
        dest2 = 2;
        cuda::gelu(dest1, src);
        cpu::gelu(dest2, src);
        DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - mat(dest2))));
#endif // DLIB_USE_CUDA
    }

    void test_smelu()
    {
#ifdef DLIB_USE_CUDA
        using namespace dlib::tt;
        print_spinner();
        const long n = 4;
        const long k = 5;
        const long nr = 3;
        const long nc = 3;
        const float beta = 1;
        resizable_tensor src(n, k, nr, nc);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(src);
        resizable_tensor dest_cuda, dest_cpu;
        dest_cuda.copy_size(src);
        dest_cpu.copy_size(src);
        // initialize to different values in order to make sure the output is actually changed
        dest_cuda = 1;
        dest_cpu = 2;
        cuda::smelu(dest_cuda, src, beta);
        cpu::smelu(dest_cpu, src, beta);

        DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu))));
#endif // DLIB_USE_CUDA
    }

    void test_silu()
    {
#ifdef DLIB_USE_CUDA
        using namespace dlib::tt;
        print_spinner();
        const long n = 4;
        const long k = 5;
        const long nr = 3;
        const long nc = 3;
        resizable_tensor src(n, k, nr, nc);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(src);
        resizable_tensor dest_cuda, dest_cpu;
        dest_cuda.copy_size(src);
        dest_cpu.copy_size(src);
        // initialize to different values in order to make sure the output is actually changed
        dest_cuda = 1;
        dest_cpu = 2;
        cuda::silu(dest_cuda, src);
        cpu::silu(dest_cpu, src);

        DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-6, max(abs(mat(dest_cuda) - mat(dest_cpu))));
#endif // DLIB_USE_CUDA
    }

    void test_batch_normalize()
    {
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor src, gamma, beta, dest, dest2, dest3, means, vars, gradient_input;
        src = matrix_cast<float>(gaussian_randm(5,5, 0));
        gamma = matrix_cast<float>(gaussian_randm(1,5, 1));
        beta = matrix_cast<float>(gaussian_randm(1,5, 2));
        gradient_input = matrix_cast<float>(gaussian_randm(5,5, 3));

        gamma = 1;
        beta = 0;

        resizable_tensor running_means;
        resizable_tensor running_variances;
        batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
        const double scale = (src.num_samples())/(src.num_samples()-1.0);
        // Convert back to the biased variance estimate, since that's what batch_normalize()
        // uses internally; this is necessary if we want to match its output.
        running_variances = mat(running_variances)/scale;
        batch_normalize_inference(DEFAULT_BATCH_NORM_EPS,dest2, src, gamma, beta, running_means, running_variances);
        DLIB_TEST_MSG(max(abs(mat(dest2)-mat(dest))) < 1e-5, max(abs(mat(dest2)-mat(dest))));
        cpu::batch_normalize_inference(DEFAULT_BATCH_NORM_EPS,dest3, src, gamma, beta, running_means, running_variances);
        DLIB_TEST_MSG(max(abs(mat(dest3)-mat(dest))) < 1e-5, max(abs(mat(dest3)-mat(dest))));


        auto grad_src = [&](long idx) {
            auto f = [&](float eps) {
                const float old = src.host()[idx];
                src.host()[idx] += eps;
                batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                src.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };
        auto grad_gamma = [&](long idx) {
            auto f = [&](float eps) {
                const float old = gamma.host()[idx];
                gamma.host()[idx] += eps;
                batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                gamma.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };
        auto grad_beta = [&](long idx) {
            auto f = [&](float eps) {
                const float old = beta.host()[idx];
                beta.host()[idx] += eps;
                batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                beta.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };

        resizable_tensor src_grad, gamma_grad, beta_grad;
        src_grad.copy_size(src);
        gamma_grad.copy_size(gamma);
        beta_grad.copy_size(beta);
        src_grad = 0;
        gamma_grad = 8;
        beta_grad = 8;

        batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);

        auto grad_error = compare_gradients(src_grad, grad_src);
        dlog << LINFO << "src error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);

        grad_error = compare_gradients(gamma_grad, grad_gamma);
        dlog << LINFO << "gamma error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);

        grad_error = compare_gradients(beta_grad, grad_beta);
        dlog << LINFO << "beta error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);
    }

    void test_batch_normalize_conv()
    {
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor src(5,5,4,4), gamma, beta, dest, dest2, dest3, means, vars, gradient_input(5,5,4,4);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(src);
        rnd.fill_gaussian(gradient_input);
        gamma = matrix_cast<float>(gaussian_randm(1,5, 1));
        beta = matrix_cast<float>(gaussian_randm(1,5, 2));

        gamma = 1;
        beta = 0;

        resizable_tensor running_means;
        resizable_tensor running_variances;
        batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
        const double scale = (src.num_samples()*src.nr()*src.nc())/(src.num_samples()*src.nr()*src.nc()-1.0);
        // Turn back into biased variance estimate because that's how
        // batch_normalize_conv() works, so if we want to match it this is necessary.
        running_variances = mat(running_variances)/scale; 
        batch_normalize_conv_inference(DEFAULT_BATCH_NORM_EPS,dest2, src, gamma, beta, running_means, running_variances);
        DLIB_TEST(max(abs(mat(dest2)-mat(dest))) < 1e-5);
        cpu::batch_normalize_conv_inference(DEFAULT_BATCH_NORM_EPS,dest3, src, gamma, beta, running_means, running_variances);
        DLIB_TEST(max(abs(mat(dest3)-mat(dest))) < 1e-5);


        auto grad_src = [&](long idx) {
            auto f = [&](float eps) {
                const float old = src.host()[idx];
                src.host()[idx] += eps;
                batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                src.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };
        auto grad_gamma = [&](long idx) {
            auto f = [&](float eps) {
                const float old = gamma.host()[idx];
                gamma.host()[idx] += eps;
                batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                gamma.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };
        auto grad_beta = [&](long idx) {
            auto f = [&](float eps) {
                const float old = beta.host()[idx];
                beta.host()[idx] += eps;
                batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                beta.host()[idx] = old;
                return result;
            };
            const float eps = 0.01;
            return (f(+eps)-f(-eps))/(2*eps);
        };


        resizable_tensor src_grad, gamma_grad, beta_grad;
        src_grad.copy_size(src);
        gamma_grad.copy_size(gamma);
        beta_grad.copy_size(beta);
        src_grad = 0;
        gamma_grad = 9;
        beta_grad = 9;

        batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);


        auto grad_error = compare_gradients(src_grad, grad_src);
        dlog << LINFO << "src error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);

        grad_error = compare_gradients(gamma_grad, grad_gamma);
        dlog << LINFO << "gamma error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);

        grad_error = compare_gradients(beta_grad, grad_beta);
        dlog << LINFO << "beta error: " << grad_error;
        DLIB_TEST(grad_error < 0.001);

    }

// ----------------------------------------------------------------------------------------

    void test_layer_normalize()
    {
        resizable_tensor x(2, 3, 4, 5);
        resizable_tensor y_cpu(x);
        tt::tensor_rand rnd(0);
        rnd.fill_uniform(x);
        resizable_tensor means_cpu(x.num_samples()), invstds_cpu(x.num_samples());
        resizable_tensor gamma(1, x.k(), x.nr(), x.nc()), beta(1, x.k(), x.nr(), x.nc());
        gamma = 1;
        beta = 0;
        const float eps = 1e-5;
        cpu::layer_normalize(eps, y_cpu, means_cpu, invstds_cpu, x, gamma, beta);
        // check that the per-sample mean and standard deviation are 0 and 1
        const float* p = y_cpu.host();
        for (long n = 0; n < y_cpu.num_samples(); ++n)
        {
            running_stats<float> rs;
            for (long k = 0; k < y_cpu.k(); ++k)
            {
                for (long r = 0; r < y_cpu.nr(); ++r)
                {
                    for (long c = 0; c < y_cpu.nc(); ++c)
                    {
                        rs.add(p[tensor_index(y_cpu, n, k, r, c)]);
                    }
                }
            }
            DLIB_TEST(::std::abs(rs.mean()) < 1e-6);
            DLIB_TEST(::std::abs(rs.stddev() - 1.0f) < 0.01);
        }
        // check that the CPU and the CUDA implementation are equivalent
#ifdef DLIB_USE_CUDA
        resizable_tensor y_cuda(x);
        resizable_tensor means_cuda(x.num_samples()), invstds_cuda(x.num_samples());
        cuda::layer_normalize(eps, y_cuda, means_cuda, invstds_cuda, x, gamma, beta);
        DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5);
        DLIB_TEST(max(abs(mat(means_cpu) - mat(means_cuda))) < 1e-5);
        DLIB_TEST(max(abs(mat(invstds_cpu) - mat(invstds_cuda))) < 1e-5);
        resizable_tensor gradient_input(x);
        resizable_tensor src_grad_cpu(x), gamma_grad_cpu(1, x.k(), x.nr(), x.nc()), beta_grad_cpu(1, x.k(), x.nr(), x.nc());
        resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k(), x.nr(), x.nc()), beta_grad_cuda(1, x.k(), x.nr(), x.nc());
        rnd.fill_gaussian(gradient_input);
        src_grad_cpu = 0;
        src_grad_cuda = 0;
        cpu::layer_normalize_gradient(eps, gradient_input, means_cpu, invstds_cpu, x, gamma, src_grad_cpu, gamma_grad_cpu, beta_grad_cpu);
        cuda::layer_normalize_gradient(eps, gradient_input, means_cuda, invstds_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, beta_grad_cuda);
        DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5);
        DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5);
        DLIB_TEST(max(abs(mat(beta_grad_cpu) - mat(beta_grad_cuda))) < 1e-5);
#endif
    }

// ----------------------------------------------------------------------------------------

    void test_basic_tensor_ops()
    {
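        // Exercise the basic tensor routines (affine_transform, multiply, add, memcpy,
        // alias_tensor, scale_columns, exp/log/log10, etc.) and check them against
        // equivalent dlib matrix expressions.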
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor dest, src(3,4), A(1,4), B(1,4);
        src = 2;
        dest.copy_size(src);
        affine_transform(dest, src, 2, 3);
        dlog << LINFO << mat(dest);
        matrix<float> truth1(3,4), truth2(3,4);

        truth1 = 2;
        DLIB_TEST(max(abs(truth1-mat(src))) < 1e-5);
        src *= 2;
        truth1 = 4;
        DLIB_TEST(max(abs(truth1-mat(src))) < 1e-5);
        src = 2;


        truth1 = 7;
        truth2 = 7, 10,  7,  7,
                 7, 10,  7,  7,
                 7, 10,  7,  7;
        DLIB_TEST(max(abs(truth1-mat(dest))) < 1e-5);

        A = 2;
        B = 3;
        A.host()[1] = 3;
        B.host()[1] = 4;
        dest = 0;
        affine_transform(dest, src, A, B);
        dlog << LINFO << mat(dest);
        DLIB_TEST(max(abs(truth2-mat(dest))) < 1e-5);

        A = matrix_cast<float>(gaussian_randm(3,4, 1));
        B = matrix_cast<float>(gaussian_randm(3,4, 2));
        affine_transform(dest, src, A, B);
        dlog << LINFO << mat(dest);
        matrix<float> truth3 = pointwise_multiply(mat(src), mat(A)) + mat(B);
        DLIB_TEST(max(abs(truth3-mat(dest))) < 1e-5);

        matrix<float> truth4 = pointwise_multiply(mat(A), mat(B));
        tt::multiply(false, A, A, B);
        DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5);
        truth4 = pointwise_multiply(mat(A), mat(B)) + mat(A);
        tt::multiply(true, A, A, B);
        DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5);

        matrix<float> truth5 = mat(B) > 0.1;
        dlog << LINFO << truth5;
        threshold(B, 0.1);
        DLIB_TEST(max(abs(truth5-mat(B))) < 1e-5);

        int cnt = 0;
        for(auto& x : A)
            x = cnt++;

        truth1.set_size(2,2);
        truth2.set_size(2,2);
        truth3.set_size(2,2);
        truth1 = 0,1,2,3;
        truth2 = 4,5,6,7;
        truth3 = 8,9,10,11;

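        // alias_tensor creates 2x2 views into A's underlying memory at the given offsets,
        // without copying anything.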
        alias_tensor at(2,2);
        auto A0 = at(A,0);
        auto A4 = at(A,4);
        auto A8 = at(const_cast<const resizable_tensor&>(A),8);
        DLIB_TEST(mat(A0) == truth1);
        DLIB_TEST(mat(at(A,4)) == truth2);
        DLIB_TEST(mat(A8) == truth3);

        A4 += uniform_matrix<float>(2,2,2);
        truth2 += 2;
        DLIB_TEST(mat(A4) == truth2);
        truth1 = trans(reshape_to_column_vector(truth1));
        truth2 = trans(reshape_to_column_vector(truth2));
        truth3 = trans(reshape_to_column_vector(truth3));

        DLIB_TEST(mat(A) == join_cols(truth1,join_cols(truth2,truth3)));

        affine_transform(A,A,1,2);
        truth1 += 2;
        truth2 += 2;
        truth3 += 2;
        DLIB_TEST(mat(at(A,4)) == reshape(truth2,2,2));
        DLIB_TEST(mat(A) == join_cols(truth1,join_cols(truth2,truth3)));

        {
            resizable_tensor dest(3,4);
            resizable_tensor A, B;
            A = dest;
            B = dest;

            tensor_rand rnd;
            rnd.fill_uniform(dest);
            rnd.fill_uniform(A);
            rnd.fill_uniform(B);

            dest.set_size(1,4);

            tt::multiply(false, dest, A, B);
            DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6); 

            A.set_size(1,4);
            rnd.fill_uniform(A);
            matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA);

            tt::multiply(false, dest, A, B);
            DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); 

            tt::multiply(false, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); 
            matrix<float> prevdest = mat(dest);
            tt::multiply(true, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-prevdest-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); 

            dest.set_size(3,4);
            tt::multiply(false, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6); 
            prevdest = mat(dest);
            tt::multiply(true, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6); 

            tt::multiply(false, dest, A, B);
            DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6); 
            prevdest = mat(dest);
            tt::multiply(true, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6); 
        }

        {
            resizable_tensor A, B, truth;
            A.set_size(2,3,4,5);
            truth.copy_size(A);
            B.copy_size(A);

            A = 4;
            B = 1;
            truth = 1;
            DLIB_TEST(max(abs(mat(B)- mat(truth))) < 1e-5);
            memcpy(A, truth);
            DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5);

            A = 4;
            A.host();
            B.host();
            memcpy(A, truth);
            DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5);

#ifdef DLIB_USE_CUDA
            A = 4;
            A.device();
            B.host();
            memcpy(A, truth);
            DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5);

            A = 4;
            A.device();
            B.device();
            memcpy(A, truth);
            DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5);

            A = 4;
            A.host();
            B.device();
            memcpy(A, truth);
            DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5);

            A = 4;
            A.host_write_only();
            B.device();
            memcpy(A, truth);
            DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5);
#endif
        }

        {
            const int nr = 5;
            const int nc = 6;
            tensor_rand rnd;
            resizable_tensor out1(nr,nc), m(nr,nc), v(nc), out2;
            rnd.fill_uniform(out1);
            rnd.fill_uniform(m);
            rnd.fill_uniform(v);

            tt::scale_columns(out1, m, v);
            out2 = scale_columns(mat(m), mat(v));
            DLIB_TEST(max(abs(mat(out1)-mat(out2))) < 1e-6);
        }

        {
            resizable_tensor A, B;
            A.set_size(11);
            B.copy_size(A);

            A = 4;
            B = 1;
            matrix<float> truth;


            alias_tensor at(5);
            A = 4;
            A.host();
            B.host();
            {
                // non-aliasing test
                auto aA = at(A,5);
                auto aB = at(B,5);
                memcpy(aA, aB);
                truth = {4,4,4,4,4,  1,1,1,1,1, 4};
                DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5);
            }
            {
                // aliasing test
                auto aA = at(A,1);
                auto aB = at(A,6);
                memcpy(aA, aB);
                truth = {4,1,1,1,1,  4,1,1,1,1, 4};
                DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5);
            }


#ifdef DLIB_USE_CUDA
            A = 4;
            A.device();
            B.host();
            {
                // non-aliasing test
                auto aA = at(A,5);
                auto aB = at(B,5);
                memcpy(aA, aB);
                truth = {4,4,4,4,4,  1,1,1,1,1, 4};
                DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5);
            }
            {
                // aliasing test
                auto aA = at(A,1);
                auto aB = at(A,6);
                memcpy(aA, aB);
                truth = {4,1,1,1,1,  4,1,1,1,1, 4};
                DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5);
            }


            A = 4;
            A.device();
            B.device();
            {
                // non-aliasing test
                auto aA = at(A,5);
                auto aB = at(B,5);
                memcpy(aA, aB);
                truth = {4,4,4,4,4,  1,1,1,1,1, 4};
                DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5);
            }
            {
                // aliasing test
                auto aA = at(A,1);
                auto aB = at(A,6);
                memcpy(aA, aB);
                truth = {4,1,1,1,1,  4,1,1,1,1, 4};
                DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5);
            }

            A = 4;
            A.host();
            B.device();
            {
                // non-aliasing test
                auto aA = at(A,5);
                auto aB = at(B,5);
                memcpy(aA, aB);
                truth = {4,4,4,4,4,  1,1,1,1,1, 4};
                DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5);
            }
            {
                // aliasing test
                auto aA = at(A,1);
                auto aB = at(A,6);
                memcpy(aA, aB);
                truth = {4,1,1,1,1,  4,1,1,1,1, 4};
                DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5);
            }

#endif
        }

        {
            resizable_tensor A(4,5), B(4);

            tensor_rand rnd;
            rnd.fill_uniform(A);
            rnd.fill_uniform(B);

            float alpha = 1.4;
            float beta = 0.5;

            matrix<float> a(mat(A)), b(mat(B));
            for (long c = 0; c < a.nc(); ++c)
            {
                set_colm(a,c) = beta*colm(a,c) + alpha*b;
            }

            tt::add(beta, A, alpha, B);
            DLIB_TEST_MSG(max(abs(mat(A)-a)) < 1e-6, max(abs(mat(A)-a)));

            beta = 0;
            for (long c = 0; c < a.nc(); ++c)
            {
                set_colm(a,c) = beta*colm(a,c) + alpha*b;
            }

            tt::add(beta, A, alpha, B);
            DLIB_TEST(max(abs(mat(A)-a)) < 1e-6);
        }

        {
            resizable_tensor A, B;
            A.set_size(2,3,4,5);
            B.set_size(2,3,4,5);

            tensor_rand rnd;
            rnd.fill_uniform(A);
            rnd.fill_uniform(B);

            matrix<float> truth;

            truth = 2*mat(A) + 3*mat(B);
            tt::add(2, A, 3, B);
            DLIB_TEST(max(abs(mat(A)-truth )) < 1e-6);


            rnd.fill_uniform(A);
            rnd.fill_uniform(B);
            truth = 0*mat(A) + 3*mat(B);
            tt::add(0, A, 3, B);
            DLIB_TEST(max(abs(mat(A)-truth )) < 1e-6);

            rnd.fill_uniform(A);
            rnd.fill_uniform(B);
            truth = 1*mat(A) + 0*mat(B);
            tt::add(1, A, 0, B);
            DLIB_TEST(max(abs(mat(A)-truth )) < 1e-6);


            rnd.fill_uniform(A);
            rnd.fill_uniform(B);
            truth = 0*mat(A) + 0*mat(B);
            tt::add(0, A, 0, B);
            DLIB_TEST(max(abs(mat(A)-truth )) < 1e-6);


            B.set_size(1,3,4,5);
            rnd.fill_uniform(A);
            rnd.fill_uniform(B);
            truth = 2*mat(A) + 3*join_cols(mat(B), mat(B));
            tt::add(2, A, 3, B);
            DLIB_TEST(max(abs(mat(A)-truth )) < 1e-6);
            DLIB_TEST(A.num_samples()==2);

            B.set_size(1,1,4,5);
            rnd.fill_uniform(A);
            rnd.fill_uniform(B);
            matrix<float> temp = join_rows(mat(B), join_rows(mat(B),mat(B)));
            truth = 2*mat(A) + 3*join_cols(temp,temp);
            tt::add(2, A, 3, B);
            DLIB_TEST_MSG(max(abs(mat(A)-truth )) < 1e-6, max(abs(mat(A)-truth )));

            B.set_size(1,3,1,1);
            rnd.fill_uniform(A);
            rnd.fill_uniform(B);
            resizable_tensor AA(A), BB(B);
            tt::add(2, A, 3, B);
            cpu::add(2, AA, 3, BB);
            DLIB_TEST_MSG(max(abs(mat(A)-mat(AA) )) < 1e-6, max(abs(mat(A)-mat(AA) )));
        }

        {
            print_spinner();
            resizable_tensor dest1(123,456), dest2(123,456);
            resizable_tensor src1(123,456), src2(123,456);

            tt::tensor_rand rnd;

            rnd.fill_uniform(src1); tt::affine_transform(src1, src1, 1, 2); src2 = src1;  // random in range [2, 3]
            dest1 = exp(mat(src1));
            tt::exp(dest2, src2);
            tt::exp(src2, src2); // should work in place
            DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2))));
            DLIB_TEST(max(abs(mat(dest1)-mat(src2))) < 1e-5);

            rnd.fill_uniform(src1); tt::affine_transform(src1, src1, 1, 2); src2 = src1;  // random in range [2, 3]
            dest1 = log(mat(src1));
            tt::log(dest2, src2);
            tt::log(src2, src2); // should work in place
            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
            DLIB_TEST(max(abs(mat(dest1)-mat(src2))) < 1e-5);

            rnd.fill_uniform(src1); tt::affine_transform(src1, src1, 1, 2); src2 = src1;  // random in range [2, 3]
            dest1 = log10(mat(src1));
            tt::log10(dest2, src2);
            tt::log10(src2, src2); // should work in place
            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
            DLIB_TEST(max(abs(mat(dest1)-mat(src2))) < 1e-5);

        }
    }

// ----------------------------------------------------------------------------------------

#ifdef DLIB_USE_CUDA

    void test_scale_channels()
    {
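        // Verify the CPU and CUDA scale_channels() implementations agree, both with and
        // without accumulation (the add_to flag).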
        tt::tensor_rand rnd;

        resizable_tensor dest1(2,3,4,5), dest2;
        rnd.fill_gaussian(dest1);
        dest2 = dest1;

        resizable_tensor src(2,3,4,5);
        resizable_tensor scales(2,3);
        rnd.fill_gaussian(src);
        rnd.fill_gaussian(scales);


        cpu::scale_channels(true, dest1, src, scales);
        cuda::scale_channels(true, dest2, src, scales);

        DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-6);

        cpu::scale_channels(false, dest1, src, scales);
        cuda::scale_channels(false, dest2, src, scales);

        DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-6);
    }

// ----------------------------------------------------------------------------------------

    void test_affine_rect()
    {
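        // Apply affine_transform() to random sub-rectangles and check that the CUDA and CPU
        // versions agree with each other and with a direct matrix computation.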
        dlib::rand rnd;

        for (int iter = 0; iter < 20; ++iter)
        {

            long nr = 1 + rnd.get_random_32bit_number()%10;
            long nc = 1 + rnd.get_random_32bit_number()%10;

            resizable_tensor dest1(nr,nc), dest2(nr,nc), src1(nr,nc), src2(nr,nc), src3(nr,nc);
            matrix<float> dest3;

            dest1 = 1;
            dest2 = 1;
            dest3 = mat(dest1);
            src1 = 2;
            src2 = 3;
            src3 = 4;

            point p1(rnd.get_random_32bit_number()%nc, rnd.get_random_32bit_number()%nr);
            point p2(rnd.get_random_32bit_number()%nc, rnd.get_random_32bit_number()%nr);
            rectangle rect(p1,p2);

            cuda::affine_transform(rect, dest1, src1, src2, src3, 2,3,4);

            cpu::affine_transform(rect, dest2, src1, src2, src3, 2,3,4);

            DLIB_TEST(mat(dest1) == mat(dest2));

            set_subm(dest3,rect) = 2*subm(mat(src1),rect) + 3*subm(mat(src2),rect) + 4*subm(mat(src3),rect);
            DLIB_TEST(dest3 == mat(dest1));

            dest1 = 1;
            tt::affine_transform(rect, dest1, src1, src2, src3, 2,3,4);
            DLIB_TEST(dest3 == mat(dest1));
        }
    }

    void test_conv()
    {
        cuda::tensor_conv conv1;
        cpu::tensor_conv conv2;

        dlib::rand prnd;
        for (int iter = 0; iter < 400; ++iter)
        {
            print_spinner();

            resizable_tensor data(prnd.get_random_32bit_number()%5+1,
                prnd.get_random_32bit_number()%5+1,
                prnd.get_random_32bit_number()%25+1,
                prnd.get_random_32bit_number()%25+1
            );
            resizable_tensor filters(
                prnd.get_random_32bit_number()%5+1,
                data.k(),
                prnd.get_random_32bit_number()%6+1,
                prnd.get_random_32bit_number()%6+1 
            );

            tt::tensor_rand rnd;
            rnd.fill_uniform(data);
            rnd.fill_uniform(filters);


            resizable_tensor output1, output2;


            const int stride_y = prnd.get_random_32bit_number()%5+1;
            const int stride_x = prnd.get_random_32bit_number()%5+1;
            int padding_y = prnd.get_random_32bit_number()%(filters.nr()/2+1);
            int padding_x = prnd.get_random_32bit_number()%(filters.nc()/2+1);
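            // If the randomly chosen padding leaves the padded input smaller than the
            // filter, grow the padding just enough for the filter to fit.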
            if (!(filters.nr() <= data.nr() + 2*padding_y))
                padding_y = (filters.nr()-data.nr()+1)/2;
            if (!(filters.nc() <= data.nc() + 2*padding_x))
                padding_x = (filters.nc()-data.nc()+1)/2;
            conv1.setup(data,filters,stride_y,stride_x,padding_y,padding_x);
            conv1(false, output1, data, filters);
            conv2.setup(data,filters,stride_y,stride_x,padding_y,padding_x);
            conv2(false, output2, data, filters);
            dlog << LINFO << "forward error: "<< max(abs(mat(output1)-mat(output2)));
            DLIB_TEST_MSG(max(abs(mat(output1)-mat(output2))) < 1e-3, max(abs(mat(output1)-mat(output2)))
                 <<"\n\t padding_y: "<< padding_y 
                 <<"\n\t padding_x: "<< padding_x 
                 );

            conv1(true, output1, data, filters);
            conv2(true, output2, data, filters);
            dlog << LINFO << "forward error: "<< max(abs(mat(output1)-mat(output2)));
            DLIB_TEST_MSG(max(abs(mat(output1)-mat(output2))) < 1e-3, max(abs(mat(output1)-mat(output2)))
                 <<"\n\t padding_y: "<< padding_y 
                 <<"\n\t padding_x: "<< padding_x 
                 );



            resizable_tensor gi, data_gradient1, data_gradient2;
            gi.copy_size(output1);
            rnd.fill_uniform(gi);

            data_gradient1.copy_size(data);
            data_gradient2.copy_size(data);
            data_gradient1 = 1;
            data_gradient2 = 1;

            conv1.get_gradient_for_data(true, gi, filters, data_gradient1);
            conv2.get_gradient_for_data(true, gi, filters, data_gradient2);

            dlog << LINFO << "data gradient error: "<< max(abs(mat(data_gradient1)-mat(data_gradient2)));
            DLIB_TEST(max(abs(mat(data_gradient1)-mat(data_gradient2))) < 1e-3);

            conv1.get_gradient_for_data(false, gi, filters, data_gradient1);
            conv2.get_gradient_for_data(false, gi, filters, data_gradient2);

            dlog << LINFO << "data gradient error: "<< max(abs(mat(data_gradient1)-mat(data_gradient2)));
            DLIB_TEST(max(abs(mat(data_gradient1)-mat(data_gradient2))) < 1e-3);


            resizable_tensor filter_gradient1, filter_gradient2;
            gi.copy_size(output1);
            rnd.fill_uniform(gi);

            filter_gradient1.copy_size(filters);
            filter_gradient2.copy_size(filters);
            filter_gradient1 = 1;
            filter_gradient2 = 1;

            conv1.get_gradient_for_filters(false, gi, data, filter_gradient1);
            conv2.get_gradient_for_filters(false, gi, data, filter_gradient2);

            dlog << LINFO << "filter gradient error: "<< max(abs(mat(filter_gradient1)-mat(filter_gradient2)));
            DLIB_TEST_MSG(max(abs(mat(filter_gradient1)-mat(filter_gradient2))) < 1e-3, max(abs(mat(filter_gradient1)-mat(filter_gradient2))));


            conv1.get_gradient_for_filters(true, gi, data, filter_gradient1);
            conv2.get_gradient_for_filters(true, gi, data, filter_gradient2);

            dlog << LINFO << "filter gradient error: "<< max(abs(mat(filter_gradient1)-mat(filter_gradient2)));
            DLIB_TEST_MSG(max(abs(mat(filter_gradient1)-mat(filter_gradient2))) < 2e-3, max(abs(mat(filter_gradient1)-mat(filter_gradient2))));
        }
    }

    void compare_adam()
    {
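        // Check that the CPU and CUDA Adam update kernels produce the same s, m, and v tensors.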
        float t = 2;
        tt::tensor_rand rnd;
        resizable_tensor s, m, v, params, params_grad;
        s.set_size(89,90,60,73);
        m.copy_size(s);
        v.copy_size(s);
        params.copy_size(s);
        params_grad.copy_size(s);

        rnd.fill_uniform(s);
        rnd.fill_uniform(m);
        rnd.fill_uniform(v);
        rnd.fill_uniform(params);
        rnd.fill_uniform(params_grad);

        resizable_tensor mm(m), vv(v);
        cpu::compute_adam_update(0,params.size(),s, mm, vv, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
        matrix<float> s1 = mat(s);
        
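        // Re-randomize s so we can tell whether the CUDA kernel actually overwrites it.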
        rnd.fill_uniform(s);
        cuda::compute_adam_update(0,params.size(),s, m, v, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
        matrix<float> s2 = mat(s);

        DLIB_TEST_MSG(max(abs(s1-s2)) < 1e-6, max(abs(s1-s2)));
        DLIB_TEST_MSG(max(abs(mat(m)-mat(mm))) < 1e-6, max(abs(mat(m)-mat(mm))));
        DLIB_TEST_MSG(max(abs(mat(v)-mat(vv))) < 1e-6, max(abs(mat(v)-mat(vv))));
    }

    void test_multiply_zero_padded()
    {
        print_spinner();
        dlib::rand rnd;
        tt::tensor_rand trnd;
        for (int iter = 0; iter < 300; ++iter)
        {
            resizable_tensor dest1(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);
            resizable_tensor dest2;
            dest2.copy_size(dest1);
            resizable_tensor src1(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);
            resizable_tensor src2(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);

            trnd.fill_uniform(dest1);
            trnd.fill_uniform(dest2);
            trnd.fill_uniform(src1);
            trnd.fill_uniform(src2);
            cpu::multiply_zero_padded(false, dest1, src1, src2);
            cuda::multiply_zero_padded(false, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);

            cpu::multiply_zero_padded(true, dest1, src1, src2);
            cuda::multiply_zero_padded(true, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
        }

        // make sure we have a test for the case where all tensors have the same
        // dimensions.
        resizable_tensor dest1(3,4,5,6);
        resizable_tensor dest2;
        resizable_tensor src1;
        resizable_tensor src2;
        dest2.copy_size(dest1);
        src1.copy_size(dest1);
        src2.copy_size(dest1);

        trnd.fill_uniform(dest1);
        trnd.fill_uniform(dest2);
        trnd.fill_uniform(src1);
        trnd.fill_uniform(src2);
        cpu::multiply_zero_padded(false, dest1, src1, src2);
        cuda::multiply_zero_padded(false, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);

        cpu::multiply_zero_padded(true, dest1, src1, src2);
        cuda::multiply_zero_padded(true, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
    }

    void test_add()
    {
        print_spinner();
        dlib::rand rnd;
        tt::tensor_rand trnd;
        for (int iter = 0; iter < 300; ++iter)
        {
            resizable_tensor dest1(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);
            resizable_tensor dest2;
            dest2.copy_size(dest1);
            resizable_tensor src1(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);
            resizable_tensor src2(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);

            trnd.fill_uniform(dest1);
            trnd.fill_uniform(dest2);
            trnd.fill_uniform(src1);
            trnd.fill_uniform(src2);
            cpu::add(dest1, src1, src2);
            cuda::add(dest2, src1, src2);

            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
        }

        // make sure we have a test for the case where all tensors have the same
        // dimensions.
        resizable_tensor dest1(3,4,5,6);
        resizable_tensor dest2;
        resizable_tensor src1;
        resizable_tensor src2;
        dest2.copy_size(dest1);
        src1.copy_size(dest1);
        src2.copy_size(dest1);

        trnd.fill_uniform(dest1);
        trnd.fill_uniform(dest2);
        trnd.fill_uniform(src1);
        trnd.fill_uniform(src2);

        cpu::add(dest1, src1, src2);
        cuda::add(dest2, src1, src2);

        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
    }

    void test_more_ops(const long nr, const long nc)
    {
        using namespace dlib::tt;
        print_spinner();
        // We are going to make sure that the CPU implementation of these things matches
        // the CUDA implementation.

        tensor_rand rnd;

        resizable_tensor dest(nr,nc), src(nr,nc), dest2, src2;
        resizable_tensor srcb(nr,nc), srcc(nr,nc), srcb2, srcc2;


        rnd.fill_uniform(dest);
        rnd.fill_uniform(src);
        dest2 = dest; src2 = src;
        cuda::multiply(false, dest, dest, src);
        cpu::multiply(false, dest2, dest2, src2);
        DLIB_TEST(equal(mat(dest),mat(dest2)));
        cuda::multiply(true, dest, dest, src);
        cpu::multiply(true, dest2, dest2, src2);
        DLIB_TEST(equal(mat(dest),mat(dest2)));


        rnd.fill_uniform(dest);
        rnd.fill_uniform(src);
        dest2 = dest; src2 = src;
        cuda::affine_transform(dest, src, 2, 3);
        cpu::affine_transform(dest2, src2, 2, 3);
        DLIB_TEST(equal(mat(dest),mat(dest2)));

        rnd.fill_uniform(dest);
        rnd.fill_uniform(src);
        rnd.fill_uniform(srcb);
        dest2 = dest; src2 = src; srcb2 = srcb;
        cuda::affine_transform(dest, src, srcb, 2, 3, 4);
        cpu::affine_transform(dest2, src2, srcb2, 2, 3, 4);
        DLIB_TEST(equal(mat(dest),mat(dest2)));

        rnd.fill_uniform(dest);
        rnd.fill_uniform(src);
        rnd.fill_uniform(srcb);
        rnd.fill_uniform(srcc);
        dest2 = dest; src2 = src; srcb2 = srcb; srcc2 = srcc;
        cuda::affine_transform(dest, src, srcb, srcc, 2, 3, 4, 5);
        cpu::affine_transform(dest2, src2, srcb2, srcc2, 2, 3, 4, 5);
        DLIB_TEST(equal(mat(dest),mat(dest2)));

        cuda::affine_transform(dest, src, srcb, srcc, 2, 3, 4, 0);
        cpu::affine_transform(dest2, src2, srcb2, srcc2, 2, 3, 4, 0);
        DLIB_TEST(equal(mat(dest),mat(dest2)));

        cuda::affine_transform_range(0, dest.size(), dest, src, srcb, srcc, 2, 3, 4);
        cpu::affine_transform_range(0, dest2.size(), dest2, src2, srcb2, srcc2, 2, 3, 4);
        DLIB_TEST(equal(mat(dest),mat(dest2)));

        if (3 < dest.size())
        {
            dest = 999;
            dest2 = 999;
            cuda::affine_transform_range(3, dest.size()-1, dest, src, srcb, srcc, 2, 3, 4);
            cpu::affine_transform_range(3, dest2.size()-1, dest2, src2, srcb2, srcc2, 2, 3, 4);
            DLIB_TEST(equal(mat(dest),mat(dest2)));

            cuda::affine_transform_range(dest.size(), dest.size(), dest, src, srcb, srcc, 2, 3, 4);
            cpu::affine_transform_range(dest2.size(), dest2.size(), dest2, src2, srcb2, srcc2, 2, 3, 4);
            DLIB_TEST(equal(mat(dest),mat(dest2)));
        }


        rnd.fill_uniform(dest);
        rnd.fill_uniform(src);
        rnd.fill_uniform(srcb);
        rnd.fill_uniform(srcc);
        dest2 = dest; src2 = src; srcb2 = srcb; srcc2 = srcc;
        cuda::affine_transform(dest, src, srcb, srcc);
        cpu::affine_transform(dest2, src2, srcb2, srcc2);
        DLIB_TEST(equal(mat(dest),mat(dest2)));
        // now exercise code path where the A/B tensors have num_samples()==1
        srcb.set_size(1,nc);
        srcc.set_size(1,nc);
        rnd.fill_uniform(dest);
        rnd.fill_uniform(src);
        rnd.fill_uniform(srcb);
        rnd.fill_uniform(srcc);
        dest2 = dest; src2 = src; srcb2 = srcb; srcc2 = srcc;
        cuda::affine_transform(dest, src, srcb, srcc);
        cpu::affine_transform(dest2, src2, srcb2, srcc2);
        DLIB_TEST(equal(mat(dest),mat(dest2)));


        rnd.fill_uniform(src);
        src2 = src;
        cuda::threshold(src, 0.5);
        cpu::threshold(src2, 0.5);
        DLIB_TEST(equal(mat(src),mat(src2)));

        {
            resizable_tensor dest(3,4);
            resizable_tensor A, B;
            A = dest;
            B = dest;

            rnd.fill_uniform(dest);
            rnd.fill_uniform(A);
            rnd.fill_uniform(B);

            dest.set_size(1,4);

            cuda::multiply(false, dest, A, B);
            DLIB_TEST_MSG(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6, max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B)))))); 

            A.set_size(1,4);
            rnd.fill_uniform(A);
            matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA);

            cuda::multiply(false, dest, A, B);
            DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); 

            cuda::multiply(false, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); 
            matrix<float> prevdest = mat(dest);
            cuda::multiply(true, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-prevdest-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); 

            dest.set_size(3,4);
            cuda::multiply(false, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6); 
            prevdest = mat(dest);
            cuda::multiply(true, dest, B, A);
            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6); 

            cuda::multiply(false, dest, A, B);
            DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6); 
        }

        {
            resizable_tensor invnorms1, invnorms2;
            resizable_tensor data(4,5), out1, out2;
            rnd.fill_uniform(data);

            const double eps = 0.1;

            invnorms2 = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps));
            tt::inverse_norms(invnorms1, data, eps);
            DLIB_TEST(max(abs(mat(invnorms1)-mat(invnorms2))) < 1e-6);

            out1.copy_size(data);
            tt::scale_rows(out1, data, invnorms1);
            out2 = scale_rows(mat(data), mat(invnorms1));
            DLIB_TEST(max(abs(mat(out1)-mat(out2))) < 1e-6);
        }

        {
            resizable_tensor a(123,432), b(123,432);
            rnd.fill_gaussian(a);
            rnd.fill_gaussian(b);

            resizable_tensor out;
            dot_prods(out, a,b);
            const matrix<float> truth = sum_cols(pointwise_multiply(mat(a), mat(b)));
            DLIB_TEST(max(abs(mat(out) - truth)) < 1e-4);
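            // Zero the output to confirm the test can detect a mismatch, then check the
            // overwriting (add_to=false) and accumulating (add_to=true) overloads of dot_prods().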
            out = 0;
            DLIB_TEST(max(abs(mat(out) - truth)) > 1e-2);
            dot_prods(false, out, a,b);
            DLIB_TEST(max(abs(mat(out) - truth)) < 1e-4);
            dot_prods(true, out, a,b);
            DLIB_TEST(max(abs(mat(out)/2 - truth)) < 1e-4);
            DLIB_TEST(max(abs(mat(out) - truth)) > 1e-2);
        }
    }

// ----------------------------------------------------------------------------------------

    void compare_bn_gpu_and_cpu()
    {
        print_spinner();
        resizable_tensor dest, dest2;
        resizable_tensor means, means2;
        resizable_tensor invstds, invstds2;
        resizable_tensor running_means, running_means2;
        resizable_tensor running_variances, running_variances2;
        resizable_tensor src(64,20,100,100);
        resizable_tensor gamma(1,20,100,100);
        resizable_tensor beta(1,20,100,100);
        gamma = 2;
        beta = 3;
        tt::tensor_rand rnd;
        rnd.fill_uniform(src);


        cpu::batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, invstds, 1, running_means, running_variances, src, gamma, beta);
        cuda::batch_normalize(DEFAULT_BATCH_NORM_EPS,dest2,means2,invstds2, 1, running_means2, running_variances2, src, gamma, beta);

        dlog << LINFO << "dest error:    "<< max(abs(mat(dest) -mat(dest2)));
        dlog << LINFO << "means error:   "<< max(abs(mat(means) -mat(means2)));
        dlog << LINFO << "invstds error: "<< max(abs(mat(invstds) -mat(invstds2)));
        dlog << LINFO << "running_means error:   "<< max(abs(mat(running_means) -mat(running_means2)));
        dlog << LINFO << "running_variances error: "<< max(abs(mat(running_variances) -mat(running_variances2)));

        DLIB_TEST(max(abs(mat(dest) -mat(dest2))) < 1e-4);
        DLIB_TEST(max(abs(mat(means) -mat(means2))) < 1e-4);
        DLIB_TEST(max(abs(mat(invstds) -mat(invstds2))) < 1e-4);
        DLIB_TEST(max(abs(mat(running_means) -mat(running_means2))) < 1e-4);
        DLIB_TEST_MSG(max(abs(mat(running_variances) -mat(running_variances2))) < 1e-4,
            mean(mat(running_variances)) 
            << "\n" << mean(mat(running_variances2))
            << "\n" << max(abs(mat(running_variances) -mat(running_variances2)))
            << "\n" << mean(abs(mat(running_variances) -mat(running_variances2)))
            );


        // now check that the gradients match as well
        resizable_tensor gradient_input;
        resizable_tensor src_grad, gamma_grad, beta_grad;
        resizable_tensor src_grad2, gamma_grad2, beta_grad2;
        gradient_input.copy_size(dest);
        src_grad.copy_size(src); src_grad = 0; src_grad2 = src_grad;
        gamma_grad.copy_size(gamma); gamma_grad = 0; gamma_grad2 = gamma_grad;
        beta_grad.copy_size(beta); beta_grad = 0; beta_grad2 = beta_grad;
        rnd.fill_uniform(gradient_input);


        cpu::batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
        cuda::batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);

        dlog << LINFO << "src_grad error:   " << max(abs(mat(src_grad)-mat(src_grad2)));
        dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
        dlog << LINFO << "beta_grad error:  " << max(abs(mat(beta_grad)-mat(beta_grad2)));
        DLIB_TEST(max(abs(mat(src_grad)-mat(src_grad2))) < 1e-4);
        DLIB_TEST(max(abs(mat(gamma_grad)-mat(gamma_grad2))) < 1e-4);
        DLIB_TEST(max(abs(mat(beta_grad)-mat(beta_grad2))) < 1e-4);
    }

    void compare_bn_conv_gpu_and_cpu()
    {
        print_spinner();
        resizable_tensor dest, dest2;
        resizable_tensor means, means2;
        resizable_tensor invstds, invstds2;
        resizable_tensor running_means, running_means2;
        resizable_tensor running_variances, running_variances2;
        resizable_tensor src(2,8,10,9);
        resizable_tensor gamma(1,8);
        resizable_tensor beta(1,8);
        gamma = 2;
        beta = 3;
        tt::tensor_rand rnd;
        rnd.fill_uniform(src);

        cpu::batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest,means,invstds,1,running_means,running_variances, src, gamma, beta);
        cuda::batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest2,means2,invstds2,1,running_means2,running_variances2, src, gamma, beta);

        dlog << LINFO << "dest error:    "<< max(abs(mat(dest) -mat(dest2)));
        dlog << LINFO << "means error:   "<< max(abs(mat(means) -mat(means2)));
        dlog << LINFO << "invstds error: "<< max(abs(mat(invstds) -mat(invstds2)));
        dlog << LINFO << "running_means error:   "<< max(abs(mat(running_means) -mat(running_means2)));
        dlog << LINFO << "running_variances error: "<< max(abs(mat(running_variances) -mat(running_variances2)));

        DLIB_TEST(max(abs(mat(dest) -mat(dest2))) < 1e-4);
        DLIB_TEST(max(abs(mat(means) -mat(means2))) < 1e-4);
        DLIB_TEST(max(abs(mat(invstds) -mat(invstds2))) < 1e-4);
        DLIB_TEST(max(abs(mat(running_means) -mat(running_means2))) < 1e-4);
        DLIB_TEST(max(abs(mat(running_variances) -mat(running_variances2))) < 1e-4);

        resizable_tensor gradient_input;
        resizable_tensor src_grad, gamma_grad, beta_grad;
        resizable_tensor src_grad2, gamma_grad2, beta_grad2;
        gradient_input.copy_size(dest);
        src_grad.copy_size(src); src_grad = 0; src_grad2 = src_grad;
        gamma_grad.copy_size(gamma); gamma_grad = 0; gamma_grad2 = gamma_grad;
        beta_grad.copy_size(beta); beta_grad = 0; beta_grad2 = beta_grad;
        rnd.fill_uniform(gradient_input);


        cpu::batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
        cuda::batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);

        dlog << LINFO << "src_grad error:   " << max(abs(mat(src_grad)-mat(src_grad2)));
        dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
        dlog << LINFO << "beta_grad error:  " << max(abs(mat(beta_grad)-mat(beta_grad2)));
        DLIB_TEST(max(abs(mat(src_grad)-mat(src_grad2))) < 1e-4);
        DLIB_TEST(max(abs(mat(gamma_grad)-mat(gamma_grad2))) < 1e-4);
        DLIB_TEST(max(abs(mat(beta_grad)-mat(beta_grad2))) < 1e-4);
    }


    void test_more_ops2()
    {
        dlib::rand rnd;
        tt::tensor_rand trand;

        for (int iter = 0; iter < 100; ++iter)
        {
            print_spinner();
            resizable_tensor dest1, dest2, src1, src2;
            src1.set_size(rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1);
            dest1.copy_size(src1);
            dest2.copy_size(src1);
            src2.set_size(1,src1.k(),1,1);

            trand.fill_uniform(dest1);
            trand.fill_uniform(dest2);
            trand.fill_uniform(src1);
            trand.fill_uniform(src2);

            cpu::multiply_conv(false, dest1, src1, src2);
            cuda::multiply_conv(false, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
            cpu::multiply_conv(true, dest1, src1, src2);
            cuda::multiply_conv(true, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);


            // now try the other mode of multiply_conv, where dest has shape (1,k,1,1) and the
            // elementwise products are summed over the sample and spatial dimensions
            src2.copy_size(src1);
            dest1.set_size(1,src1.k(),1,1);
            dest2.set_size(1,src1.k(),1,1);
            trand.fill_uniform(dest1);
            trand.fill_uniform(dest2);
            trand.fill_uniform(src1);
            trand.fill_uniform(src2);
            cpu::multiply_conv(false, dest1, src1, src2);
            cuda::multiply_conv(false, dest2, src1, src2);
            float scale = max(abs(mat(dest1)));
            float scalem = mean(abs(mat(dest1)));
            DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
            DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
            matrix<float> prevd2 = mat(dest2);
            cpu::multiply_conv(false, dest1, src1, src2);
            cuda::multiply_conv(true, dest2, src1, src2);
            scale = max(abs(mat(dest1)));
            scalem = mean(abs(mat(dest1)));
            DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)+prevd2))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)+prevd2))/scale);
            DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)+prevd2))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)+prevd2))/scalem);
        }

        for (int iter = 0; iter < 100; ++iter)
        {
            print_spinner();
            resizable_tensor dest1, dest2, src, A, B;
            src.set_size(rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1);
            dest1.copy_size(src);
            dest2.copy_size(src);
            A.set_size(1,src.k(),1,1);
            B.set_size(1,src.k(),1,1);

            trand.fill_uniform(dest1);
            trand.fill_uniform(dest2);
            trand.fill_uniform(src);
            trand.fill_uniform(A);
            trand.fill_uniform(B);

            cpu::affine_transform_conv(dest1, src, A, B);
            cuda::affine_transform_conv(dest2, src, A, B);
            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
        }

        for (int iter = 0; iter < 100; ++iter)
        {
            print_spinner();
            resizable_tensor dest1, dest2, g;
            g.set_size(rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1,
                rnd.get_random_32bit_number()%30+1);
            dest1.set_size(1,g.k(),1,1);
            dest2.set_size(1,g.k(),1,1);

            trand.fill_uniform(dest1);
            trand.fill_uniform(dest2);
            trand.fill_uniform(g);

            cpu::assign_conv_bias_gradient(dest1, g);
            cuda::assign_conv_bias_gradient(dest2, g);
            const float scale = max(abs(mat(dest1)));
            const float scalem = mean(abs(mat(dest1)));
            DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
            DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
        }

    }

#endif // DLIB_USE_CUDA

// ----------------------------------------------------------------------------------------

    void test_max_pool(
        const int window_height,
        const int window_width,
        const int stride_y,
        const int stride_x,
        const int padding_y,
        const int padding_x
    )
    {
        print_spinner();
        resizable_tensor A, B, gradient_input;
        A.set_size(4,5,16,7);
        B.copy_size(A);
        gradient_input.copy_size(A);

        tt::tensor_rand rnd;
        rnd.fill_gaussian(A,0,1);
        rnd.fill_gaussian(B,0,1);
        rnd.fill_gaussian(gradient_input,0,1);


        tt::pooling mp;

        mp.setup_max_pooling(window_height,window_width,stride_y,stride_x,padding_y,padding_x);
        mp(A, B);

        // make sure max pooling does what its spec says it should.
        DLIB_TEST( A.num_samples() == B.num_samples());
        DLIB_TEST( A.k() == B.k());

        DLIB_TEST( A.nr() == 1+(B.nr()+2*padding_y-window_height)/stride_y);
        DLIB_TEST( A.nc() == 1+(B.nc()+2*padding_x-window_width)/stride_x);
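        // The checks above use the standard pooling output size formula
        // out = 1 + (in + 2*pad - window)/stride.  For example, 16 input rows with a 3 row
        // window, stride 2, and padding 1 give 1 + (16 + 2 - 3)/2 == 8 output rows.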

        const long x_offset = window_width/2 - padding_x;
        const long y_offset = window_height/2 - padding_y;
        for (long s = 0; s < A.num_samples(); ++s)
        {
            for (long k = 0; k < A.k(); ++k)
            {
                for (long r = 0; r < A.nr(); ++r)
                {
                    for (long c = 0; c < A.nc(); ++c)
                    {
                        DLIB_TEST_MSG(image_plane(A,s,k)(r,c) == max(subm_clipped(image_plane(B,s,k),
                                    centered_rect(c*stride_x+x_offset,
                                                  r*stride_y+y_offset,
                                                  window_width,
                                                  window_height))), 
                                                  "padding: "<< padding_x << "  " << padding_y 
                                                  << " window size: " << window_width << " " << window_height 
                                                  << " stride: " << stride_x << " " << stride_y
                                                  );
                    }
                }
            }
        }
    }

// ----------------------------------------------------------------------------------------

    void test_avg_pool(
        const int window_height,
        const int window_width,
        const int stride_y,
        const int stride_x,
        const int padding_y,
        const int padding_x
    )
    {
        print_spinner();
        resizable_tensor A, B, gradient_input;
        A.set_size(4,5,16,7);
        B.copy_size(A);
        gradient_input.copy_size(A);

        tt::tensor_rand rnd;
        rnd.fill_gaussian(A,0,1);
        rnd.fill_gaussian(B,0,1);
        rnd.fill_gaussian(gradient_input,0,1);


        tt::pooling mp;

        mp.setup_avg_pooling(window_height,window_width,stride_y,stride_x,padding_y,padding_x);
        mp(A, B);

        // make sure avg pooling does what its spec says it should.
        DLIB_TEST( A.num_samples() == B.num_samples());
        DLIB_TEST( A.k() == B.k());
        DLIB_TEST( A.nr() == 1+(B.nr()+2*padding_y-window_height)/stride_y);
        DLIB_TEST( A.nc() == 1+(B.nc()+2*padding_x-window_width)/stride_x);

        const long x_offset = window_width/2 - padding_x;
        const long y_offset = window_height/2 - padding_y;
        for (long s = 0; s < A.num_samples(); ++s)
        {
            for (long k = 0; k < A.k(); ++k)
            {
                for (long r = 0; r < A.nr(); ++r)
                {
                    for (long c = 0; c < A.nc(); ++c)
                    {
                        float expected = mean(subm_clipped(image_plane(B,s,k),
                                            centered_rect(c*stride_x+x_offset,
                                                        r*stride_y+y_offset,
                                                        window_width,
                                                        window_height)));
                        float err = abs(image_plane(A,s,k)(r,c) - expected);
                        DLIB_TEST_MSG(err < 1e-5, err << "  " << expected << "  " << image_plane(A,s,k)(r,c));
                    }
                }
            }
        }
    }

// ----------------------------------------------------------------------------------------

    void test_layers()
    {
        {
            print_spinner();
            reorg_<2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            extract_<0,2,2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            extract_<3,2,1,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            extract_<0,2,1,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            upsample_<1,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            upsample_<2,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            upsample_<2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            upsample_<3,3> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            resize_to_<1,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            resize_to_<2,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            resize_to_<2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            l2normalize_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            multiply_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            max_pool_<3,3,1,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            avg_pool_<3,3,1,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            affine_ l(CONV_MODE);
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            affine_ l(FC_MODE);
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            bn_<CONV_MODE> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            bn_<FC_MODE> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            layer_norm_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            cont_<3,3,3,2,2,0,0> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            cont_<3,3,3,2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            cont_<3,3,3,1,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            cont_<3,3,3,1,1,0,0> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            cont_<3,2,2,2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            con_<3,2,2,2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            con_<3,3,3,1,1>l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            con_<3,3,2,1,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            con_<2,1,1,1,1> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            con_<3,0,2,2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            con_<3,2,0,2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            con_<3,0,0,2,2> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            fc_<1,FC_HAS_BIAS> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            fc_<5,FC_HAS_BIAS> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            fc_<4,FC_NO_BIAS> l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            relu_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            prelu_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            leaky_relu_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            sig_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            mish_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            htan_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            clipped_relu_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            elu_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            gelu_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            smelu_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            silu_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            softmax_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
        {
            print_spinner();
            softmax_all_ l;
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
    }

// ----------------------------------------------------------------------------------------

    template <unsigned long n, typename SUBNET> using rcon = max_pool<2,2,2,2,relu<bn_con<con<n,5,5,1,1,SUBNET>>>>;
    template <unsigned long n, typename SUBNET> using rfc = relu<bn_fc<fc<n,SUBNET>>>;

    void test_tagging(
    )
    {
        typedef loss_multiclass_log<rfc<10,skip1<rfc<84,rfc<120,tag1<rcon<16,rcon<6,input<matrix<unsigned char>>>>>>>>>> net_type;

        net_type net;
        net_type net2(num_fc_outputs(4));

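        // num_layers counts every node in the network graph, including tag and skip layers,
        // while num_computational_layers counts only the layers that actually perform computation.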
        DLIB_TEST(layer<tag1>(net).num_computational_layers == 8);
        DLIB_TEST(layer<skip1>(net).num_computational_layers == 8+3+3);
        DLIB_TEST(layer<tag1>(net).num_layers == 10);
        DLIB_TEST(layer<skip1>(net).num_layers == 10+3+3+1);
        DLIB_TEST(&layer<skip1>(net).get_output() == &layer<tag1>(net).get_output());
        DLIB_TEST(&layer<skip1>(net).get_output() != &layer<tag1>(net).subnet().subnet().get_output());
        DLIB_TEST(net.subnet().subnet().subnet().layer_details().get_num_outputs() == 10);
        DLIB_TEST(net2.subnet().subnet().subnet().layer_details().get_num_outputs() == 4);
    }

// ----------------------------------------------------------------------------------------

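    // The templates below assemble ResNet-style building blocks: `block` is two 3x3 convolutions
    // with batch norm (or affine) layers and a relu in between, `residual` adds the block's output
    // back onto its input via tag1/add_prev1, and `residual_down` does the same while downsampling
    // the skip connection with a 2x2 average pool so the tensor shapes still match.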
    template <
        int N, 
        template <typename> class BN, 
        int stride, 
        typename SUBNET
        > 
    using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;

    template <
        template <int,template<typename>class,int,typename> class block, 
        int N, 
        template<typename>class BN, 
        typename SUBNET
        >
    using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;

    template <
        template <int,template<typename>class,int,typename> class block, 
        int N, 
        template<typename>class BN, 
        typename SUBNET
        >
    using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;


    template <typename SUBNET> using res       = relu<residual<block,8,bn_con,SUBNET>>;
    template <typename SUBNET> using ares      = relu<residual<block,8,affine,SUBNET>>;
    template <typename SUBNET> using res_down  = relu<residual_down<block,8,bn_con,SUBNET>>;
    template <typename SUBNET> using ares_down = relu<residual_down<block,8,affine,SUBNET>>;

    template <typename SUBNET> 
    using pres  = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;

    void test_visit_functions()
    {
        using net_type2 = loss_multiclass_log<fc<10,
            avg_pool_everything<
            pres<res<res<res_down< // 2 prelu layers here
            tag4<repeat<9,pres,    // 9 groups, each containing 2 prelu layers  
            res_down<
            leaky_relu<res<
            input<matrix<unsigned char>>
            >>>>>>>>>>>>;

        net_type2 pnet;
        const net_type2& const_pnet = pnet;

        DLIB_TEST_MSG(pnet.num_layers == 132, pnet.num_layers);
        DLIB_TEST_MSG(pnet.num_computational_layers == 110, pnet.num_computational_layers);

        {
            std::vector<bool> hit(pnet.num_computational_layers, false);
            size_t count = 0;
            visit_layer_parameter_gradients(pnet, [&](size_t i, tensor& ){hit[i] = true; ++count; });
            for (auto x : hit)
                DLIB_TEST(x);
            DLIB_TEST(count == pnet.num_computational_layers);
        }
        {
            std::vector<bool> hit(pnet.num_computational_layers, false);
            size_t count = 0;
            visit_layer_parameter_gradients(const_pnet, [&](size_t i, const tensor& ){hit[i] = true; ++count; });
            for (auto x : hit)
                DLIB_TEST(x);
            DLIB_TEST(count == pnet.num_computational_layers);
        }

        {
            size_t count = 0;
            std::vector<bool> hit2(pnet.num_computational_layers, false);
            visit_layer_parameters(pnet, [&](size_t i, tensor& ){hit2[i] = true; ++count; });
            for (auto x : hit2)
                DLIB_TEST(x);
            DLIB_TEST(count == pnet.num_computational_layers);
        }
        {
            size_t count = 0;
            std::vector<bool> hit2(pnet.num_computational_layers, false);
            visit_layer_parameters(const_pnet, [&](size_t i, const tensor& ){hit2[i] = true; ++count; });
            for (auto x : hit2)
                DLIB_TEST(x);
            DLIB_TEST(count == pnet.num_computational_layers);
        }

        int num_relus = 0;
        visit_computational_layers(pnet, [&num_relus](relu_&) { ++num_relus; });
        DLIB_TEST(num_relus == 10);
        num_relus = 0;
        visit_computational_layers(const_pnet, [&num_relus](const relu_&) { ++num_relus; });
        DLIB_TEST(num_relus == 10);
        num_relus = 0;
        visit_computational_layers(const_pnet, [&num_relus](relu_&) { ++num_relus; });
        // No layers are visited in this case because a const network cannot be bound to the
        // non-const relu_ reference used in the lambda above.
        DLIB_TEST(num_relus == 0);

        DLIB_TEST(layer<leaky_relu>(pnet).layer_details().get_alpha() == 0.01f);
        visit_computational_layers(pnet, [](leaky_relu_& l) { l = leaky_relu_(0.001f); });
        DLIB_TEST(layer<leaky_relu>(pnet).layer_details().get_alpha() == 0.001f);

        // Make sure count_parameters() works, since it depends on visiting too.  Initially the
        // network has 0 parameters, but once we run something through it, it will allocate its
        // parameters.
        DLIB_TEST_MSG(count_parameters(pnet) == 0, "count_parameters(pnet): "<< count_parameters(pnet));
        const matrix<unsigned char> input = zeros_matrix<unsigned char>(40,40);
        pnet(input);
        DLIB_TEST_MSG(count_parameters(pnet) == 17606, "count_parameters(pnet): "<< count_parameters(pnet));

    }

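    // Reads element (i,k,r,c) from a tensor laid out in row-major N x K x NR x NC order, i.e.
    // at linear index ((i*t.k() + k)*t.nr() + r)*t.nc() + c.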
    float tensor_read_cpu(const tensor& t, long i, long k, long r, long c)
    {
        const float* p = t.host() + t.k() * t.nr() * t.nc() * i +
                        t.nr() * t.nc() * k + t.nc() * r + c;
        return *p;
    }
    void test_copy_tensor_cpu()
    {
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor dest(10, 9, 7, 15);
        resizable_tensor src1(10, 3, 7, 15);
        resizable_tensor src2(10, 3, 7, 15);
        resizable_tensor src3(10, 9, 7, 15);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(dest);
        rnd.fill_gaussian(src1);
        rnd.fill_gaussian(src2);
        rnd.fill_gaussian(src3);

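        // copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k) copies count_k
        // channel planes from src into dest at the given channel offsets (accumulating instead
        // of overwriting when add_to is true).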
        cpu::copy_tensor(false, dest, 0, src1, 0,  src1.k()); // copy all of src1's channels into dest
        cpu::copy_tensor(false, dest, src1.k(), src2, 0,  src2.k()); // copy all of src2's channels into dest, offset past src1's channels
        cpu::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3,  3); // copy 3 of src3's channels (starting at channel 3) into the remaining channels of dest


        for (long i = 0; i < dest.num_samples(); ++i)
        {
            for (long k = 0; k < dest.k(); ++k)
            {
                for (long r = 0; r < dest.nr(); ++r)
                {
                    for (long c = 0; c < dest.nc(); ++c)
                    {
                        float dest_value = tensor_read_cpu(dest, i, k, r, c);
                        // first part is from src1
                        if (k < src1.k())
                        {
                            float src_value = tensor_read_cpu(src1, i, k, r, c);
                            DLIB_TEST(src_value == dest_value);
                        }
                        // second part is from src2
                        else if (k < src1.k() + src2.k())
                        {
                            float src_value = tensor_read_cpu(src2, i, k - src1.k(), r, c);
                            DLIB_TEST(src_value == dest_value);
                        }
                        // third part is from src3
                        else
                        {
                            float src_value = tensor_read_cpu(src3, i, k - src1.k() - src2.k() + 3, r, c);
                            DLIB_TEST(src_value == dest_value);
                        }
                    }
                }
            }
        }
    }
    void test_copy_tensor_add_to_cpu()
    {
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor dest(10, 9, 7, 15);
        resizable_tensor src1(10, 3, 7, 15);
        resizable_tensor src2(10, 3, 7, 15);
        resizable_tensor src3(10, 9, 7, 15);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(dest);
        rnd.fill_gaussian(src1);
        rnd.fill_gaussian(src2);
        rnd.fill_gaussian(src3);

        const resizable_tensor old_dest = dest;

        cpu::copy_tensor(true, dest, 0, src1, 0,  src1.k()); // add all of src1's channels into dest
        cpu::copy_tensor(true, dest, src1.k(), src2, 0,  src2.k()); // add all of src2's channels into dest, offset past src1's channels
        cpu::copy_tensor(true, dest, src1.k() + src2.k(), src3, 3,  3); // add 3 of src3's channels (starting at channel 3) into the remaining channels of dest


        for (long i = 0; i < dest.num_samples(); ++i)
        {
            for (long k = 0; k < dest.k(); ++k)
            {
                for (long r = 0; r < dest.nr(); ++r)
                {
                    for (long c = 0; c < dest.nc(); ++c)
                    {
                        float old_dest_value = tensor_read_cpu(old_dest, i, k, r, c);
                        float dest_value = tensor_read_cpu(dest, i, k, r, c);
                        // first part is from src1
                        if (k < src1.k())
                        {
                            float src_value = tensor_read_cpu(src1, i, k, r, c)+old_dest_value;
                            DLIB_TEST(std::abs(src_value - dest_value) < 1e-6);
                        }
                        // second part is from src2
                        else if (k < src1.k() + src2.k())
                        {
                            float src_value = tensor_read_cpu(src2, i, k - src1.k(), r, c)+old_dest_value;
                            DLIB_TEST(std::abs(src_value - dest_value) < 1e-6);
                        }
                        // third part is from src3
                        else
                        {
                            float src_value = tensor_read_cpu(src3, i, k - src1.k() - src2.k() + 3, r, c)+old_dest_value;
                            DLIB_TEST(std::abs(src_value - dest_value) < 1e-6);
                        }
                    }
                }
            }
        }
    }
#ifdef DLIB_USE_CUDA
    void test_copy_tensor_gpu()
    {
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor dest(10, 9, 7, 15);
        resizable_tensor src1(10, 3, 7, 15);
        resizable_tensor src2(10, 3, 7, 15);
        resizable_tensor src3(10, 9, 7, 15);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(dest);
        rnd.fill_gaussian(src1);
        rnd.fill_gaussian(src2);
        rnd.fill_gaussian(src3);
        cuda::copy_tensor(false, dest, 0, src1, 0,  src1.k()); // copy all of src1's channels into dest
        cuda::copy_tensor(false, dest, src1.k(), src2, 0,  src2.k()); // copy all of src2's channels into dest, offset past src1's channels
        cuda::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3,  3); // copy 3 of src3's channels (starting at channel 3) into the remaining channels of dest


        for (long i = 0; i < dest.num_samples(); ++i)
        {
            for (long k = 0; k < dest.k(); ++k)
            {
                for (long r = 0; r < dest.nr(); ++r)
                {
                    for (long c = 0; c < dest.nc(); ++c)
                    {
                        float dest_value = tensor_read_cpu(dest, i, k, r, c);
                        // first part is from src1
                        if (k < src1.k())
                        {
                            float src_value = tensor_read_cpu(src1, i, k, r, c);
                            DLIB_TEST(src_value == dest_value);
                        }
                            // second part is from src2
                        else if (k < src1.k() + src2.k())
                        {
                            float src_value = tensor_read_cpu(src2, i, k - src1.k(), r, c);
                            DLIB_TEST(src_value == dest_value);
                        }
                            // third part is from src3
                        else
                        {
                            float src_value = tensor_read_cpu(src3, i, k - src1.k() - src2.k() + 3, r, c);
                            DLIB_TEST(src_value == dest_value);
                        }
                    }
                }
            }
        }
    }
    void test_copy_tensor_add_to_gpu()
    {
        using namespace dlib::tt;
        print_spinner();
        resizable_tensor dest(10, 9, 7, 15);
        resizable_tensor src1(10, 3, 7, 15);
        resizable_tensor src2(10, 3, 7, 15);
        resizable_tensor src3(10, 9, 7, 15);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(dest);
        rnd.fill_gaussian(src1);
        rnd.fill_gaussian(src2);
        rnd.fill_gaussian(src3);

        const resizable_tensor old_dest = dest;

        cuda::copy_tensor(true, dest, 0, src1, 0,  src1.k()); // add all of src1's channels into dest
        cuda::copy_tensor(true, dest, src1.k(), src2, 0,  src2.k()); // add all of src2's channels into dest, offset past src1's channels
        cuda::copy_tensor(true, dest, src1.k() + src2.k(), src3, 3,  3); // add 3 of src3's channels (starting at channel 3) into the remaining channels of dest


        for (long i = 0; i < dest.num_samples(); ++i)
        {
            for (long k = 0; k < dest.k(); ++k)
            {
                for (long r = 0; r < dest.nr(); ++r)
                {
                    for (long c = 0; c < dest.nc(); ++c)
                    {
                        float old_dest_value = tensor_read_cpu(old_dest, i, k, r, c);
                        float dest_value = tensor_read_cpu(dest, i, k, r, c);
                        // first part is from src1
                        if (k < src1.k())
                        {
                            float src_value = tensor_read_cpu(src1, i, k, r, c)+old_dest_value;
                            DLIB_TEST_MSG(std::abs(src_value - dest_value) < 1e-6, std::abs(src_value - dest_value));
                        }
                            // second part is from src2
                        else if (k < src1.k() + src2.k())
                        {
                            float src_value = tensor_read_cpu(src2, i, k - src1.k(), r, c)+old_dest_value;
                            DLIB_TEST(std::abs(src_value - dest_value) < 1e-6);
                        }
                            // third part is from src3
                        else
                        {
                            float src_value = tensor_read_cpu(src3, i, k - src1.k() - src2.k() + 3, r, c)+old_dest_value;
                            DLIB_TEST(std::abs(src_value - dest_value) < 1e-6);
                        }
                    }
                }
            }
        }
    }
#endif//DLIB_USE_CUDA

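    // inception3 runs the three blocks below in parallel on the same input and concatenates their
    // outputs along the channel dimension (5 + 8 + 1 == 14 channels for a single channel input).
    // test_concat() checks that this matches a manual concatenation done with copy_tensor().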
    template <typename SUBNET> using concat_block1 = con<5,1,1,1,1,SUBNET>;
    template <typename SUBNET> using concat_block2 = con<8,3,3,1,1,SUBNET>;
    template <typename SUBNET> using concat_block3 = max_pool<3,3,1,1,SUBNET>;
    template <typename SUBNET> using concat_incept = inception3<concat_block1,concat_block2,concat_block3,SUBNET>;

    void test_concat()
    {
        using namespace dlib::tt;
        print_spinner();

        using net_type = concat_incept<input<matrix<float>>>;

        resizable_tensor data(10, 1, 111, 222);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(data);

        net_type net;


        auto& out = net.forward(data);

        auto& b1o = layer<itag1>(net).get_output();
        auto& b2o = layer<itag2>(net).get_output();
        auto& b3o = layer<itag3>(net).get_output();

        resizable_tensor dest(10, 14, 111, 222);
        copy_tensor(false, dest, 0, b1o, 0,  b1o.k());
        copy_tensor(false, dest, b1o.k(), b2o, 0,  b2o.k());
        copy_tensor(false, dest, b1o.k() + b2o.k(), b3o, 0,  b3o.k());

        DLIB_TEST(dest.size() == out.size());
        int error = memcmp(dest.host(), out.host(), dest.size()*sizeof(float));
        DLIB_TEST(error == 0);

        resizable_tensor gr(10, 14, 111, 222);
        rnd.fill_gaussian(gr);

        resizable_tensor params;
        net.layer_details().backward(gr, net, params);

        auto& b1g = layer<itag1>(net).subnet().get_gradient_input();
        auto& b2g = layer<itag2>(net).subnet().get_gradient_input();
        auto& b3g = layer<itag3>(net).subnet().get_gradient_input();

        resizable_tensor g1(10, 5, 111, 222);
        resizable_tensor g2(10, 8, 111, 222);
        resizable_tensor g3(10, 1, 111, 222);

        copy_tensor(false, g1, 0, gr, 0,  g1.k());
        copy_tensor(false, g2, 0, gr, g1.k(), g2.k());
        copy_tensor(false, g3, 0, gr, g1.k() + g2.k(), g3.k());
        DLIB_TEST(g1.size() == b1g.size());
        error = memcmp(g1.host(), b1g.host(), b1g.size()*sizeof(float));
        DLIB_TEST(error == 0);
        DLIB_TEST(g2.size() == b2g.size());
        error = memcmp(g2.host(), b2g.host(), b2g.size()*sizeof(float));
        DLIB_TEST(error == 0);
        DLIB_TEST(g3.size() == b3g.size());
        error = memcmp(g3.host(), b3g.host(), b3g.size()*sizeof(float));
        DLIB_TEST(error == 0);
    }

// ----------------------------------------------------------------------------------------

    void test_simple_linear_regression()
    {
        const int num_samples = 1000;
        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<float> y(num_samples);
        ::std::default_random_engine generator(16);
        ::std::normal_distribution<float> distribution(0,0.1);
        const float true_intercept = 50.0;
        const float true_slope = 10.0;
        for ( int ii = 0; ii < num_samples; ++ii )
        {
            const double val = static_cast<double>(ii)/10;
            matrix<double> tmp(1,1);
            tmp = val;
            x[ii] = tmp;
            y[ii] = (true_intercept + true_slope*static_cast<float>(val) + distribution(generator));
        }

        using net_type = loss_mean_squared<fc<1, input<matrix<double>>>>;
        net_type net;
        layer<1>(net).layer_details().set_bias_learning_rate_multiplier(300);
        sgd defsolver(0,0.9);
        dnn_trainer<net_type> trainer(net, defsolver);
        trainer.set_learning_rate(1e-5);
        trainer.set_min_learning_rate(1e-6);
        trainer.set_mini_batch_size(50);
        trainer.set_max_num_epochs(170);
        trainer.train(x, y);

        const float slope = layer<1>(net).layer_details().get_weights().host()[0];
        const float slope_error = abs(true_slope - slope);
        const float intercept = layer<1>(net).layer_details().get_biases().host()[0];
        const float intercept_error = abs(true_intercept - intercept);
        const float eps_slope = 0.05, eps_intercept = 0.1;

        DLIB_TEST_MSG(slope_error <= eps_slope,
                      "Expected slope = " << true_slope << " Estimated slope = " << slope << " Error limit = " << eps_slope);
        DLIB_TEST_MSG(intercept_error <= eps_intercept,
                      "Expected intercept = " << true_intercept << " Estimated intercept = " << intercept << " Error limit = " << eps_intercept);

    }

// ----------------------------------------------------------------------------------------

    void test_simple_linear_regression_eil()
    {
        print_spinner();
        const int num_samples = 1000;
        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<float> y(num_samples);
        ::std::default_random_engine generator(16);
        ::std::normal_distribution<float> distribution(0,0.0001);
        const float true_intercept = 50.0;
        const float true_slope = 10.0;
        for ( int ii = 0; ii < num_samples; ++ii )
        {
            const double val = static_cast<double>(ii)/10;
            matrix<double> tmp(1,1);
            tmp = val;
            x[ii] = tmp;
            y[ii] = (true_intercept + true_slope*static_cast<float>(val) + distribution(generator));
        }

        using net_type = loss_epsilon_insensitive<fc<1, input<matrix<double>>>>;
        net_type net(0.01);
        layer<1>(net).layer_details().set_bias_learning_rate_multiplier(300);
        sgd defsolver(0,0.9);
        dnn_trainer<net_type> trainer(net, defsolver);
        trainer.set_learning_rate(1e-5);
        trainer.set_min_learning_rate(1e-8);
        trainer.set_mini_batch_size(50);
        trainer.set_max_num_epochs(570);
        trainer.train(x, y);

        const float slope = layer<1>(net).layer_details().get_weights().host()[0];
        const float slope_error = abs(true_slope - slope);
        const float intercept = layer<1>(net).layer_details().get_biases().host()[0];
        const float intercept_error = abs(true_intercept - intercept);
        const float eps_slope = 0.01, eps_intercept = 0.1;

        dlog << LINFO << "slope_error: "<< slope_error;
        dlog << LINFO << "intercept_error: "<< intercept_error;
        DLIB_TEST_MSG(slope_error <= eps_slope,
                      "Expected slope = " << true_slope << " Estimated slope = " << slope << " Error limit = " << eps_slope);
        DLIB_TEST_MSG(intercept_error <= eps_intercept,
                      "Expected intercept = " << true_intercept << " Estimated intercept = " << intercept << " Error limit = " << eps_intercept);

    }

// ----------------------------------------------------------------------------------------

    void test_simple_linear_regression_with_mult_prev()
    {
        srand(1234);
        print_spinner();
        const int num_samples = 1000;
        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<float> y(num_samples);
        const float true_slope = 2.0;
        for ( int ii = 0; ii < num_samples; ++ii )
        {
            const double val = static_cast<double>(ii-500)/100;
            matrix<double> tmp(1,1);
            tmp = val;
            x[ii] = tmp;
            y[ii] = ( true_slope*static_cast<float>(val*val));
        }

        randomize_samples(x,y);

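        // mult_prev1 multiplies the current layer's output elementwise with the output tagged by
        // tag1, so the network can represent products of linear functions of x and hence fit the
        // quadratic target y = 2*x*x.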
        using net_type = loss_mean_squared<fc<1, mult_prev1<fc<2,tag1<fc<2,input<matrix<double>>>>>>>>;
        net_type net;
        sgd defsolver(0,0.9);
        dnn_trainer<net_type> trainer(net, defsolver);
        trainer.set_learning_rate(1e-5);
        trainer.set_min_learning_rate(1e-11);
        trainer.set_mini_batch_size(50);
        trainer.set_max_num_epochs(2000);
        trainer.train(x, y);

        running_stats<double> rs;
        for (size_t i = 0; i < x.size(); ++i)
        {
            double val = y[i];
            double out = net(x[i]);
            rs.add(std::abs(val-out));
        }
        dlog << LINFO << "rs.mean(): " << rs.mean();
        dlog << LINFO << "rs.stddev(): " << rs.stddev();
        dlog << LINFO << "rs.max(): " << rs.max();
        DLIB_TEST(rs.mean() < 0.1);
    }

// ----------------------------------------------------------------------------------------

    void test_multioutput_linear_regression()
    {
        const int num_outputs = 2;
        const int num_samples = 1000;
        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<matrix<float>> y(num_samples);
        ::std::default_random_engine generator(16);
        ::std::normal_distribution<float> distribution(0,0.1);
        ::std::normal_distribution<float> slope_distribution(10,5);
        ::std::normal_distribution<float> intercept_distribution(50,10);
        ::std::vector<float> true_intercepts(num_outputs);
        ::std::vector<float> true_slopes(num_outputs);
        for ( int jj = 0; jj < num_outputs; ++jj )
        {
            true_slopes[jj] = slope_distribution(generator);
            true_intercepts[jj] = intercept_distribution(generator);
        }
        matrix<float> ytmp(num_outputs, 1);
        for ( int ii = 0; ii < num_samples; ++ii )
        {
            const double val = static_cast<double>(ii)/10;
            matrix<double> tmp(1,1);
            tmp = val;
            x[ii] = tmp;
            for ( int jj = 0; jj < num_outputs; ++jj )
                ytmp(jj, 0) = (true_intercepts[jj] + true_slopes[jj]*static_cast<float>(val) + distribution(generator));

            y[ii] = ytmp;
        }

        using net_type = loss_mean_squared_multioutput<fc<num_outputs, input<matrix<double>>>>;
        net_type net;
        layer<1>(net).layer_details().set_bias_learning_rate_multiplier(900);
        sgd defsolver(0,0.9);
        dnn_trainer<net_type> trainer(net, defsolver);
        trainer.set_learning_rate(1e-5);
        trainer.set_min_learning_rate(1e-6);
        trainer.set_mini_batch_size(50);
        trainer.set_max_num_epochs(170);
        trainer.train(x, y);

        float slope_error = 0.0;
        float intercept_error = 0.0;
        const float eps_slope = 0.05, eps_intercept = 0.1;

        for ( int jj = 0; jj < num_outputs; ++jj )
        {
            slope_error += abs(layer<1>(net).layer_details().get_weights().host()[jj] - true_slopes[jj]);
            intercept_error += abs(layer<1>(net).layer_details().get_biases().host()[jj] - true_intercepts[jj]);
        }

        slope_error /= float(num_outputs);
        intercept_error /= float(num_outputs);

        DLIB_TEST_MSG(slope_error <= eps_slope,
                      "Average absolute slope error = " << slope_error << " Error limit = " << eps_slope);
        DLIB_TEST_MSG(intercept_error <= eps_intercept,
                      "Average absolute intercept error = " << intercept_error << " Error limit = " << eps_intercept);

    }

// ----------------------------------------------------------------------------------------

    void test_simple_autoencoder()
    {
        print_spinner();

        srand(1234);

        const int output_width = 7;
        const int output_height = 7;
        const int num_samples = 100;
        ::std::vector<matrix<float>> x(num_samples);

        matrix<float> tmp(output_width, output_height);
        for (int i = 0; i < num_samples; ++i)
        {
            const int model = i % 4;

            for (int r = 0; r < output_height; ++r)
                for (int c = 0; c < output_width; ++c)
                    switch (model) {
                    // use floating point division so each model produces a distinct gradient
                    // pattern rather than a constant image
                    case 0: tmp(r, c) = static_cast<float>(r) / output_height; break;
                    case 1: tmp(r, c) = static_cast<float>(c) / output_width; break;
                    case 2: tmp(r, c) = 1.0f - static_cast<float>(r) / output_height; break;
                    case 3: tmp(r, c) = 1.0f - static_cast<float>(c) / output_width; break;
                    default: DLIB_TEST_MSG(false, "Invalid model: " << model << " (should be between 0 and 3)");
                    }

            x[i] = tmp;
        }

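        // The encoder con<4,output_height,output_width,2,2> collapses each input image into a
        // 1x1x4 code (the bottleneck verified below), and the decoder
        // cont<1,output_height,output_width,2,2> transposed convolution expands that code back
        // into a full sized image.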
        using net_type = loss_mean_squared_per_pixel<
                            cont<1,output_height,output_width,2,2,
                            relu<con<4,output_height,output_width,2,2,
                            input<matrix<float>>>>>>;
        net_type net;

        const auto autoencoder_error = [&x, &net, &output_height, &output_width]()
        {
            const auto y = net(x);
            double error = 0.0;
            for (size_t i = 0; i < x.size(); ++i)
                for (int r = 0; r < output_height; ++r)
                    for (int c = 0; c < output_width; ++c)
                        error += fabs(y[i](r, c) - x[i](r, c));

            return error / (x.size() * output_height * output_width);
        };

        // The autoencoder can't be very good before it's been trained
        // (or at least the probability of the reconstruction error
        // being small should be very low; in fact, the error ought to
        // be much higher than 0.01.  However, since the initialization
        // is random, setting the limit used below too high could make
        // the test fail when other, unrelated tests are added to the
        // sequence.)
        const double error_before = autoencoder_error();
        DLIB_TEST_MSG(error_before > 0.01, "Autoencoder error before training = " << error_before);

        // Make sure there's an information bottleneck, as intended
        const auto& output2 = dlib::layer<2>(net).get_output();
        DLIB_TEST(output2.nr() == 1);
        DLIB_TEST(output2.nc() == 1);
        DLIB_TEST(output2.k() == 4);

        sgd defsolver(0,0.9);
        dnn_trainer<net_type> trainer(net, defsolver);
        trainer.set_learning_rate(0.01);
        trainer.set_max_num_epochs(1000);
        trainer.train(x, x);

        // By now the autoencoder should have learned to reconstruct the training data essentially perfectly.
        const double error_after = autoencoder_error();
        DLIB_TEST_MSG(error_after < 1e-6, "Autoencoder error after training = " << error_after);
    }

// ----------------------------------------------------------------------------------------

    void test_loss_mean_squared_per_channel_and_pixel()
    {
        print_spinner();

        const int num_samples = 1000;
        const long num_channels = 10;
        const long dimension = 3;
        ::std::vector<matrix<float>> inputs;
        ::std::vector<::std::array<matrix<float>, num_channels>> labels;
        for (int i = 0; i < num_samples; ++i)
        {
            matrix<float> x = matrix_cast<float>(randm(5, dimension));
            matrix<float> w = matrix_cast<float>(randm(num_channels, 5));
            matrix<float> y = w * x;
            DLIB_CASSERT(y.nr() == num_channels);
            ::std::array<matrix<float>, num_channels> y_arr;
            // convert y to an array of matrices
            for (long c = 0; c < num_channels; ++c)
            {
                y_arr[c] = rowm(y, c);
            }
            inputs.push_back(::std::move(x));
            labels.push_back(::std::move(y_arr));
        }

        const long num_outputs = num_channels * dimension;
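        // extract<0, num_channels, 1, dimension> reshapes the num_outputs values coming out of the
        // fc layer into num_channels planes of size 1 x dimension, matching the per-channel labels.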
        using net_type = loss_mean_squared_per_channel_and_pixel<num_channels,
                            extract<0, num_channels, 1, dimension,
                            fc<num_outputs,
                            relu<bn_fc<fc<500,
                            input<matrix<float>>>>>>>>;
        net_type net;

        const auto compute_error = [&inputs, &labels, &net, num_channels]()
        {
            const auto out = net(inputs);
            double error = 0.0;
            for (size_t i = 0; i < out.size(); ++i)
            {
                for (long c = 0; c < num_channels; ++c)
                {
                    error += mean(squared(out[i][c] - labels[i][c]));
                }
            }
            return error / out.size() / num_channels;
        };

        const auto error_before = compute_error();
        dnn_trainer<net_type> trainer(net);
        trainer.set_learning_rate(0.1);
        trainer.set_iterations_without_progress_threshold(500);
        trainer.set_min_learning_rate(1e-6);
        trainer.set_mini_batch_size(50);
        trainer.set_max_num_epochs(100);
        trainer.train(inputs, labels);
        const auto error_after = compute_error();
        DLIB_TEST_MSG(error_after < error_before, "multi channel error increased after training");
#ifdef DLIB_USE_CUDA
        cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute;
        cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute;
        double cuda_loss, cpu_loss;
        const tensor& output_tensor = net.subnet().get_output();
        resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
        cuda_compute(labels.begin(), output_tensor, cuda_grad, cuda_loss);
        cpu_compute(labels.begin(), output_tensor, cpu_grad, cpu_loss);
        DLIB_TEST(cuda_grad.size() == cpu_grad.size());
        for (size_t i = 0; i < cuda_grad.size(); ++i)
        {
            DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
        }
        const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
        DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ");
#endif
    }

// ----------------------------------------------------------------------------------------

    void test_loss_binary_log_per_pixel_learned_params_on_trivial_two_pixel_task()
    {
        print_spinner();

        ::std::vector<matrix<float>> x({ matrix<float,2,1>({ -1, 1 }) });
        ::std::vector<matrix<float>> y({ matrix<float,2,1>({ -1, 1 }) });

        using net_type = loss_binary_log_per_pixel<con<1,1,1,1,1,input<matrix<float>>>>;
        net_type net;

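        // The target is simply the sign of the single input value, so the per-pixel log loss is
        // minimized by driving the 1x1 conv's weight to a large positive value while keeping its
        // bias near zero.  The checks below verify the trainer ends up in that regime.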
        dnn_trainer<net_type> trainer(net, sgd(0,0));
        trainer.set_learning_rate(1e7);
        trainer.set_max_num_epochs(1);
        trainer.train(x, y);

        const tensor& learned_params = layer<1>(net).layer_details().get_layer_params();
        const float* learned_params_data = learned_params.host();

        DLIB_TEST(learned_params_data[0] > 1e5);
        DLIB_TEST(abs(learned_params_data[1]) < 1);
    }

// ----------------------------------------------------------------------------------------

    void test_loss_binary_log_per_pixel_outputs_on_trivial_task()
    {
        print_spinner();

        constexpr int input_height = 7;
        constexpr int input_width = 5;
        constexpr int output_height = input_height;
        constexpr int output_width = input_width;
        constexpr int num_samples = 7;

        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<matrix<float>> y(num_samples);

        matrix<double> xtmp(input_height, input_width);
        matrix<float> ytmp(output_height, output_width);

        ::std::default_random_engine generator(16);
        ::std::normal_distribution<double> n01(0);

        const auto z = 0.674490; // This should give us a 50/50 split between the classes
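        // (0.6745 is roughly the 75th percentile of the standard normal, so |x| exceeds z with
        // probability 0.5.)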

        // Generate training data: random inputs x, and the corresponding target outputs y
        for (int ii = 0; ii < num_samples; ++ii) {
            for (int jj = 0; jj < input_height; ++jj) {
                for (int kk = 0; kk < input_width; ++kk) {
                    xtmp(jj, kk) = n01(generator);
                    ytmp(jj, kk) = std::abs(xtmp(jj, kk)) > z ? 1.f : -1.f;
                }
            }
            x[ii] = xtmp;
            y[ii] = ytmp;
        }

        using net_type = loss_binary_log_per_pixel<con<1,1,1,1,1,relu<con<10,1,1,1,1,input<matrix<double>>>>>>;
        net_type net;

        dnn_trainer<net_type> trainer(net, sgd(0, 0.9));
        trainer.set_learning_rate(1);
        trainer.set_max_num_epochs(800);
        trainer.train(x, y);

        // The learning task is easy, so the net should have no problem
        // getting all the outputs right.
        const auto response = net(x);
        for (int ii = 0; ii < num_samples; ++ii)
            for (int jj = 0; jj < output_height; ++jj)
                for (int kk = 0; kk < output_width; ++kk)
                    DLIB_TEST((response[ii](jj,kk) > 0) == (y[ii](jj,kk) > 0));
    }

// ----------------------------------------------------------------------------------------

    void test_loss_binary_log_per_pixel_with_noise_and_pixels_to_ignore()
    {
        // Test learning when some pixels are to be ignored, etc.

        print_spinner();

        constexpr int input_height = 5;
        constexpr int input_width = 7;
        constexpr int output_height = input_height;
        constexpr int output_width = input_width;
        const int num_samples = 1000;
        const double ignore_probability = 0.5;
        const double noise_probability = 0.05;

        ::std::default_random_engine generator(16);
        ::std::bernoulli_distribution ignore(ignore_probability);
        ::std::bernoulli_distribution noise_occurrence(noise_probability);
        ::std::bernoulli_distribution noisy_label(0.5);

        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<matrix<float>> y(num_samples);

        ::std::vector<int> truth_histogram(2);

        matrix<double> xtmp(input_height, input_width);
        matrix<float> ytmp(output_height, output_width);

        // The function to be learned.
        const auto ground_truth = [](const matrix<double>& x, int row, int column) {
            double sum = 0.0;
            const int first_column = std::max(0, column - 1);
            const int last_column = std::min(static_cast<int>(x.nc() - 1), column + 1);
            for (int c = first_column; c <= last_column; ++c) {
                sum += x(row, c);
            }
            DLIB_TEST(sum < 2.0 * (last_column - first_column + 1));
            return sum > (last_column - first_column + 1);
        };

        for ( int ii = 0; ii < num_samples; ++ii ) {
            for ( int jj = 0; jj < input_height; ++jj ) {
                for ( int kk = 0; kk < input_width; ++kk ) {
                    // Generate numbers between 0 and 2.
                    double value = static_cast<double>(ii + jj + kk) / 10.0;
                    value -= (static_cast<int>(value) / 2) * 2;
                    DLIB_TEST(value >= 0.0 && value < 2.0);
                    xtmp(jj, kk) = value;
                }
            }
            x[ii] = xtmp;

            for ( int jj = 0; jj < output_height; ++jj ) {
                for ( int kk = 0; kk < output_width; ++kk ) {
                    const bool truth = ground_truth(x[ii], jj, kk);
                    ++truth_histogram[truth];
                    if (ignore(generator)) {
                        ytmp(jj, kk) = 0.f;
                    }
                    else if (noise_occurrence(generator)) {
                        ytmp(jj, kk) = noisy_label(generator) ? 1.f : -1.f;
                    }
                    else {
                        ytmp(jj, kk) = truth ? 1.f : -1.f;
                    }
                }
            }

            y[ii] = ytmp;
        }

        const int num_total_elements = num_samples * output_height * output_width;

        { // Require a reasonably balanced truth histogram in order to make sure that a trivial classifier is not enough
            const int required_min_histogram_value = static_cast<int>(::std::ceil(num_total_elements / 2.0 * 0.375));
            for (auto histogram_value : truth_histogram) {
                DLIB_TEST_MSG(histogram_value >= required_min_histogram_value,
                              "Histogram value = " << histogram_value << ", required = " << required_min_histogram_value);
            }
        }

        using net_type = loss_binary_log_per_pixel<con<1,1,input_width,1,1,input<matrix<double>>>>;
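        // A single con layer whose 1 x input_width filter spans the full (zero-padded) row,
        // so its receptive field covers the 3-pixel neighborhood that ground_truth depends on.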
        net_type net;
        sgd defsolver(0,0.9);
        dnn_trainer<net_type> trainer(net, defsolver);
        trainer.set_learning_rate(0.1);
        trainer.set_min_learning_rate(0.01);
        trainer.set_mini_batch_size(50);
        trainer.set_max_num_epochs(170);
        trainer.train(x, y);

        const ::std::vector<matrix<float>> predictions = net(x);

        int num_correct = 0;

        for ( int ii = 0; ii < num_samples; ++ii ) {
            const matrix<float>& prediction = predictions[ii];
            DLIB_TEST(prediction.nr() == output_height);
            DLIB_TEST(prediction.nc() == output_width);
            for ( int jj = 0; jj < output_height; ++jj )
                for ( int kk = 0; kk < output_width; ++kk )
                    if ( (prediction(jj, kk) > 0.f) == ground_truth(x[ii], jj, kk) )
                        ++num_correct;
        }

        // First some sanity checks.
        const int num_correct_max = num_total_elements;
        DLIB_TEST(num_correct_max == ::std::accumulate(truth_histogram.begin(), truth_histogram.end(), 0));
        DLIB_TEST_MSG(num_correct <= num_correct_max,
                      "Number of correctly classified elements = " << num_correct << ", max = " << num_correct_max);

        // This is the real test, verifying that we have actually learned something.
        const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
        DLIB_TEST_MSG(num_correct >= num_correct_required,
                      "Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);

#ifdef DLIB_USE_CUDA
        cuda::compute_loss_binary_log_per_pixel cuda_compute;
        cpu::compute_loss_binary_log_per_pixel cpu_compute;
        double cuda_loss, cpu_loss;
        const tensor& output_tensor = net.subnet().get_output();
        resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
        cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss);
        cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss);
        DLIB_TEST(cuda_grad.size() == cpu_grad.size());
        for (size_t i = 0; i < cuda_grad.size(); ++i)
        {
            DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
        }
        const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
        DLIB_TEST_MSG(err < 1e-6, "binary log per pixel cuda and cpu losses differ");
#endif
    }

// ----------------------------------------------------------------------------------------

    void test_loss_multiclass_per_pixel_learned_params_on_trivial_single_pixel_task()
    {
        print_spinner();

        constexpr uint16_t num_classes = 7;
        constexpr uint16_t true_label = num_classes / 2;

        ::std::vector<matrix<float>> x({ matrix<float,1,1>({ 1 }) });
        ::std::vector<matrix<uint16_t>> y({ matrix<uint16_t,1,1>({ true_label }) });

        using net_type = loss_multiclass_log_per_pixel<con<num_classes,1,1,1,1,input<matrix<float>>>>;
        net_type net;

        dnn_trainer<net_type> trainer(net, sgd(0,0));
        trainer.set_learning_rate(1e7);
        trainer.set_max_num_epochs(1);
        trainer.train(x, y);

        const tensor& learned_params = layer<1>(net).layer_details().get_layer_params();
        const float* learned_params_data = learned_params.host();
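        // For this 1x1 con layer over a single input channel the parameter tensor stores the
        // num_classes filter weights first, followed by the num_classes biases, so
        // index = k + is_bias*num_classes addresses both blocks below.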

        for (int is_bias = 0; is_bias <= 1; ++is_bias) {
            for (uint16_t k = 0; k < num_classes; ++k) {
                size_t index = k + is_bias * num_classes;
                DLIB_TEST(index < learned_params.size());
                if (k == true_label) {
                    DLIB_TEST(learned_params_data[index] > 1e5);
                }
                else {
                    DLIB_TEST(learned_params_data[index] < -1e5);
                }
            }
        }
    }

// ----------------------------------------------------------------------------------------

    void test_loss_multiclass_per_pixel_activations_on_trivial_single_pixel_task()
    {
        print_spinner();

        constexpr int input_height = 35;
        constexpr int input_width = 27;
        constexpr int output_height = input_height;
        constexpr int output_width = input_width;
        constexpr int num_samples = 7;
        constexpr int num_classes = 5;

        ::std::vector<matrix<float>> x(num_samples);
        ::std::vector<matrix<uint16_t>> y(num_samples);

        matrix<float> xtmp(input_height, input_width);
        matrix<uint16_t> ytmp(output_height, output_width);

        ::std::default_random_engine generator(16);
        ::std::bernoulli_distribution coinflip(0.5);

        using filter_type = con<num_classes,1,1,1,1,input<matrix<float>>>;

        // Define a "truth" filter
        filter_type truth_filter;
        truth_filter(xtmp); // Set up the convolutional layer

        // Generate training data
        for (int ii = 0; ii < num_samples; ++ii) {
            // Generate random inputs x
            for (int jj = 0; jj < input_height; ++jj)
                for (int kk = 0; kk < input_width; ++kk)
                    xtmp(jj, kk) = coinflip(generator) ? 1.f : -1.f;
            x[ii] = xtmp;

            // Generate target output y by applying the truth filter on x
            const tensor& output = truth_filter(xtmp);
            const float* const out_data = output.host();
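            // The output holds one sample laid out as num_classes planes of nr x nc values in
            // row-major order, so element (row, column) of plane k sits at
            // (k*nr + row)*nc + column.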

            const auto out_element = [&](int row, int column, int k) {
                return out_data[(k * output.nr() + row) * output.nc() + column];
            };

            for (int jj = 0; jj < output_height; ++jj) {
                for (int kk = 0; kk < output_width; ++kk) {
                    uint16_t label = 0;
                    float max_value = out_element(jj, kk, 0);
                    for (long k = 1; k < num_classes; ++k) {
                        const float value = out_element(jj, kk, k);
                        if (value > max_value) {
                            label = static_cast<uint16_t>(k);
                            max_value = value;
                        }
                    }
                    ytmp(jj, kk) = label;
                }
            }
            y[ii] = ytmp;
        }

        using net_type = loss_multiclass_log_per_pixel<filter_type>;
        net_type net;

        dnn_trainer<net_type> trainer(net, sgd(0,0));
        trainer.set_learning_rate(1e6);
        trainer.set_max_num_epochs(1);
        trainer.train(x, y);

        // Feed forward the training samples.
        resizable_tensor temp_tensor;
        net.to_tensor(&x[0], &x[0] + num_samples, temp_tensor);
        net.forward(temp_tensor);
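        // to_tensor packs all the samples into one mini-batch tensor and forward() runs the
        // network on it; the subnet wrapper below exposes the output of the layer just under
        // the loss, i.e. the raw pre-softmax activations.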
        const dimpl::subnet_wrapper<filter_type> wsub(net.subnet());
        const tensor& output_tensor = wsub.get_output();
        const float* const out_data = output_tensor.host();

        // Let's have a look at the activations before softmax. They should be pretty high
        // (in terms of absolute value), because the learning task is trivial.
        for (int ii = 0; ii < num_samples; ++ii) {
            for (int jj = 0; jj < output_height; ++jj) {
                for (int kk = 0; kk < output_width; ++kk) {
                    const uint16_t true_label = y[ii](jj, kk);

                    for (long k = 0; k < num_classes; ++k) {
                        const size_t index = ((ii * output_tensor.k() + k) * output_tensor.nr() + jj) * output_tensor.nc() + kk;
                        DLIB_TEST(index < output_tensor.size());

                        if (k == true_label) {
                            DLIB_TEST(out_data[index] > 1e4);
                        }
                        else {
                            DLIB_TEST(out_data[index] < -1e4);
                        }
                    }
                }
            }
        }
    }

// ----------------------------------------------------------------------------------------

    void test_loss_multiclass_per_pixel_outputs_on_trivial_task()
    {
        print_spinner();

        constexpr int input_height = 7;
        constexpr int input_width = 5;
        constexpr int output_height = input_height;
        constexpr int output_width = input_width;
        constexpr int num_samples = 7;
        constexpr int num_classes = 5;
        constexpr int filter_height = 3;
        constexpr int filter_width = 3;

        ::std::vector<matrix<float>> x(num_samples);
        ::std::vector<matrix<uint16_t>> y(num_samples);

        matrix<float> xtmp(input_height, input_width);
        matrix<uint16_t> ytmp(output_height, output_width);

        ::std::default_random_engine generator(16);
        ::std::bernoulli_distribution coinflip(0.5);

        using filter_type = con<num_classes, filter_height, filter_width, 1, 1, input<matrix<float>>>;

        // Define a "truth" filter
        filter_type truth_filter;
        truth_filter(xtmp); // Set up the convolutional layer

        // Generate training data
        for (int ii = 0; ii < num_samples; ++ii) {
            // Generate random inputs x
            for (int jj = 0; jj < input_height; ++jj)
                for (int kk = 0; kk < input_width; ++kk)
                    xtmp(jj, kk) = coinflip(generator) ? 1.f : -1.f;
            x[ii] = xtmp;

            // Generate target output y by applying the truth filter on x
            const tensor& output = truth_filter(xtmp);
            const float* const out_data = output.host();

            const auto out_element = [&](int row, int column, int k) {
                return out_data[(k * output.nr() + row) * output.nc() + column];
            };

            for (int jj = 0; jj < output_height; ++jj) {
                for (int kk = 0; kk < output_width; ++kk) {
                    uint16_t label = 0;
                    float max_value = out_element(jj, kk, 0);
                    for (long k = 1; k < num_classes; ++k) {
                        const float value = out_element(jj, kk, k);
                        if (value > max_value) {
                            label = static_cast<uint16_t>(k);
                            max_value = value;
                        }
                    }
                    ytmp(jj, kk) = label;
                }
            }
            y[ii] = ytmp;
        }

        using net_type = loss_multiclass_log_per_pixel<filter_type>;
        net_type net;

        dnn_trainer<net_type> trainer(net, sgd(0, 0.9));
        trainer.set_learning_rate(1);
        trainer.set_max_num_epochs(2000);
        trainer.train(x, y);

        // The learning task is separable, so the net should have no problem
        // getting all the outputs right.
        DLIB_TEST(net(x) == y);
    }

// ----------------------------------------------------------------------------------------

    void test_loss_multiclass_per_pixel_with_noise_and_pixels_to_ignore()
    {
        // "Semantic segmentation" - see https://github.com/davisking/dlib/issues/288
        // Test learning when some pixels are to be ignored, etc.

        print_spinner();

        constexpr int input_height = 5;
        constexpr int input_width = 7;
        constexpr int output_height = input_height;
        constexpr int output_width = input_width;
        const int num_samples = 1000;
        const int num_classes = 6;
        const double ignore_probability = 0.5;
        const double noise_probability = 0.05;

        ::std::default_random_engine generator(16);
        ::std::bernoulli_distribution ignore(ignore_probability);
        ::std::bernoulli_distribution noise_occurrence(noise_probability);
        ::std::uniform_int_distribution<uint16_t> noisy_label(0, num_classes - 1);

        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<matrix<uint16_t>> y(num_samples);

        ::std::vector<int> truth_histogram(num_classes);

        matrix<double> xtmp(input_height, input_width);
        matrix<uint16_t> ytmp(output_height, output_width);

        // The function to be learned: a pixel's class is the integer part of the sum over its
        // (up to 3-wide) horizontal neighborhood.  Inputs lie in [0, 2), so the class is
        // always below num_classes.
        const auto ground_truth = [num_classes](const matrix<double>& x, int row, int column) {
            double sum = 0.0;
            const int first_column = std::max(0, column - 1);
            const int last_column = std::min(static_cast<int>(x.nc() - 1), column + 1);
            for (int c = first_column; c <= last_column; ++c) {
                sum += x(row, c);
            }
            DLIB_TEST(sum < num_classes);
            return static_cast<uint16_t>(sum);
        };

        for ( int ii = 0; ii < num_samples; ++ii ) {
            for ( int jj = 0; jj < input_height; ++jj ) {
                for ( int kk = 0; kk < input_width; ++kk ) {
                    // Generate numbers between 0 and 2.
                    double value = static_cast<double>(ii + jj + kk) / 10.0;
                    value -= (static_cast<int>(value) / 2) * 2;
                    DLIB_TEST(value >= 0.0 && value < 2.0);
                    xtmp(jj, kk) = value;
                }
            }
            x[ii] = xtmp;

            for ( int jj = 0; jj < output_height; ++jj ) {
                for ( int kk = 0; kk < output_width; ++kk ) {
                    uint16_t truth = ground_truth(x[ii], jj, kk);
                    DLIB_TEST(truth < num_classes);
                    ++truth_histogram[truth];
                    if (ignore(generator)) {
                        ytmp(jj, kk) = loss_multiclass_log_per_pixel_::label_to_ignore;
                    }
                    else if (noise_occurrence(generator)) {
                        ytmp(jj, kk) = noisy_label(generator);
                    }
                    else {
                        ytmp(jj, kk) = truth;
                    }
                }
            }

            y[ii] = ytmp;
        }

        const int num_total_elements = num_samples * output_height * output_width;

        { // Require a reasonably balanced truth histogram in order to make sure that a trivial classifier is not enough
            const int required_min_histogram_value = static_cast<int>(::std::ceil(num_total_elements / num_classes * 0.375));
            for (auto histogram_value : truth_histogram) {
                DLIB_TEST_MSG(histogram_value >= required_min_histogram_value,
                              "Histogram value = " << histogram_value << ", required = " << required_min_histogram_value);
            }
        }

        using net_type = loss_multiclass_log_per_pixel<bn_con<con<num_classes,1,input_width,1,1,input<matrix<double>>>>>;
        net_type net;
        sgd defsolver(0,0.9);
        dnn_trainer<net_type> trainer(net, defsolver);
        trainer.set_learning_rate(0.1);
        trainer.set_min_learning_rate(0.01);
        trainer.set_mini_batch_size(50);
        trainer.set_max_num_epochs(170);
        trainer.train(x, y);

        const ::std::vector<matrix<uint16_t>> predictions = net(x);

        int num_correct = 0;

        for ( int ii = 0; ii < num_samples; ++ii ) {
            const matrix<uint16_t>& prediction = predictions[ii];
            DLIB_TEST(prediction.nr() == output_height);
            DLIB_TEST(prediction.nc() == output_width);
            for ( int jj = 0; jj < output_height; ++jj )
                for ( int kk = 0; kk < output_width; ++kk )
                    if ( prediction(jj, kk) == ground_truth(x[ii], jj, kk) )
                        ++num_correct;
        }

        // First some sanity checks.
        const int num_correct_max = num_total_elements;
        DLIB_TEST(num_correct_max == ::std::accumulate(truth_histogram.begin(), truth_histogram.end(), 0));
        DLIB_TEST_MSG(num_correct <= num_correct_max,
                      "Number of correctly classified elements = " << num_correct << ", max = " << num_correct_max);

        // This is the real test, verifying that we have actually learned something.
        const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
        DLIB_TEST_MSG(num_correct >= num_correct_required,
                      "Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);

#ifdef DLIB_USE_CUDA
        cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
        cpu::compute_loss_multiclass_log_per_pixel cpu_compute;
        double cuda_loss, cpu_loss;
        const tensor& output_tensor = net.subnet().get_output();
        resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
        cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss);
        cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss);
        DLIB_TEST(cuda_grad.size() == cpu_grad.size());
        for (size_t i = 0; i < cuda_grad.size(); ++i)
        {
            DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
        }
        const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
        DLIB_TEST_MSG(err < 1e-6, "multiclass log per pixel cuda and cpu losses differ");
#endif
    }

// ----------------------------------------------------------------------------------------

    void test_loss_multiclass_per_pixel_weighted()
    {
        // Train with pixel-specific weights

        print_spinner();

        constexpr int input_height = 5;
        constexpr int input_width = 7;
        constexpr int output_height = input_height;
        constexpr int output_width = input_width;
        const int num_samples = 1000;
        const int num_classes = 6;

        ::std::default_random_engine generator(16);
        ::std::uniform_real_distribution<double> u01(0.0, 1.0);
        ::std::uniform_int_distribution<uint16_t> noisy_label(0, num_classes - 1);

        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<matrix<uint16_t>> y(num_samples);

        matrix<double> xtmp(input_height, input_width);
        matrix<uint16_t> ytmp(output_height, output_width);

        // Generate input data
        for (int ii = 0; ii < num_samples; ++ii) {
            for (int jj = 0; jj < input_height; ++jj) {
                for (int kk = 0; kk < input_width; ++kk) {
                    xtmp(jj, kk) = u01(generator);
                    ytmp(jj, kk) = noisy_label(generator);
                }
            }
            x[ii] = xtmp;
            y[ii] = ytmp;
        }

        using net_type = loss_multiclass_log_per_pixel_weighted<con<num_classes,1,1,1,1,input<matrix<double>>>>;
        using weighted_label = loss_multiclass_log_per_pixel_weighted_::weighted_label;

        ::std::vector<matrix<weighted_label>> y_weighted(num_samples);
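        // Each pixel's label gets paired with a weight.  The labels themselves are random, so
        // no class is genuinely easier to learn; slightly up-weighting one class should still
        // make the trained net favour it, which is what the check further below verifies.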

        for (int weighted_class = 0; weighted_class < num_classes; ++weighted_class) {

            print_spinner();

            // Assign weights
            for (int ii = 0; ii < num_samples; ++ii) {
                if (weighted_class == 0) {
                    y_weighted[ii].set_size(input_height, input_width);
                }
                for (int jj = 0; jj < input_height; ++jj) {
                    for (int kk = 0; kk < input_width; ++kk) {
                        const uint16_t label = y[ii](jj, kk);
                        const float weight
                            = label == weighted_class
                            ? 1.1f
                            : 0.9f;
                        y_weighted[ii](jj, kk) = weighted_label(label, weight);
                    }
                }
            }

            net_type net;
            sgd defsolver(0,0.9);
            dnn_trainer<net_type> trainer(net, defsolver);
            trainer.set_learning_rate(0.1);
            trainer.set_min_learning_rate(0.01);
            trainer.set_mini_batch_size(10);
            trainer.set_max_num_epochs(10);
            trainer.train(x, y_weighted);

            const ::std::vector<matrix<uint16_t>> predictions = net(x);

            int num_weighted_class = 0;
            int num_not_weighted_class = 0;

            for ( int ii = 0; ii < num_samples; ++ii ) {
                const matrix<uint16_t>& prediction = predictions[ii];
                DLIB_TEST(prediction.nr() == output_height);
                DLIB_TEST(prediction.nc() == output_width);
                for ( int jj = 0; jj < output_height; ++jj )
                    for ( int kk = 0; kk < output_width; ++kk )
                        if ( prediction(jj, kk) == weighted_class )
                            ++num_weighted_class;
                        else 
                            ++num_not_weighted_class;
            }

            DLIB_TEST_MSG(num_weighted_class > num_not_weighted_class,
                          "The weighted class (" << weighted_class << ") does not dominate: "
                          << num_weighted_class << " <= " << num_not_weighted_class);

#ifdef DLIB_USE_CUDA
            cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute;
            cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute;
            double cuda_loss, cpu_loss;
            const tensor& output_tensor = net.subnet().get_output();
            resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
            cuda_compute(y_weighted.begin(), output_tensor, cuda_grad, cuda_loss);
            cpu_compute(y_weighted.begin(), output_tensor, cpu_grad, cpu_loss);
            DLIB_TEST(cuda_grad.size() == cpu_grad.size());
            for (size_t i = 0; i < cuda_grad.size(); ++i)
            {
                DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
            }
            const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
            DLIB_TEST_MSG(err < 1e-6, "multi class log per pixel weighted cuda and cpu losses differ");
#endif
        }
    }

// ----------------------------------------------------------------------------------------

    void test_loss_multiclass_log_weighted()
    {

        print_spinner();

        constexpr int input_height = 5;
        constexpr int input_width = 7;
        const size_t num_samples = 1000;
        const size_t num_classes = 4;

        ::std::vector<matrix<double>> x(num_samples);
        ::std::vector<unsigned long> y(num_samples);

        matrix<double> xtmp(input_height, input_width);

        dlib::rand rnd;
        // Generate input data
        for (size_t ii = 0; ii < num_samples; ++ii)
        {
            for (int jj = 0; jj < input_height; ++jj)
            {
                for (int kk = 0; kk < input_width; ++kk)
                {
                    xtmp(jj, kk) = rnd.get_random_float();
                }
            }
            x[ii] = xtmp;
            y[ii] = rnd.get_integer_in_range(0, num_classes);
        }

        using net_type = loss_multiclass_log_weighted<fc<num_classes, input<matrix<double>>>>;

        ::std::vector<weighted_label<unsigned long>> y_weighted(num_samples);

        for (size_t weighted_class = 0; weighted_class < num_classes; ++weighted_class)
        {

            print_spinner();

            // Assign weights
            for (size_t ii = 0; ii < num_samples; ++ii)
            {
                const unsigned long label = y[ii];
                const float weight
                    = label == weighted_class
                    ? 1.4f
                    : 0.6f;
                y_weighted[ii] = weighted_label<unsigned long>(label, weight);
            }

            net_type net;
            sgd defsolver(0, 0.9);
            dnn_trainer<net_type> trainer(net, defsolver);
            trainer.set_learning_rate(0.1);
            trainer.set_min_learning_rate(0.01);
            trainer.set_mini_batch_size(10);
            trainer.set_max_num_epochs(10);
            trainer.train(x, y_weighted);

            const ::std::vector<unsigned long> predictions = net(x);

            int num_weighted_class = 0;
            int num_not_weighted_class = 0;

            for (size_t ii = 0; ii < num_samples; ++ii)
            {
                if (predictions[ii] == weighted_class)
                    ++num_weighted_class;
                else
                    ++num_not_weighted_class;
            }

            DLIB_TEST_MSG(num_weighted_class > num_not_weighted_class,
                          "The weighted class (" << weighted_class << ") does not dominate: "
                          << num_weighted_class << " <= " << num_not_weighted_class);
        }
    }

// ----------------------------------------------------------------------------------------

    void test_loss_multibinary_log()
    {
        print_spinner();
        dlib::rand rnd;

        const long dims = 3;
        const std::vector<float> empty_label(2, -1.f);
        std::vector<matrix<float, 0, 1>> samples;
        std::vector<std::vector<float>> labels(128, empty_label);
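        // Two independent binary labels per sample, encoded as +1 (present) / -1 (absent):
        // label 0 fires for norms below 6 and label 1 for norms of at least 3, so both fire
        // on [3, 6).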

        for (size_t i = 0; i < labels.size(); ++i)
        {
            matrix<float, 0, 1> x = matrix_cast<float>(randm(dims, 1)) * rnd.get_double_in_range(1, 9);
            const auto norm = sqrt(sum(squared(x)));
            if (norm < 3)
            {
                labels[i][0] = 1.f;
            }
            else if (3 <= norm && norm < 6)
            {
                labels[i][0] = 1.f;
                labels[i][1] = 1.f;
            }
            else
            {
                labels[i][1] = 1.f;
            }
            samples.push_back(std::move(x));
        }

        using net_type = loss_multibinary_log<fc<2, relu<bn_fc<fc<10, input<matrix<float, 0, 1>>>>>>>;
        net_type net;

        auto compute_error = [&net, &samples, &labels, dims]()
        {
            const auto preds = net(samples);
            double num_wrong = 0;
            for (size_t i = 0; i < labels.size(); ++i)
            {
                for (size_t j = 0; j < labels[i].size(); ++j)
                {
                    if ((labels[i][j] == 1 && preds[i][j] < 0) ||
                        (labels[i][j] == -1 && preds[i][j] > 0))
                    {
                        ++num_wrong;
                    }
                }
            }
            return num_wrong / labels.size() / dims;
        };

        dnn_trainer<net_type> trainer(net);
        const auto error_before = compute_error();
        trainer.set_learning_rate(0.1);
        trainer.set_iterations_without_progress_threshold(10);
        trainer.set_mini_batch_size(128);
        trainer.set_min_learning_rate(1e-3);
        trainer.train(samples, labels);
        const auto error_after = compute_error();

        DLIB_TEST_MSG(error_after < error_before && error_after == 0,
                      "multibinary_log error did not drop to zero after training: before = " << error_before << ", after = " << error_after);
    }

// ----------------------------------------------------------------------------------------

    void test_tensor_resize_bilinear(long samps, long k, long nr, long nc,  long onr, long onc)
    {
        resizable_tensor img(samps,k,nr,nc);
        resizable_tensor out(samps,k,onr,onc);
        resizable_tensor out2(samps,k,onr,onc);

        dlib::rand rnd;
        for (int iter = 0; iter < 10; ++iter)
        {
            print_spinner();

            const size_t idx = rnd.get_random_64bit_number()%img.size();

            img = 1;
            img.host()[idx] = 2;
            cpu::resize_bilinear(out, img);
#ifdef DLIB_USE_CUDA
            cuda::resize_bilinear(out2, img);
            DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5);
#endif

            resizable_tensor gradient_input;
            gradient_input.copy_size(out);
            tt::tensor_rand rnd;
            rnd.fill_uniform(gradient_input);

            const float h = 1e-2;

            img.host()[idx] = 2;
            cpu::resize_bilinear(out, img);
            float f1 = dot(out, gradient_input); 

            img.host()[idx] = 2+h;
            cpu::resize_bilinear(out, img);
            float f2 = dot(out, gradient_input); 
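            // Forward-difference estimate of d dot(resize_bilinear(img), gradient_input) / d img[idx].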

            const float numerical_grad = (f2-f1)/h;
            dlog << LINFO << "numerical grad: " << numerical_grad;


            resizable_tensor grad, grad2;
            grad.copy_size(img);
            grad = 0.1;
            grad2.copy_size(img);
            grad2 = 0.1;
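            // resize_bilinear_gradient adds into its output tensor, so grad/grad2 are
            // pre-filled with 0.1 and the analytic gradient is recovered by subtracting 0.1.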

            cpu::resize_bilinear_gradient(grad2, gradient_input);
            dlog << LINFO << "analytic grad: "<< grad2.host()[idx]-0.1;
            DLIB_TEST_MSG(std::abs(numerical_grad - grad2.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad2.host()[idx]+0.1) << "  numerical_grad: " << numerical_grad);

#ifdef DLIB_USE_CUDA
            cuda::resize_bilinear_gradient(grad, gradient_input);
            dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1;
            DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << "  numerical_grad: " << numerical_grad);
            DLIB_TEST(max(abs(mat(grad)-mat(grad2))) < 1e-5);
#endif

        }


        // now test with strided/sub-window calls
        alias_tensor aimg(samps, k, nr-2,nc-2);
        alias_tensor aout(samps, k, onr-2,onc-2);
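        // aimg/aout view (nr-2) x (nc-2) sub-windows of the full tensors; the offset nc*1+1
        // used below starts each window one row down and one column in, and the extra stride
        // arguments passed to resize_bilinear are the row and channel strides of the parent
        // tensors.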
        for (int iter = 0; iter < 10; ++iter)
        {
            print_spinner();

            const size_t idx = rnd.get_random_64bit_number()%img.size();

            img = 1;
            img.host()[idx] = 2;
            out = 9;
            out2 = 9;
            auto wout = aout(out, out.nc()*1+1);
            auto wimg = aimg(img, img.nc()*1+1);
            cpu::resize_bilinear(wout,out.nc(),out.nr()*out.nc(),  wimg,img.nc(),img.nr()*img.nc());
#ifdef DLIB_USE_CUDA
            auto wout2 = aout(out2, out2.nc()*1+1);
            cuda::resize_bilinear(wout2,out2.nc(),out2.nr()*out2.nc(),  wimg,img.nc(),img.nr()*img.nc());
            DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5);
#endif


            resizable_tensor gradient_input;
            gradient_input.copy_size(out);
            tt::tensor_rand rnd;
            rnd.fill_uniform(gradient_input);

            const float h = 1e-2;

            img.host()[idx] = 2;
            out = 0;
            wout = aout(out, out.nc()*1+1);
            wimg = aimg(img, img.nc()*1+1);
            cpu::resize_bilinear(wout,out.nc(),out.nr()*out.nc(),  wimg,img.nc(),img.nr()*img.nc());
            float f1 = dot(out, gradient_input); 

            img.host()[idx] = 2+h;
            out = 0;
            cpu::resize_bilinear(wout,out.nc(),out.nr()*out.nc(),  wimg,img.nc(),img.nr()*img.nc());
            float f2 = dot(out, gradient_input); 

            const float numerical_grad = (f2-f1)/h;
            dlog << LINFO << "numerical grad: " << numerical_grad;


            resizable_tensor grad, grad2;
            grad.copy_size(img);
            grad = 0.1;
            grad2.copy_size(img);
            grad2 = 0.1;

            auto wgrad2 = aimg(grad2, grad2.nc()*1+1);
            auto wgradient_input = aout(gradient_input, gradient_input.nc()*1+1);
            cpu::resize_bilinear_gradient(wgrad2,grad2.nc(),grad2.nr()*grad2.nc(),  wgradient_input,gradient_input.nc(),gradient_input.nr()*gradient_input.nc());
            dlog << LINFO << "analytic grad: "<< grad2.host()[idx]-0.1;
            DLIB_TEST_MSG(std::abs(numerical_grad - grad2.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad2.host()[idx]+0.1) << "  numerical_grad: " << numerical_grad);

#ifdef DLIB_USE_CUDA
            wgrad2 = aimg(grad, grad.nc()*1+1);
            wgradient_input = aout(gradient_input, gradient_input.nc()*1+1);
            cuda::resize_bilinear_gradient(wgrad2,grad.nc(),grad.nr()*grad.nc(),  wgradient_input,gradient_input.nc(),gradient_input.nr()*gradient_input.nc());
            dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1;
            DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << "  numerical_grad: " << numerical_grad);
            DLIB_TEST_MSG(max(abs(mat(grad)-mat(grad2))) < 1e-5, max(abs(mat(grad)-mat(grad2))));
#endif


        }
    }

// ----------------------------------------------------------------------------------------

    void test_serialization()
    {
        print_spinner();

        using net_type = loss_mean_squared<fc<1, input<matrix<double>>>>;
        net_type net, net2;

        std::ostringstream out;
        serialize(net, out);
        const std::string serialized = out.str();
        std::istringstream in(serialized);
        dlib::deserialize(net2, in);
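        // Also round-trip through in-memory buffers: dlib::serialize(buf) << obj writes the
        // bytes into a std::vector and dlib::deserialize(buf) >> obj reads them back; both
        // char and uint8_t buffers are exercised below.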
        
        std::vector<char> buf1;
        dlib::serialize(buf1) << net;
        std::vector<uint8_t> buf2(buf1.begin(), buf1.end());
        dlib::deserialize(buf2) >> net2;
    }

// ----------------------------------------------------------------------------------------

    void test_loss_dot()
    {
        print_spinner();

        std::vector<matrix<float,0,1>> samples;
        std::vector<matrix<float,0,1>> labels;

        const matrix<float> proj = matrix_cast<float>(randm(2,3));
        for (int i = 0; i < 128; ++i)
        {
            // The task is to learn the matrix proj, so the training data is generated by
            // projecting random vectors through it and normalizing the result:
            matrix<float,0,1> x = matrix_cast<float>(randm(3,1));
            matrix<float,0,1> y = normalize(proj*x);
            samples.push_back(x);
            labels.push_back(y);
        }

        using net_type = loss_dot<
            l2normalize<fc_no_bias<2, 
            input<matrix<float,0,1>> 
            >>>;
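        // loss_dot trains the network to maximize the dot product between its output and the
        // label.  Since the labels are unit vectors and the output is l2normalize'd, a perfect
        // fit gives dot(net(x), y) == 1, which is what the final check asserts.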

        net_type net;
        dnn_trainer<net_type> trainer(net, sgd(1e-4, 0.9));
        trainer.set_learning_rate(0.01);
        trainer.set_min_learning_rate(0.0000001);
        trainer.set_mini_batch_size(128);
        trainer.set_max_num_epochs(50000);
        trainer.train(samples, labels);


        for (size_t i = 0; i < samples.size(); ++i)
        {
            DLIB_TEST(std::abs(1-dot(net(samples[i]),labels[i])) < 0.001);
        }
    }

// ----------------------------------------------------------------------------------------

    void test_loss_multimulticlass_log()
    {
        print_spinner();
        std::map<string,std::vector<string>> all_labels;
        all_labels["c1"] = {"a", "b", "c"};
        all_labels["c2"] = {"d", "e", "f"};

        // make training data
        std::vector<matrix<float>> samples;
        std::vector<std::map<string,string>> labels;
        for (int i = 0; i < 3; ++i)
        {
            for (int j = 0; j < 3; ++j)
            {
                matrix<float> samp(2,3);
                samp = 0;
                samp(0,i) = 1;
                samp(1,j) = 1;
                samples.push_back(samp);

                std::map<string,string> l;
                if (i == 0) l["c1"] = "a";
                if (i == 1) l["c1"] = "b";
                if (i == 2) l["c1"] = "c";
                if (j == 0) l["c2"] = "d";
                if (j == 1) l["c2"] = "e";
                if (j == 2) l["c2"] = "f";
                labels.push_back(l);
            }
        }

        using net_type = loss_multimulticlass_log<
            fc<1,        
            input<matrix<float>> 
            >>;

        net_type net(all_labels);
        net.subnet().layer_details().set_num_outputs(net.loss_details().number_of_labels());
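        // The loss expects one network output per label summed over all classifiers
        // (3 + 3 = 6 here), so the fc layer is resized to match.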

        dnn_trainer<net_type> trainer(net, sgd(0.1));
        trainer.set_learning_rate(0.1);
        trainer.set_min_learning_rate(0.00001);
        trainer.set_iterations_without_progress_threshold(500);

        trainer.train(samples, labels);

        auto predicted_labels = net(samples);

        // make sure the network predicts the right labels
        for (size_t i = 0; i < samples.size(); ++i)
        {
            DLIB_TEST(predicted_labels[i]["c1"] == labels[i]["c1"]);
            DLIB_TEST(predicted_labels[i]["c2"] == labels[i]["c2"]);
        }

    }

// ----------------------------------------------------------------------------------------

    void test_layers_scale_and_scale_prev()
    {
        print_spinner();
        using net_type1 = scale1<con<3,1,1,1,1,avg_pool_everything<tag1<input_rgb_image>>>>;
        using net_type2 = scale_prev2<skip1<tag2<con<3,1,1,1,1,avg_pool_everything<tag1<input_rgb_image>>>>>>;
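        // These two networks express the same channel-wise scaling in two ways (scale vs.
        // scale_prev).  Once the con weights are copied across, their outputs must match
        // exactly and their parameter gradients must match to within a small epsilon.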

        dlib::tt::tensor_rand rnd;
        dlib::resizable_tensor x(1, 3, 64, 64);
        rnd.fill_gaussian(x);
        net_type1 net1;
        net_type2 net2;
        net1.forward(x);
        net2.forward(x);

        // make sure both convolutional layers have the same weights
        layer<3>(net2).layer_details() = layer<1>(net1).layer_details();
        const auto& params1 = layer<1>(net1).layer_details().get_layer_params();
        const auto& params2 = layer<3>(net2).layer_details().get_layer_params();
        DLIB_CASSERT(params1.size() == params2.size());
        for (size_t i = 0; i < params1.size(); ++i)
        {
            DLIB_CASSERT(*(params1.begin() + i) == *(params2.begin() + i));
        }
        net2.forward(x);

        // make sure both outputs are the same
        const auto& out1 = net1.get_output();
        const auto& out2 = net2.get_output();
        DLIB_TEST(out1.size() == out2.size());
        for (size_t i = 0; i < out1.size(); ++i)
        {
            DLIB_TEST(*(out1.begin() + i) == *(out2.begin() + i));
        }

        // make sure gradients are the same (within some precision)
        const double epsilon = 1e-4;
        dlib::resizable_tensor gradient(out1);
        rnd.fill_gaussian(gradient);

        net1.back_propagate_error(x, gradient);
        const auto& grad1 = layer<1>(net1).get_parameter_gradient();

        net2.back_propagate_error(x, gradient);
        const auto& grad2 = layer<3>(net2).get_parameter_gradient();

        DLIB_TEST(grad1.size() == grad2.size());
        for (size_t i = 0; i < grad1.size(); ++i)
        {
            DLIB_TEST(::std::abs(*(grad1.begin() + i) - *(grad2.begin() + i)) < epsilon);
        }
    }

// ----------------------------------------------------------------------------------------

    template <long num_filters, long ks, int s, typename SUBNET>
    using conp = add_layer<con_<num_filters, ks, ks, s, s, ks/2, ks/2>, SUBNET>;
    template <typename INPUT>
    using stem = add_layer<max_pool_<3, 3, 2, 2, 1, 1>, relu<bn_con<conp<16, 7, 2, INPUT>>>>;
    template <long num_filters, long growth_rate, typename SUBNET>
    using dense_layer = concat2<tag1, tag2,
                        tag2<conp<growth_rate, 3, 1,
                        relu<bn_con<conp<4 * growth_rate, 1, 1,
                        relu<bn_con<tag1<SUBNET>>>>>>>>>;
    template <typename SUBNET> using dense_layer_32 = dense_layer<32, 8, SUBNET>;
    void test_disable_duplicative_biases()
    {
        print_spinner();
        using net_type = fc<10, relu<layer_norm<fc<15, relu<bn_fc<fc<20,
                         relu<layer_norm<conp<32, 3, 1,
                         repeat<2, dense_layer_32,
                         stem<input_rgb_image>>>>>>>>>>>>;
        net_type net;
        DLIB_TEST(layer<0>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<3>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<6>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<9>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<12>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<15>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<21>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<24>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<31>(net).layer_details().bias_is_disabled() == false);
        disable_duplicative_biases(net);
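        // A bias should now be disabled exactly for those con/fc layers whose output feeds
        // directly into a batch norm or layer norm layer (where it would be redundant); all
        // other layers keep theirs.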
        DLIB_TEST(layer<0>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<3>(net).layer_details().bias_is_disabled() == true);
        DLIB_TEST(layer<6>(net).layer_details().bias_is_disabled() == true);
        DLIB_TEST(layer<9>(net).layer_details().bias_is_disabled() == true);
        DLIB_TEST(layer<12>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<15>(net).layer_details().bias_is_disabled() == true);
        DLIB_TEST(layer<21>(net).layer_details().bias_is_disabled() == false);
        DLIB_TEST(layer<24>(net).layer_details().bias_is_disabled() == true);
        DLIB_TEST(layer<31>(net).layer_details().bias_is_disabled() == true);
    }

// ----------------------------------------------------------------------------------------

    void test_set_learning_rate_multipliers()
    {
        print_spinner();
        using net_type = loss_binary_log<fc<2, relu<bn_con<con<16, 5, 5, 2, 2, input<matrix<float>>>>>>>;
        net_type net;
        set_all_learning_rate_multipliers(net, 0.5);
        DLIB_TEST(layer<1>(net).layer_details().get_learning_rate_multiplier() == 0.5);
        DLIB_TEST(layer<3>(net).layer_details().get_learning_rate_multiplier() == 0.5);
        DLIB_TEST(layer<4>(net).layer_details().get_learning_rate_multiplier() == 0.5);
        set_learning_rate_multipliers_range<2, 4>(net, 0.1);
        set_learning_rate_multipliers_range<4, 6>(net, 0.01);
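        // The ranges are half-open: <2,4> reaches the bn_con at index 3 while <4,6> reaches
        // the con at index 4, and the fc at index 1 keeps the multiplier set earlier.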
        DLIB_TEST(layer<1>(net).layer_details().get_learning_rate_multiplier() == 0.5);
        DLIB_TEST(layer<3>(net).layer_details().get_learning_rate_multiplier() == 0.1);
        DLIB_TEST(layer<4>(net).layer_details().get_learning_rate_multiplier() == 0.01);
    }

// ----------------------------------------------------------------------------------------

    template <typename SUBNET>
    using conblock = relu<bn_con<add_layer<con_<16, 3, 3, 2, 2, 1, 1>, SUBNET>>>;

    void test_input_output_mappers()
    {
        using net_type = loss_binary_log_per_pixel<con<1, 1, 1, 1, 1,repeat<3, conblock, tag1<input_rgb_image>>>>;
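        // Three stride-2 convolutions give a total downsampling factor of 8, so an input
        // point should map to p/8 in the output tensor and back to p*8.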
        net_type net;
        point p(32, 32);
        DLIB_TEST(input_tensor_to_output_tensor(net, p) == p / 8);
        DLIB_TEST(output_tensor_to_input_tensor(net, p) == p * 8);
    }

// ----------------------------------------------------------------------------------------

    // This test really just checks that the mmod loss does not go negative when a whole lot
    // of overlapping truth rectangles are given.
    void test_loss_mmod()
    {
        print_spinner();

        // Define input image size.
        constexpr int nc = 20;
        constexpr int nr = 20;

        constexpr int margin = 3;

        // Create a checkerboard pattern.
        std::deque<point> labeled_points;
        for (int y = margin; y < nr - margin; ++y)
            for (int x = margin + 1 - y % 2; x < nc - margin; x += 2)
                labeled_points.emplace_back(x, y);

        // Create training data that follows the generated pattern.
        typedef matrix<float> input_image_type;

        const auto generate_input_image = [&labeled_points, nr, nc]()
        {
            input_image_type sample(nr, nc);
            sample = -1.0;

            for (const auto& point : labeled_points)
                sample(point.y(), point.x()) = 1.0;

            return sample;
        };

        const auto generate_labels = [&labeled_points]()
        {
            const auto point_to_rect = [](const point& point) {
                constexpr int rect_size = 5;
                return centered_rect(
                    point.x(), point.y(),
                    rect_size, rect_size
                );
            };

            std::vector<mmod_rect> labels;

            std::transform(
                labeled_points.begin(),
                labeled_points.end(),
                std::back_inserter(labels),
                point_to_rect
            );

            return labels;
        };

        const input_image_type input_image = generate_input_image();
        const std::vector<mmod_rect> labels = generate_labels();

        mmod_options options(use_image_pyramid::no, { labels });
        options.be_quiet = true;

        // Define a simple network.
        using net_type = loss_mmod<con<1,5,5,1,1,con<1,5,5,2,2,input<input_image_type>>>>;
        net_type net(options);
        dnn_trainer<net_type> trainer(net, sgd(0.1));

        // Train the network. The loss is not supposed to go negative.
        for (int i = 0; i < 100; ++i) {
            print_spinner();
            trainer.train_one_step({ input_image }, { labels });
            DLIB_TEST(trainer.get_average_loss() >= 0.0);
        }

        // Inference should return something for the training data.
        const auto dets = net(input_image);
        DLIB_TEST(dets.size() > 0);

        // Indeed many truth objects should be found.
        const auto approximate_desired_det_count = (nr - 2 * margin) * (nc - 2 * margin) / 2.0;
        DLIB_TEST(dets.size() > approximate_desired_det_count * 0.45);
        DLIB_TEST(dets.size() < approximate_desired_det_count * 1.05);
    }

// ----------------------------------------------------------------------------------------

    void test_fuse_layers()
    {
        print_spinner();
        using net_type = fc<10, avg_pool_everything<relu<bn_con<con<16, 3, 3, 1, 1, input_rgb_image>>>>>;
        using net_type_fused = fc<10, avg_pool_everything<relu<affine<con<16, 3, 3, 1, 1, input_rgb_image>>>>>;
        net_type net_bias, net_nobias;
        disable_duplicative_biases(net_nobias);
        resizable_tensor x;
        matrix<rgb_pixel> image(8, 8);
        net_bias.to_tensor(&image, &image+1, x);
        net_nobias.to_tensor(&image, &image+1, x);
        net_bias.forward(x);
        net_nobias.forward(x);
        net_type_fused net_fused_bias(net_bias);
        net_type_fused net_fused_nobias(net_nobias);
        const resizable_tensor out_bias = net_bias.get_output();
        const resizable_tensor out_nobias = net_nobias.get_output();
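        // Constructing the affine-based net type from the bn_con nets converts each batch
        // norm into a fixed affine transform; fuse_layers then folds those transforms into
        // the preceding convolutions, which should leave the outputs essentially unchanged.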
        fuse_layers(net_fused_bias);
        fuse_layers(net_fused_nobias);
        net_fused_bias.forward(x);
        net_fused_nobias.forward(x);
        const resizable_tensor out_bias_fused = net_fused_bias.get_output();
        const resizable_tensor out_nobias_fused = net_fused_nobias.get_output();

        DLIB_TEST(max(squared(mat(out_bias) - mat(out_bias_fused))) < 1e-10);
        DLIB_TEST(max(squared(mat(out_nobias) - mat(out_nobias_fused))) < 1e-10);
    }

// ----------------------------------------------------------------------------------------

    void test_reorg()
    {
#ifdef DLIB_USE_CUDA
        print_spinner();
        resizable_tensor x(2, 4, 8, 16);
        resizable_tensor out_cpu(2, 16, 4, 8), out_cuda(2, 16, 4, 8);
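        // reorg with a 2x2 block is a space-to-depth rearrangement: every 2x2 spatial patch
        // becomes 4 extra channels, turning a (2,4,8,16) tensor into (2,16,4,8).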
        resizable_tensor grad_cpu(x), grad_cuda(x);
        tt::tensor_rand rnd;
        rnd.fill_gaussian(x);
        cpu::reorg(out_cpu, 2, 2, x);
        cuda::reorg(out_cuda, 2, 2, x);
        DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0);
        cpu::reorg_gradient(grad_cpu, 2, 2, out_cpu);
        cuda::reorg_gradient(grad_cuda, 2, 2, out_cuda);
        DLIB_TEST(max(squared(mat(grad_cuda) - mat(grad_cpu))) == 0);
#endif
    }

// ----------------------------------------------------------------------------------------

    class dnn_tester : public tester
    {
    public:
        dnn_tester (
        ) :
            tester ("test_dnn",
                "Runs tests on the deep neural network tools.")
        {}

        void run_tests (
        )
        {
            // make the tests repeatable
            srand(1234);

            test_tagging();
#ifdef DLIB_USE_CUDA
            test_affine_rect();
            test_conv();
            test_more_ops2();
            test_more_ops(1,1);
            test_more_ops(3,4);
            test_more_ops(4,3);
            test_more_ops(4,1);
            test_more_ops(1,4);
            test_more_ops(10000,4);
            compare_bn_gpu_and_cpu();
            compare_bn_conv_gpu_and_cpu();
            test_add();
            test_multiply_zero_padded();
            compare_adam();
            test_copy_tensor_gpu();
            test_copy_tensor_add_to_gpu();
            test_scale_channels();
#endif
            test_tensor_resize_bilinear(2, 3, 6,6, 11, 11);
            test_tensor_resize_bilinear(2, 3, 6,6, 3, 4);
            test_tensor_resize_bilinear(2, 3, 5,6, 12, 21);
            test_max_pool(1,1,2,3,0,0);
            test_max_pool(3,3,1,1,0,0);
            test_max_pool(3,3,2,2,0,0);
            test_max_pool(2,2,2,2,0,0);
            test_max_pool(4,5,3,1,0,0);
            test_avg_pool(1,1,2,3,0,0);
            test_avg_pool(3,3,1,1,0,0);
            test_avg_pool(3,3,2,2,0,0);
            test_avg_pool(2,2,2,2,0,0);
            test_avg_pool(4,5,3,1,0,0);
            test_avg_pool(4,4,2,2,0,0);
            test_avg_pool(4,5,40,50,0,0);
            test_max_pool(2,2,2,3,1,1);
            test_max_pool(3,3,1,1,1,1);
            test_max_pool(3,3,2,2,2,1);
            test_max_pool(2,2,2,2,1,0);
            test_max_pool(4,5,3,1,2,3);
            test_avg_pool(1,1,2,3,0,0);
            test_avg_pool(3,3,1,1,1,2);
            test_avg_pool(3,3,2,2,2,1);
            test_avg_pool(2,2,2,2,1,0);
            test_avg_pool(4,5,3,1,2,4);
            test_avg_pool(4,4,2,2,1,3);
            test_avg_pool(4,5,40,50,0,1);
            test_tanh();
            test_softmax();
            test_softmax_all();
            test_sigmoid();
            test_mish();
            test_leaky_relu();
            test_clipped_relu();
            test_elu();
            test_gelu();
            test_smelu();
            test_silu();
            test_batch_normalize();
            test_batch_normalize_conv();
            test_layer_normalize();
            test_basic_tensor_ops();
            test_layers();
            test_visit_functions();
            test_copy_tensor_cpu();
            test_copy_tensor_add_to_cpu();
            test_concat();
            test_simple_linear_regression();
            test_simple_linear_regression_eil();
            test_simple_linear_regression_with_mult_prev();
            test_multioutput_linear_regression();
            test_simple_autoencoder();
            test_loss_mean_squared_per_channel_and_pixel();
            test_loss_binary_log_per_pixel_learned_params_on_trivial_two_pixel_task();
            test_loss_binary_log_per_pixel_outputs_on_trivial_task();
            test_loss_binary_log_per_pixel_with_noise_and_pixels_to_ignore();
            test_loss_multiclass_per_pixel_learned_params_on_trivial_single_pixel_task();
            test_loss_multiclass_per_pixel_activations_on_trivial_single_pixel_task();
            test_loss_multiclass_per_pixel_outputs_on_trivial_task();
            test_loss_multiclass_per_pixel_with_noise_and_pixels_to_ignore();
            test_loss_multiclass_per_pixel_weighted();
            test_loss_multiclass_log_weighted();
            test_loss_multibinary_log();
            test_serialization();
            test_loss_dot();
            test_loss_multimulticlass_log();
            test_loss_mmod();
            test_layers_scale_and_scale_prev();
            test_disable_duplicative_biases();
            test_set_learning_rate_multipliers();
            test_input_output_mappers();
            test_fuse_layers();
            test_reorg();
        }

        void perform_test()
        {
            dlog << LINFO << "NOW RUNNING TESTS WITH set_dnn_prefer_fastest_algorithms()";
            set_dnn_prefer_fastest_algorithms();
            run_tests();

            dlog << LINFO << "NOW RUNNING TESTS WITH set_dnn_prefer_smallest_algorithms()";
            set_dnn_prefer_smallest_algorithms();
            run_tests();


            {
                resizable_tensor a(2,3,4,5);
                resizable_tensor b(2,3,4,5);
                DLIB_TEST(have_same_dimensions(a,b));

                a.set_size(2,3,4,4);
                DLIB_TEST(!have_same_dimensions(a,b));
                a.set_size(2,3,3,5);
                DLIB_TEST(!have_same_dimensions(a,b));
                a.set_size(2,2,4,5);
                DLIB_TEST(!have_same_dimensions(a,b));
                a.set_size(1,3,4,5);
                DLIB_TEST(!have_same_dimensions(a,b));

                static_assert(!is_image_type<resizable_tensor>::value, "should be false");
            }
        }
    } a;
}

#endif // __INTELLISENSE__