// dlib C++ Library - ffmpeg

// Copyright (C) 2023  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.

//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
/*
    PRIVATE IMPLEMENTATION DETAILS
    USERS, YOU DO NOT NEED TO READ THIS
*/
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
/*
    Notes for developers:

    The file structure for all things ffmpeg is as follows:

        - ffmpeg_details.h  : contains implementation details only and everything must be in the
                              dlib::ffmpeg::details namespace
        
        - ffmpeg_utils.h    : contains common public API. Definitions go at the bottom of the file
                              underneath a block comment saying "DEFINITIONS"
                              Also contains implementation details that depend on the public API.
                              This must still go in the dlib::ffmpeg::details namespace
        
        - ffmpeg_demuxer.h  : contains public API for all things decoding. Similarly, definitions go 
                              at the bottom of the file underneath a block comment saying "DEFINITIONS".

        - ffmpeg_muxer.h  :   contains public API for all things encoding. Similarly, definitions go 
                              at the bottom of the file underneath a block comment saying "DEFINITIONS".
                            
*/
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////

#ifndef DLIB_FFMPEG_DETAILS
#define DLIB_FFMPEG_DETAILS

#include "../test_for_odr_violations.h"

// Stop compilation with a readable message when DLIB_USE_FFMPEG is not defined,
// instead of letting the FFmpeg includes below fail in less obvious ways.
#ifndef DLIB_USE_FFMPEG
static_assert(false, "This version of dlib isn't built with the FFMPEG wrappers");
#endif

extern "C" {
#include <libavutil/dict.h>
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>
#include <libavutil/frame.h>
#include <libavutil/channel_layout.h>
#include <libavutil/audio_fifo.h>
#include <libavutil/imgutils.h>
#include <libavutil/log.h>
#include <libswscale/swscale.h>
#include <libswresample/swresample.h>
#include <libavformat/avformat.h>
#include <libavdevice/avdevice.h>
#include <libavcodec/avcodec.h>
}

#include <algorithm>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include "../logger.h"

// FFMPEG_HAS_CH_LAYOUT is defined when libavcodec is at least version 59.24.100.
// Further down this file it selects between the helpers written against
// AVChannelLayout / ch_layout and the ones written against the older
// channel_layout / channels fields.
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(59, 24, 100)
#define FFMPEG_HAS_CH_LAYOUT 1
#endif

namespace dlib { namespace ffmpeg { namespace details
{

// ---------------------------------------------------------------------------------------------------

    // Returns the dlib logger named "ffmpeg.internal". The logger is a
    // function-local static, so it is constructed the first time this
    // function is called and the same object is returned on every call.
    inline dlib::logger& logger_ffmpeg_private()
    {
        static dlib::logger ffmpeg_logger("ffmpeg.internal");
        return ffmpeg_logger;
    }
// ---------------------------------------------------------------------------------------------------

    // Performs the one-time global FFmpeg setup used by the dlib wrappers:
    //   - registers devices (and, on old library versions, codecs and formats)
    //   - installs a log callback that forwards FFmpeg messages to the dlib
    //     logger returned by logger_ffmpeg_private()
    // The work happens inside the initializer of a function-local static, so it
    // runs only once no matter how many times this function is called.
    inline void register_ffmpeg()
    {
        static const bool REGISTERED = []
        {
            avdevice_register_all();
#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(58, 10, 100)
            // See https://github.com/FFmpeg/FFmpeg/blob/70d25268c21cbee5f08304da95be1f647c630c15/doc/APIchanges#L91
            avcodec_register_all();
#endif
#if LIBAVFORMAT_VERSION_INT < AV_VERSION_INT(58, 9, 100) 
            // See https://github.com/FFmpeg/FFmpeg/blob/70d25268c21cbee5f08304da95be1f647c630c15/doc/APIchanges#L86
            av_register_all();
#endif

            av_log_set_callback([](void* ptr, int level, const char *fmt, va_list vl) 
            {
                auto& logger = logger_ffmpeg_private();

                char line[256] = {0};

                // FFmpeg requires log callbacks to be thread safe. The prefix state is
                // read and written by av_log_format_line2(), so keep one copy per thread
                // rather than sharing a single unsynchronized static.
                static thread_local int print_prefix = 1;

                // Only the low 8 bits hold the severity; the bits above them may carry
                // an AV_LOG_C() colour tint. Strip the tint so the switch below still
                // recognises the level.
                if (level >= 0)
                    level &= 0xff;

                // Not sure if copying to vl2 is required but internal ffmpeg functions do this...
                va_list vl2;
                va_copy(vl2, vl);
                int size = av_log_format_line2(ptr, level, fmt, vl2, &line[0], sizeof(line), &print_prefix);
                va_end(vl2);

                // A negative value signals a formatting error. It must not be used as an
                // index into line[], so drop the message.
                if (size < 0)
                    return;

                // The returned size may exceed the buffer when the message was truncated.
                size = std::min<int>(size, sizeof(line) - 1);
                line[size] = '\0';

                // Remove all '\n' since dlib's logger already adds one
                for (int i = size - 1 ; i >= 0 ; --i)
                    if (line[i] == '\n')
                        line[i] = ' ';

                switch(level)
                {
                    case AV_LOG_PANIC:
                    case AV_LOG_FATAL:      logger << LFATAL << line; break;
                    case AV_LOG_ERROR:      logger << LERROR << line; break;
                    case AV_LOG_WARNING:    logger << LWARN  << line; break;
                    case AV_LOG_INFO:       
                    case AV_LOG_VERBOSE:    logger << LINFO  << line; break;
                    case AV_LOG_DEBUG:      logger << LDEBUG << line; break;
                    case AV_LOG_TRACE:      logger << LTRACE << line; break;
                    default: break;
                }
            });

            return true;
        }();
        (void)REGISTERED;
    }

// ---------------------------------------------------------------------------------------------------

    // Converts an FFmpeg error code into text using av_strerror().
    // Returns the fixed string "couldn't set error" when av_strerror()
    // does not report success (a return value other than 0).
    inline std::string get_av_error(int ret)
    {
        char message[128] = {0};

        if (av_strerror(ret, message, sizeof(message)) != 0)
            return "couldn't set error";

        return message;
    }

// ---------------------------------------------------------------------------------------------------

    ///////////////////////////
    // Channel layout stuff 
    ///////////////////////////

    // Maps a channel count to a channel layout mask.
    // Only 1 (mono) and 2 (stereo) are handled; any other count trips
    // DLIB_CASSERT and, should that return, yields 0.
    inline uint64_t get_layout_from_channels(const std::size_t nchannels)
    {
        // Several layouts can share one channel count, so this mapping is a
        // simplification that is good enough for dlib.
        if (nchannels == 1)
            return AV_CH_LAYOUT_MONO;

        if (nchannels == 2)
            return AV_CH_LAYOUT_STEREO;

        DLIB_CASSERT(false, "Don't support " << nchannels << " yet");
        return 0;
    }

#if FFMPEG_HAS_CH_LAYOUT

    // Builds a native-order AVChannelLayout from a channel mask.
    //
    // - Every field not set explicitly below is zero-initialised, so no
    //   indeterminate values are copied around with the returned struct.
    // - nb_channels is the number of bits set in the mask. That gives the
    //   same count as before for every predefined layout (mono = 1,
    //   stereo = 2, ..., 22.2 = 24) and also handles masks that are not
    //   predefined layouts, which previously produced a count of 0.
    // - A mask of 0 yields nb_channels == 0.
    inline AVChannelLayout convert_layout(const uint64_t channel_layout)
    {
        AVChannelLayout ch_layout{};
        ch_layout.order  = AV_CHANNEL_ORDER_NATIVE;
        ch_layout.u.mask = channel_layout;

        // Count set bits; each iteration clears the lowest set bit.
        int nb_channels = 0;
        for (uint64_t bits = channel_layout ; bits != 0 ; bits &= bits - 1)
            ++nb_channels;
        ch_layout.nb_channels = nb_channels;

        return ch_layout;
    }

    // Returns a human readable description of ch_layout, or an empty string
    // when av_channel_layout_describe() fails.
    //
    // av_channel_layout_describe() reports the number of bytes needed to hold
    // the output, which counts the terminating '\0'. Two consequences:
    //   - a value larger than the buffer means the text was truncated, so the
    //     call is repeated with a buffer of the reported size
    //   - the result must be cut at the terminator, otherwise the returned
    //     std::string would contain a trailing '\0' character
    inline std::string get_channel_layout_str(const AVChannelLayout& ch_layout)
    {
        std::string str(32, '\0');
        int ret = av_channel_layout_describe(&ch_layout, &str[0], str.size());

        if (ret > 0 && static_cast<std::size_t>(ret) > str.size())
        {
            str.assign(static_cast<std::size_t>(ret), '\0');
            ret = av_channel_layout_describe(&ch_layout, &str[0], str.size());
        }

        if (ret <= 0)
            return std::string();

        const auto terminator = str.find('\0');
        if (terminator != std::string::npos)
            str.resize(terminator);

        return str;
    }

    inline std::string get_channel_layout_str(uint64_t channel_layout)
    {
        return get_channel_layout_str(convert_layout(channel_layout));
    }

    inline std::string get_channel_layout_str(const AVCodecContext* pCodecCtx)
    {
        return get_channel_layout_str(pCodecCtx->ch_layout);
    }

    // True when av_channel_layout_check() returns 0 for the codec context's
    // ch_layout. pCodecCtx is dereferenced without a null check.
    inline bool channel_layout_empty(const AVCodecContext* pCodecCtx)
    {
        const int check = av_channel_layout_check(&pCodecCtx->ch_layout);
        return check == 0;
    }

    // True when frame is non-null and av_channel_layout_check() returns 0 for
    // its ch_layout.
    // NOTE(review): a null frame yields false, i.e. "not empty" - confirm that
    // callers expect this.
    inline bool channel_layout_empty(const AVFrame* frame)
    {
        if (frame == nullptr)
            return false;

        return av_channel_layout_check(&frame->ch_layout) == 0;
    }

    // Returns ch_layout.u.mask of the codec context, or 0 for a null pointer.
    inline uint64_t get_layout(const AVCodecContext* pCodecCtx)
    {
        if (pCodecCtx == nullptr)
            return 0;

        return pCodecCtx->ch_layout.u.mask;
    }

    // Returns ch_layout.u.mask of the frame, or 0 for a null pointer.
    inline uint64_t get_layout(const AVFrame* frame)
    {
        if (frame == nullptr)
            return 0;

        return frame->ch_layout.u.mask;
    }

    // Overwrites the codec context's ch_layout with the layout that
    // convert_layout() builds from the given mask.
    inline void set_layout(AVCodecContext* pCodecCtx, const uint64_t channel_layout)
    {
        const AVChannelLayout converted = convert_layout(channel_layout);
        pCodecCtx->ch_layout = converted;
    }

    // Overwrites the frame's ch_layout with the layout that convert_layout()
    // builds from the given mask.
    inline void set_layout(AVFrame* frame, const uint64_t channel_layout)
    {
        const AVChannelLayout converted = convert_layout(channel_layout);
        frame->ch_layout = converted;
    }

    inline int get_nchannels(const AVCodecContext* pCodecCtx)
    {
        return pCodecCtx ? pCodecCtx->ch_layout.nb_channels : 0;
    }

    inline int get_nchannels(const AVFrame* frame)
    {
        return frame ? frame->ch_layout.nb_channels : 0;
    }

    inline int get_nchannels(const uint64_t channel_layout)
    {
        return convert_layout(channel_layout).nb_channels;
    }

    // When the codec context has a positive channel count but a zero mask,
    // fills in ch_layout with av_channel_layout_default() for that count.
    // Otherwise leaves the context untouched.
    inline void check_layout(AVCodecContext* pCodecCtx)
    {
        if (get_layout(pCodecCtx) != 0)
            return;

        AVChannelLayout& layout = pCodecCtx->ch_layout;
        if (layout.nb_channels > 0)
            av_channel_layout_default(&layout, layout.nb_channels);
    }

#else

    // Describes a channel mask with av_get_channel_layout_string(), using a
    // 32 byte buffer. The result is cut at the first '\0' in that buffer.
    inline std::string get_channel_layout_str(uint64_t channel_layout)
    {
        std::string description(32, '\0');
        av_get_channel_layout_string(&description[0], description.size(), 0, channel_layout);

        const auto terminator = description.find('\0');
        if (terminator != std::string::npos)
            description.resize(terminator);

        return description;
    }

    // Describes the channel_layout mask stored in a codec context.
    // pCodecCtx is dereferenced without a null check.
    inline std::string get_channel_layout_str(const AVCodecContext* pCodecCtx)
    {
        const uint64_t mask = pCodecCtx->channel_layout;
        return get_channel_layout_str(mask);
    }

    // True when the codec context's channel_layout mask is 0.
    // pCodecCtx is dereferenced without a null check.
    inline bool channel_layout_empty(const AVCodecContext* pCodecCtx)
    {
        const bool has_layout = pCodecCtx->channel_layout != 0;
        return !has_layout;
    }

    // True when the frame's channel_layout mask is 0.
    // frame is dereferenced without a null check.
    inline bool channel_layout_empty(const AVFrame* frame)
    {
        const bool has_layout = frame->channel_layout != 0;
        return !has_layout;
    }

    // Returns the codec context's channel_layout mask, or 0 for a null pointer.
    inline uint64_t get_layout(const AVCodecContext* pCodecCtx)
    {
        if (pCodecCtx == nullptr)
            return 0;

        return pCodecCtx->channel_layout;
    }

    // Returns the frame's channel_layout mask, or 0 for a null pointer.
    inline uint64_t get_layout(const AVFrame* frame)
    {
        if (frame == nullptr)
            return 0;

        return frame->channel_layout;
    }

    // Stores the given mask in the codec context's channel_layout field.
    inline void set_layout(AVCodecContext* pCodecCtx, const uint64_t channel_layout)
    {
        auto& target = pCodecCtx->channel_layout;
        target = channel_layout;
    }

    // Stores the given mask in the frame's channel_layout field.
    inline void set_layout(AVFrame* frame, const uint64_t channel_layout)
    {
        auto& target = frame->channel_layout;
        target = channel_layout;
    }

    // Channel count reported by av_get_channel_layout_nb_channels() for a mask.
    inline int get_nchannels(const uint64_t channel_layout)
    {
        const int count = av_get_channel_layout_nb_channels(channel_layout);
        return count;
    }

    // Channel count for the codec context's channel_layout mask, or 0 for null.
    inline int get_nchannels(const AVCodecContext* pCodecCtx)
    {
        if (pCodecCtx == nullptr)
            return 0;

        return get_nchannels(pCodecCtx->channel_layout);
    }

    // Channel count for the frame's channel_layout mask, or 0 for null.
    inline int get_nchannels(const AVFrame* frame)
    {
        if (frame == nullptr)
            return 0;

        return get_nchannels(frame->channel_layout);
    }

    // When the codec context has a positive channel count but a zero
    // channel_layout mask, sets the mask to the value returned by
    // av_get_default_channel_layout() for that count. Otherwise does nothing.
    inline void check_layout(AVCodecContext* pCodecCtx)
    {
        if (pCodecCtx->channel_layout != 0 || pCodecCtx->channels <= 0)
            return;

        pCodecCtx->channel_layout = av_get_default_channel_layout(pCodecCtx->channels);
    }
#endif

// ---------------------------------------------------------------------------------------------------

    // Deleter functor for the FFmpeg object types used by the dlib wrappers.
    // Each overload releases its object with the matching FFmpeg free/close
    // function and does nothing for a null pointer, which makes it usable as
    // the deleter of std::unique_ptr (see av_ptr below).
    struct av_deleter
    {
        void operator()(AVFrame* ptr)               const;
        void operator()(AVPacket* ptr)              const;
        void operator()(AVAudioFifo* ptr)           const;
        void operator()(SwsContext* ptr)            const;
        void operator()(SwrContext* ptr)            const;
        void operator()(AVCodecContext* ptr)        const;
        void operator()(AVCodecParserContext* ptr)  const;
        void operator()(AVFormatContext* ptr)       const;
        void operator()(AVDeviceInfoList* ptr)      const;
    };

    inline void av_deleter::operator()(AVFrame *ptr)               const { if (ptr) av_frame_free(&ptr); }
    inline void av_deleter::operator()(AVPacket *ptr)              const { if (ptr) av_packet_free(&ptr); }
    inline void av_deleter::operator()(AVAudioFifo *ptr)           const { if (ptr) av_audio_fifo_free(ptr); }
    inline void av_deleter::operator()(SwsContext *ptr)            const { if (ptr) sws_freeContext(ptr); }
    inline void av_deleter::operator()(SwrContext *ptr)            const { if (ptr) swr_free(&ptr); }
    inline void av_deleter::operator()(AVCodecContext *ptr)        const { if (ptr) avcodec_free_context(&ptr); }
    inline void av_deleter::operator()(AVCodecParserContext *ptr)  const { if (ptr) av_parser_close(ptr); }
    inline void av_deleter::operator()(AVDeviceInfoList* ptr)      const { if (ptr) avdevice_free_list_devices(&ptr); }
    inline void av_deleter::operator()(AVFormatContext *ptr)       const 
    { 
        if (ptr) 
        {
            // A context with an input format is closed as an input.
            // Everything else is released with avformat_free_context(). That
            // includes a context that has neither iformat nor oformat set,
            // which previously was not freed at all.
            if (ptr->iformat)
                avformat_close_input(&ptr); 
            else
                avformat_free_context(ptr);
        }
    }

    // std::unique_ptr that releases FFmpeg objects through av_deleter.
    template<class AVObject>
    using av_ptr = std::unique_ptr<AVObject, details::av_deleter>;


// ---------------------------------------------------------------------------------------------------

    // Allocates an AVFrame with av_frame_alloc() and returns it in an owning
    // av_ptr. Throws std::runtime_error when the allocation returns null.
    inline av_ptr<AVFrame> make_avframe()
    {
        AVFrame* const raw = av_frame_alloc();

        if (raw == nullptr)
            throw std::runtime_error("Failed to allocate AVframe");

        return av_ptr<AVFrame>(raw);
    }

    // Allocates an AVPacket with av_packet_alloc() and returns it in an owning
    // av_ptr. Throws std::runtime_error when the allocation returns null.
    inline av_ptr<AVPacket> make_avpacket()
    {
        AVPacket* const raw = av_packet_alloc();

        if (raw == nullptr)
            throw std::runtime_error("Failed to allocate AVPacket");

        return av_ptr<AVPacket>(raw);
    }

// ---------------------------------------------------------------------------------------------------

    // Owning, copyable and movable wrapper around an AVDictionary*.
    //   - avdic is nullptr for an empty wrapper
    //   - copying duplicates the entries with av_dict_copy()
    //   - moving transfers the pointer and leaves the source empty
    //   - the destructor frees the dictionary with av_dict_free()
    struct av_dict
    {
        av_dict() = default;
        av_dict(const std::unordered_map<std::string, std::string> &options);
        av_dict(const av_dict &ori);
        av_dict &operator=(const av_dict &ori);
        av_dict(av_dict &&ori) noexcept;
        av_dict &operator=(av_dict &&ori) noexcept;
        ~av_dict();
        size_t size() const;
        void print() const;
        AVDictionary** get();

        AVDictionary *avdic = nullptr;
    };

    // Builds a dictionary from key/value pairs. Stops at the first
    // av_dict_set() failure and reports it on stdout; entries set before the
    // failure are kept.
    inline av_dict::av_dict(const std::unordered_map<std::string, std::string>& options)
    {
        int ret = 0;

        for (const auto& opt : options) {
            if ((ret = av_dict_set(&avdic, opt.first.c_str(), opt.second.c_str(), 0)) < 0) {
                printf("av_dict_set() failed : %s\n", get_av_error(ret).c_str());
                break;
            }
        }
    }

    inline av_dict::av_dict(const av_dict& ori)
    {
        av_dict_copy(&avdic, ori.avdic, 0);
    }

    // Copy-assigns by building a temporary copy and move-assigning it, which
    // releases whatever this object owned before.
    inline av_dict& av_dict::operator=(const av_dict& ori)
    {
        if (this != &ori)
            *this = av_dict(ori);
        return *this;
    }

    inline av_dict::av_dict(av_dict &&ori) noexcept
    : avdic{std::exchange(ori.avdic, nullptr)}
    {
    }

    inline av_dict &av_dict::operator=(av_dict &&ori) noexcept
    {
        if (this != &ori)
        {
            // Free the dictionary currently owned before taking over the new
            // one; overwriting the pointer directly would leak it.
            if (avdic)
                av_dict_free(&avdic);
            avdic = std::exchange(ori.avdic, nullptr);
        }
        return *this;
    }

    inline av_dict::~av_dict()
    {
        if (avdic)
            av_dict_free(&avdic);
    }

    // Returns the address of the internal pointer, or nullptr when the
    // wrapper holds no dictionary.
    inline AVDictionary** av_dict::get()
    {
        return avdic ? &avdic: nullptr;
    }

    // Number of entries as reported by av_dict_count(), or 0 when empty.
    inline std::size_t av_dict::size() const
    {
        return avdic ? av_dict_count(avdic) : 0;
    }

    // Writes every "key : value" pair to stdout, one per line.
    inline void av_dict::print() const
    {
        if (avdic)
        {
            AVDictionaryEntry *tag = nullptr;
            // An empty key combined with AV_DICT_IGNORE_SUFFIX walks all entries.
            while ((tag = av_dict_get(avdic, "", tag, AV_DICT_IGNORE_SUFFIX)))
                printf("%s : %s\n", tag->key, tag->value);
        }
    }

// ---------------------------------------------------------------------------------------------------

    // Guesses a codec from the extension of filename, i.e. the text after the
    // last '.'.
    //
    // Matching ignores ASCII case, so "a.PNG", "a.Png" and "a.png" all map to
    // AV_CODEC_ID_PNG. Every spelling accepted before is still accepted.
    // Returns AV_CODEC_ID_NONE when there is no '.' or the extension is not
    // recognised.
    inline AVCodecID pick_codec_from_filename(const std::string& filename)
    {
        const auto ext_pos = filename.find_last_of('.');

        if (ext_pos == std::string::npos)
            return AV_CODEC_ID_NONE;

        std::string ext = filename.substr(ext_pos + 1);

        // Lower-case ASCII letters by hand so the result does not depend on the
        // current locale.
        for (char& c : ext)
            if (c >= 'A' && c <= 'Z')
                c = static_cast<char>(c - 'A' + 'a');

        if (ext == "png")
            return AV_CODEC_ID_PNG;
        if (ext == "jpeg" || ext == "jpg")
            return AV_CODEC_ID_MJPEG;
        if (ext == "tiff" || ext == "tif")
            return AV_CODEC_ID_TIFF;
        if (ext == "webp")
            return AV_CODEC_ID_WEBP;
        if (ext == "bmp")
            return AV_CODEC_ID_BMP;
        if (ext == "h264")
            return AV_CODEC_ID_H264;
        if (ext == "h265" || ext == "hevc")
            return AV_CODEC_ID_H265;
        if (ext == "aac")
            return AV_CODEC_ID_AAC;
        if (ext == "ac3")
            return AV_CODEC_ID_AC3;
        if (ext == "jls")
            return AV_CODEC_ID_JPEGLS;
        if (ext == "jp2")
            return AV_CODEC_ID_JPEG2000;
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(59, 37, 100)
        if (ext == "jxl")
            return AV_CODEC_ID_JPEGXL;
#endif

        return AV_CODEC_ID_NONE;
    }

// ---------------------------------------------------------------------------------------------------

}}}

#endif //DLIB_FFMPEG_DETAILS