// Copyright (C) 2025 Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_BPE_TOKENIZER_ABSTRACT_
#ifdef DLIB_BPE_TOKENIZER_ABSTRACT_

#include <string>
#include <vector>
#include <map>
#include <unordered_map>
#include <iostream>
#include <mutex>
#include <thread>
#include <future>
#include <queue>

namespace dlib
{
    class bpe_tokenizer
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This object implements a Byte Pair Encoding (BPE) tokenizer, a
                subword tokenization algorithm commonly used in natural language
                processing (NLP).  The BPE algorithm iteratively merges the most
                frequent pairs of bytes or characters to form a vocabulary of
                subword units.  This approach is particularly useful for handling
                out-of-vocabulary words and for reducing the size of the vocabulary
                while retaining the ability to represent any text.

                The tokenizer supports special tokens, which can be used to mark
                specific elements in the text (e.g. <text>, <url>, <image>).  These
                special tokens are treated as atomic units during tokenization and
                are never split into smaller pieces.

                This object provides methods for training the tokenizer on a text
                corpus, encoding text into subword tokens, and decoding tokens back
                into text.  The tokenizer can also be serialized and deserialized
                to/from a stream, allowing for easy storage and reuse.

            REFERENCES
                - Sennrich, R., Haddow, B., & Birch, A. (2016).  Neural Machine
                  Translation of Rare Words with Subword Units.  In Proceedings of
                  the 54th Annual Meeting of the Association for Computational
                  Linguistics (ACL 2016).

            INITIAL VALUE
                - The base vocabulary is initialized with single-byte tokens (0-255).
                - Special tokens are pre-defined and assigned IDs starting from 256.
                - The maximum token length is 8 bytes.
        !*/

    public:

        bpe_tokenizer(
        );
        /*!
            ensures
                - Initializes the tokenizer with a base vocabulary of single-byte
                  tokens (0-255).
                - Pre-defines special tokens and assigns them unique IDs starting
                  from 256.
        !*/

        void train(
            const std::string& text,
            int vocab_size,
            bool verbose = false
        );
        /*!
            requires
                - vocab_size >= 256
            ensures
                - Trains the tokenizer on the text corpus given by text.
                - Iteratively merges the most frequent pairs of tokens until the
                  vocabulary reaches vocab_size entries.
                - If verbose is true then progress information is printed to
                  standard output.
        !*/

        std::vector<int> encode(
            const std::string& text
        ) const;
        /*!
            ensures
                - Encodes the input text into a sequence of subword tokens.
                - Special tokens are automatically added to mark the beginning and
                  end of paragraphs.
                - Returns a vector of token IDs representing the encoded text.
        !*/

        std::string decode(
            const std::vector<int>& ids,
            bool display_special_tokens = true
        ) const;
        /*!
            ensures
                - Decodes a sequence of token IDs back into a human-readable string.
                - If display_special_tokens is true then special tokens are included
                  in the output.
                - Returns the decoded text as a UTF-8 encoded string.
        !*/

        std::string decode(
            int id,
            bool display_special_tokens = true
        ) const { return decode(std::vector<int>({id}), display_special_tokens); }
        /*!
            ensures
                - Decodes a single token ID back into text.
        !*/

        int get_special_token_id(
            const std::string& token
        ) const;
        /*!
            ensures
                - Returns the ID of the given special token.
                - Throws an exception if token is not found in the special tokens
                  map.
        !*/

        size_t get_vocab_size(
        ) const;
        /*!
            ensures
                - Returns the total size of the vocabulary, including base tokens
                  and special tokens.
        !*/
    };
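
    // The snippet below is a minimal usage sketch, not part of the interface
    // documented above.  It assumes the corresponding implementation header has
    // been included and that corpus_text already holds the training text; the
    // vocabulary size of 5000 is an arbitrary illustrative choice.
    //
    //     dlib::bpe_tokenizer tok;
    //     tok.train(corpus_text, 5000, true);   // learn merges, printing progress
    //
    //     std::vector<int> ids = tok.encode("Hello, world!");
    //     std::string with_specials = tok.decode(ids);         // keep special tokens
    //     std::string plain_text    = tok.decode(ids, false);  // drop special tokens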

    void serialize(
        const bpe_tokenizer& tok,
        std::ostream& out
    );
    /*!
        ensures
            - Saves the entire state of tok to out.
    !*/

    void deserialize(
        bpe_tokenizer& tok,
        std::istream& in
    );
    /*!
        ensures
            - Restores the state of a bpe_tokenizer from a serialized state read
              from in.
    !*/
}

#endif // DLIB_BPE_TOKENIZER_ABSTRACT_
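
// The snippet below sketches a serialization round trip using the two functions
// declared above.  It is illustrative only: the file name "bpe.dat" is an
// assumed example, tok is a previously trained bpe_tokenizer, and <fstream>
// must be included for the file streams.
//
//     std::ofstream fout("bpe.dat", std::ios::binary);
//     dlib::serialize(tok, fout);    // save the trained tokenizer
//     fout.close();
//
//     dlib::bpe_tokenizer tok2;
//     std::ifstream fin("bpe.dat", std::ios::binary);
//     dlib::deserialize(tok2, fin);  // restore the saved state into tok2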