// Copyright (C) 2007 Davis E. King (davis@dlib.net), and Nils Labugt
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_UNICODe_H_
#define DLIB_UNICODe_H_
#include "../uintn.h"
#include "../algs.h"
#include "unicode_abstract.h"
#include <cstring>
#include <fstream>
#include <istream>
#include <iterator>
#include <streambuf>
#include <string>
#include <type_traits>
namespace dlib
{
// ----------------------------------------------------------------------------------------
// unichar is the character type used for decoded (UTF-32) text in this file.
using unichar = char32_t;
// ustring is a std::basic_string of unichar.
using ustring = std::basic_string<unichar>;
// ----------------------------------------------------------------------------------------
namespace unicode_helpers
{
template <
    typename charT,
    typename forward_iterator
    >
int u8_to_u32(
    charT& result,
    forward_iterator ibegin,
    forward_iterator iend
)
/*!
    requires
        - [ibegin, iend) is a valid range of UTF-8 code units (each element is
          converted to unsigned char)
        - forward_iterator supports ==, unary * and pre-increment.  Nothing else is
          used, so single pass iterators such as std::istreambuf_iterator work too.
    ensures
        - if (ibegin == iend) then
            - returns 0
        - else if (a well formed UTF-8 sequence starts at ibegin and the decoded code
          point is representable in charT) then
            - #result == the decoded character
            - returns the number of bytes that make up this character (1 to 4)
        - else
            - returns -1
            - result is not modified
        - The following are treated as errors: a continuation byte or 0xF8..0xFF in
          the leading position, a missing or malformed continuation byte, overlong
          encodings, values above U+10FFFF, UTF-16 surrogates (U+D800..U+DFFF) and
          the noncharacters U+FDD0..U+FDEF and U+xxFFFE/U+xxFFFF.
        - The iterators are taken by value.  With a multi pass iterator the caller
          advances its own iterator by the returned count.  With a single pass
          iterator the underlying source has been advanced by (return value - 1)
          elements on success, i.e. the last byte of the sequence has been read
          but not stepped over.
!*/
{
    if (ibegin == iend)
        return 0;

    // All arithmetic is done in a fixed 32 bit type (unichar is an alias for
    // char32_t) so that validation does not depend on the width of charT.
    const char32_t lead = static_cast<unsigned char>(*ibegin);

    if (lead < 0x80)
    {
        result = static_cast<charT>(lead);
        return 1;
    }

    int length = 0;
    char32_t code_point = 0;
    char32_t smallest_allowed = 0;   // anything below this is an overlong form
    if ((lead & 0xE0) == 0xC0)
    {
        length = 2;
        code_point = lead & 0x1F;
        smallest_allowed = 0x80;
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        length = 3;
        code_point = lead & 0x0F;
        smallest_allowed = 0x0800;
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        length = 4;
        code_point = lead & 0x07;
        smallest_allowed = 0x10000;
    }
    else
    {
        // a continuation byte (10xxxxxx) or 0xF8..0xFF can't start a sequence
        return -1;
    }

    for (int n = 1; n < length; ++n)
    {
        if (++ibegin == iend)
            return -1; // sequence is cut short
        const char32_t tail = static_cast<unsigned char>(*ibegin);
        if ((tail & 0xC0) != 0x80)
            return -1; // invalid tail
        code_point = (code_point << 6) | (tail & 0x3F);
    }

    if (code_point < smallest_allowed)
        return -1; // overlong form
    if (code_point > 0x10FFFF)
        return -1; // beyond the last Unicode code point
    if (code_point >= 0xD800 && code_point < 0xE000)
        return -1; // invalid character (UTF-16 surrogate pairs)
    if (code_point >= 0xFDD0 && code_point <= 0xFDEF)
        return -1; // noncharacter
    if ((code_point & 0xFFFF) >= 0xFFFE)
        return -1; // noncharacter

    // Refuse to hand back a silently truncated value when charT is too narrow
    // (e.g. a 4 byte sequence decoded into a 16 bit wchar_t).
    const charT narrowed = static_cast<charT>(code_point);
    if (static_cast<char32_t>(narrowed) != code_point)
        return -1;

    result = narrowed;
    return length;
}
// ------------------------------------------------------------------------------------
template <typename charT>
class basic_utf8_streambuf : public std::basic_streambuf<charT>
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is an input-only stream buffer that reads UTF-8 encoded bytes from
            a std::ifstream and presents them as decoded characters of type charT.

            The object keeps a reference to the ifstream, so the ifstream must
            outlive it.  Decoded characters are held in a small internal buffer
            whose first max_putback slots are reserved for putback.
    !*/
public:
    basic_utf8_streambuf (
        std::ifstream& fin_
    ) :
        fin(fin_)
    {
        // Begin with an empty get area that sits right after the putback region
        // so that the first read triggers underflow().
        this->setg(in_buffer+max_putback,
                   in_buffer+max_putback,
                   in_buffer+max_putback);
    }

protected:
    using traits_type = typename std::basic_streambuf<charT>::traits_type;
    using int_type = typename std::basic_streambuf<charT>::int_type;

    // input functions
    int_type underflow(
    ) override
    /*!
        ensures
            - if (the get area still holds characters) then
                - returns the current character without consuming it
            - else
                - moves up to max_putback of the most recently read characters
                  into the putback region, decodes up to
                  (in_buffer_size - max_putback) new characters from fin and
                  returns the first of them
                - returns traits_type::eof() if no character could be decoded
                  (end of file or malformed UTF-8)
    !*/
    {
        if (this->gptr() < this->egptr())
        {
            return traits_type::to_int_type(*this->gptr());
        }

        int num_put_back = static_cast<int>(this->gptr() - this->eback());
        if (num_put_back > max_putback)
        {
            num_put_back = max_putback;
        }

        // Copy the putback characters into the putback end of in_buffer.  The
        // count handed to traits_type::move is in characters, not bytes, so the
        // whole putback region is preserved whatever sizeof(charT) is.
        traits_type::move(in_buffer+(max_putback-num_put_back),
                          this->gptr()-num_put_back,
                          static_cast<std::size_t>(num_put_back));

        // fill the buffer with characters
        const int n = in_buffer_size-max_putback;
        int i = 0;
        for (; i < n; ++i)
        {
            charT ch;
            if (!read_one_character(ch))
                break;
            in_buffer[max_putback+i] = ch;
        }

        if (i == 0)
        {
            // an error occurred or we hit EOF
            return traits_type::eof();
        }

        // reset in_buffer pointers
        this->setg (in_buffer+(max_putback-num_put_back),
                    in_buffer+max_putback,
                    in_buffer+max_putback+i);

        return traits_type::to_int_type(*this->gptr());
    }

private:

    // Number of bytes announced by the leading byte of a UTF-8 sequence, or 0
    // if the byte can't start a sequence.
    static int sequence_length (
        unsigned char lead
    )
    {
        if (lead < 0x80)
            return 1;
        if ((lead & 0xE0) == 0xC0)
            return 2;
        if ((lead & 0xF0) == 0xE0)
            return 3;
        if ((lead & 0xF8) == 0xF0)
            return 4;
        return 0;
    }

    // Reads exactly one UTF-8 sequence from fin and decodes it into ch.
    // Returns false on end of file, on a truncated sequence or on malformed
    // input.  The bytes are first collected in a local array so that the
    // stream position always ends up right behind the sequence that was read;
    // an invalid leading byte is left in the stream.
    bool read_one_character (
        charT& ch
    )
    {
        using byte_traits = std::char_traits<char>;
        std::streambuf* const source = fin.rdbuf();

        const byte_traits::int_type first = source->sgetc();
        if (byte_traits::eq_int_type(first, byte_traits::eof()))
            return false;

        const int length = sequence_length(
            static_cast<unsigned char>(byte_traits::to_char_type(first)));
        if (length == 0)
            return false;

        char bytes[4];
        for (int k = 0; k < length; ++k)
        {
            const byte_traits::int_type next = source->sbumpc();
            if (byte_traits::eq_int_type(next, byte_traits::eof()))
                return false;
            bytes[k] = byte_traits::to_char_type(next);
        }

        const char* const first_byte = bytes;
        return unicode_helpers::u8_to_u32(ch, first_byte, first_byte+length) == length;
    }

    std::ifstream& fin;
    static const int max_putback = 4;
    static const int in_buffer_size = 10;
    charT in_buffer[in_buffer_size];
};
// Constants describing the UTF-16 surrogate encoding.

// Bit prefix of the first unit of a pair (110110 followed by ten payload bits).
static constexpr unichar SURROGATE_FIRST_TOP = 0xD800;
// Bit prefix of the second unit of a pair (110111 followed by ten payload bits).
static constexpr unichar SURROGATE_SECOND_TOP = 0xDC00;
// Keeps the low ten payload bits of a surrogate and drops the prefix.
static constexpr unichar SURROGATE_CLEARING_MASK = 0x03FF;
// Surrogates occupy the half open range [SURROGATE_TOP, SURROGATE_END).
static constexpr unichar SURROGATE_TOP = SURROGATE_FIRST_TOP;
static constexpr unichar SURROGATE_END = 0xE000;
// First code point that does not fit in 16 bits.
static constexpr unichar SMP_TOP = 0x10000;
// Number of payload bits carried by each surrogate.
static constexpr int VALID_BITS = 10;
}
// ----------------------------------------------------------------------------------------
template <typename T>
bool is_combining_char(
const T ch_
)
/*!
    ensures
        - ch_ is converted to a unichar with zero_extend_cast (declared outside this
          file) and looked up in the hard-coded table of code point ranges below
        - returns true if the code point lies in one of the ranges, or equals one
          of the single code points, that the table marks as combining
        - returns false otherwise
!*/
{
const unichar ch = zero_extend_cast<unichar>(ch_);
// How to read the table:
//   - The code space is cut into bands by upper bound (0x800, 0xA00, 0xC00, ...),
//     and every band ends in an unconditional return, so a code point is only
//     ever compared against the entries of its own band.
//   - Inside a band the tests are in ascending order.  "ch < X -> false" closes
//     a gap and "ch < Y -> true" closes a combining range.
//   - A test "ch == Z -> true" handles an isolated combining code point.  It is
//     placed in front of the "<" test that would otherwise reject Z, so the
//     order of the statements matters.
//   - A braced sub-block answers true for its whole range except for the code
//     points it lists explicitly.
// Everything below 0x300 is non-combining, 0x300 .. 0x36F is combining.
if (ch < 0x300)
return false;
if (ch < 0x370)
return true;
// band 0x370 .. 0x7FF
if (ch < 0x800) {
if (ch < 0x483)
return false;
if (ch < 0x48A)
return true;
if (ch < 0x591)
return false;
if (ch < 0x5D0) {
if (ch == 0x5C0)
return false;
if (ch == 0x5C3)
return false;
if (ch == 0x5C6)
return false;
return true;
}
if (ch < 0x610)
return false;
if (ch < 0x616)
return true;
if (ch < 0x64B)
return false;
if (ch < 0x660)
return true;
if (ch == 0x670)
return true;
if (ch < 0x6D6)
return false;
if (ch < 0x6EE) {
if (ch == 0x6DD)
return false;
if (ch == 0x6E5)
return false;
if (ch == 0x6E6)
return false;
if (ch == 0x6E9)
return false;
return true;
}
if (ch == 0x711)
return true;
if (ch < 0x730)
return false;
if (ch < 0x74B)
return true;
if (ch < 0x7A6)
return false;
if (ch < 0x7B1)
return true;
if (ch < 0x7EB)
return false;
if (ch < 0x7F4)
return true;
return false;
}
// band 0x800 .. 0x9FF
if (ch < 0xA00) {
if (ch < 0x901)
return false;
if (ch < 0x904)
return true;
if (ch < 0x93C)
return false;
if (ch < 0x955) {
if (ch == 0x93D)
return false;
if (ch == 0x950)
return false;
return true;
}
if (ch < 0x962)
return false;
if (ch < 0x964)
return true;
if (ch < 0x981)
return false;
if (ch < 0x984)
return true;
if (ch < 0x9BC)
return false;
if (ch < 0x9D8) {
if (ch == 0x9BD)
return false;
if (ch == 0x9CE)
return false;
return true;
}
if (ch < 0x9E2)
return false;
if (ch < 0x9E4)
return true;
return false;
}
// band 0xA00 .. 0xBFF
if (ch < 0xC00) {
if (ch < 0xA01)
return false;
if (ch < 0xA04)
return true;
if (ch < 0xA3C)
return false;
if (ch < 0xA4E)
return true;
if (ch < 0xA70)
return false;
if (ch < 0xA72)
return true;
if (ch < 0xA81)
return false;
if (ch < 0xA84)
return true;
if (ch < 0xABC)
return false;
if (ch < 0xACE) {
if (ch == 0xABD)
return false;
return true;
}
if (ch < 0xAE2)
return false;
if (ch < 0xAE4)
return true;
if (ch < 0xB01)
return false;
if (ch < 0xB04)
return true;
if (ch < 0xB3C)
return false;
if (ch < 0xB58) {
if (ch == 0xB3D)
return false;
return true;
}
if (ch == 0xB82)
return true;
if (ch < 0xBBE)
return false;
if (ch < 0xBD8)
return true;
if (ch == 0xBF4)
return true;
if (ch == 0xBF8)
return true;
return false;
}
// band 0xC00 .. 0xDFF
if (ch < 0xE00) {
if (ch < 0xC01)
return false;
if (ch < 0xC04)
return true;
if (ch < 0xC3E)
return false;
if (ch < 0xC57)
return true;
if (ch < 0xC82)
return false;
if (ch < 0xC84)
return true;
if (ch < 0xCBC)
return false;
if (ch < 0xCD7) {
if (ch == 0xCBD)
return false;
return true;
}
if (ch < 0xCE2)
return false;
if (ch < 0xCE4)
return true;
if (ch < 0xD02)
return false;
if (ch < 0xD04)
return true;
if (ch < 0xD3E)
return false;
if (ch < 0xD58)
return true;
if (ch < 0xD82)
return false;
if (ch < 0xD84)
return true;
if (ch < 0xDCA)
return false;
if (ch < 0xDF4)
return true;
return false;
}
// band 0xE00 .. 0xFFF
if (ch < 0x1000) {
if (ch == 0xE31)
return true;
if (ch < 0xE34)
return false;
if (ch < 0xE3B)
return true;
if (ch < 0xE47)
return false;
if (ch < 0xE4F)
return true;
if (ch == 0xEB1)
return true;
if (ch < 0xEB4)
return false;
if (ch < 0xEBD)
return true;
if (ch < 0xEC8)
return false;
if (ch < 0xECE)
return true;
if (ch < 0xF18)
return false;
if (ch < 0xF1A)
return true;
if (ch == 0xF35)
return true;
if (ch == 0xF37)
return true;
if (ch == 0xF39)
return true;
if (ch < 0xF3E)
return false;
if (ch < 0xF40)
return true;
if (ch < 0xF71)
return false;
if (ch < 0xF88) {
if (ch == 0xF85)
return false;
return true;
}
if (ch < 0xF90)
return false;
if (ch < 0xFBD)
return true;
if (ch == 0xFC6)
return true;
return false;
}
// band 0x1000 .. 0x17FF
if (ch < 0x1800) {
if (ch < 0x102C)
return false;
if (ch < 0x1040)
return true;
if (ch < 0x1056)
return false;
if (ch < 0x105A)
return true;
if (ch == 0x135F)
return true;
if (ch < 0x1712)
return false;
if (ch < 0x1715)
return true;
if (ch < 0x1732)
return false;
if (ch < 0x1735)
return true;
if (ch < 0x1752)
return false;
if (ch < 0x1754)
return true;
if (ch < 0x1772)
return false;
if (ch < 0x1774)
return true;
if (ch < 0x17B6)
return false;
if (ch < 0x17D4)
return true;
if (ch == 0x17DD)
return true;
return false;
}
// band 0x1800 .. 0x1FFF
if (ch < 0x2000) {
if (ch < 0x180B)
return false;
if (ch < 0x180E)
return true;
if (ch == 0x18A9)
return true;
if (ch < 0x1920)
return false;
if (ch < 0x193C)
return true;
if (ch < 0x19B0)
return false;
if (ch < 0x19C1)
return true;
if (ch < 0x19C8)
return false;
if (ch < 0x19CA)
return true;
if (ch < 0x1A17)
return false;
if (ch < 0x1A1C)
return true;
if (ch < 0x1B00)
return false;
if (ch < 0x1B05)
return true;
if (ch < 0x1B34)
return false;
if (ch < 0x1B45)
return true;
if (ch < 0x1B6B)
return false;
if (ch < 0x1B74)
return true;
if (ch < 0x1DC0)
return false;
if (ch < 0x1E00)
return true;
return false;
}
// 0x2000 and above: the remaining entries are not split into bands, they are
// simply tested in ascending order until one of them matches.
if (ch < 0x20D0)
return false;
if (ch < 0x2100)
return true;
if (ch < 0x302A)
return false;
if (ch < 0x3030)
return true;
if (ch < 0x3099)
return false;
if (ch < 0x309B)
return true;
if (ch == 0xA802)
return true;
if (ch == 0xA806)
return true;
if (ch == 0xA80B)
return true;
if (ch < 0xA823)
return false;
if (ch < 0xA828)
return true;
if (ch == 0xFB1E)
return true;
if (ch < 0xFE00)
return false;
if (ch < 0xFE10)
return true;
if (ch < 0xFE20)
return false;
if (ch < 0xFE30)
return true;
if (ch < 0x10A01)
return false;
if (ch < 0x10A10)
return true;
if (ch < 0x10A38)
return false;
if (ch < 0x10A40)
return true;
if (ch < 0x1D165)
return false;
if (ch < 0x1D16A)
return true;
if (ch < 0x1D16D)
return false;
if (ch < 0x1D173)
return true;
if (ch < 0x1D17B)
return false;
if (ch < 0x1D183)
return true;
if (ch < 0x1D185)
return false;
if (ch < 0x1D18C)
return true;
if (ch < 0x1D1AA)
return false;
if (ch < 0x1D1AE)
return true;
if (ch < 0x1D242)
return false;
if (ch < 0x1D245)
return true;
if (ch < 0xE0100)
return false;
if (ch < 0xE01F0)
return true;
// anything at or above 0xE01F0 is non-combining
return false;
}
// ----------------------------------------------------------------------------------------
// Declaration only; the definition is not in this header (presumably in
// unicode.cpp, which the bottom of this file includes when NO_MAKEFILE is set).
// NOTE(review): judging by the name this splits input into the two UTF-16
// surrogate code units first and second -- confirm against the definition.
void unichar_to_surrogate_pair(unichar input, unichar &first, unichar &second);
// ----------------------------------------------------------------------------------------
template <typename T> bool is_surrogate(T ch)
{
using namespace unicode_helpers;
return (zero_extend_cast<unichar>(ch) >= SURROGATE_TOP &&
zero_extend_cast<unichar>(ch) < SURROGATE_END);
}
// ----------------------------------------------------------------------------------------
template <typename T> unichar surrogate_pair_to_unichar(T first, T second)
/*!
    requires
        - first and second are the two code units of a UTF-16 surrogate pair, in
          that order.  This is not checked; only the low ten bits of each argument
          are used.
    ensures
        - returns the code point encoded by the pair:
          SMP_TOP + (((first & 0x3FF) << 10) | (second & 0x3FF))
!*/
{
    using namespace unicode_helpers;
    const unichar high_bits = static_cast<unichar>(first & SURROGATE_CLEARING_MASK);
    const unichar low_bits = static_cast<unichar>(second & SURROGATE_CLEARING_MASK);
    // SMP_TOP has to be added to the combined 20 bit value.  Adding it to the low
    // half and then OR-ing the halves together drops the carry whenever bit 6 of
    // high_bits is set, e.g. D840 DC00 would come out as U+10000 instead of U+20000.
    return SMP_TOP + ((high_bits << VALID_BITS) | low_bits);
}
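// Bit patterns of the two surrogate prefixes:
//   0xD800 (SURROGATE_FIRST_TOP)  = 110110 0000000000
//   0xDC00 (SURROGATE_SECOND_TOP) = 110111 0000000000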
// ----------------------------------------------------------------------------------------
class invalid_utf8_error : public error
{
public:
invalid_utf8_error():error(EUTF8_TO_UTF32) {}
};
template <typename forward_iterator, typename unary_op>
inline void convert_to_utf32(
    forward_iterator ibegin,
    forward_iterator iend,
    unary_op op
)
/*!
    requires
        - [ibegin, iend) is a valid range whose element type is char, wchar_t or
          unichar
        - op(c) is a valid expression for a unichar c
    ensures
        - calls op(c), in order, for every character c obtained from the range:
            - unichar elements are passed on unchanged
            - wchar_t elements are passed on unchanged if wchar_t is 4 bytes wide.
              If wchar_t is 2 bytes wide the input is treated as UTF-16: every
              element for which is_surrogate() is true is combined with the
              element that follows it.
            - char elements are decoded as UTF-8
    throws
        - invalid_utf8_error
            This is thrown if char input is not valid UTF-8, if 2 byte wchar_t
            input ends in the middle of a surrogate pair, or if wchar_t is
            neither 2 nor 4 bytes wide.  op may already have been called for the
            characters in front of the offending position.
!*/
{
    using char_type = std::decay_t<decltype(*ibegin)>;
    static_assert(std::is_same<char_type, char>::value ||
                  std::is_same<char_type, wchar_t>::value ||
                  std::is_same<char_type, unichar>::value,
                  "char_type must be char, wchar_t or unichar");

    // Note that these are ordinary run time ifs, so every branch has to compile
    // for every char_type even though only one of them is ever taken.

    if (std::is_same<char_type, unichar>::value)
    {
        while (ibegin != iend)
            op(*(ibegin++));
        return;
    }

    if (std::is_same<char_type, wchar_t>::value)
    {
        // Unix
        if (sizeof(wchar_t) == 4)
        {
            while (ibegin != iend)
                op(static_cast<unichar>(*(ibegin++)));
            return;
        }

        // Win32
        if (sizeof(wchar_t) == 2)
        {
            while (ibegin != iend)
            {
                if (is_surrogate(*ibegin))
                {
                    const char_type first = *ibegin;
                    ++ibegin;
                    // A surrogate in the very last position has no partner.
                    // Reading one anyway would run past the end of the range.
                    if (ibegin == iend)
                        throw invalid_utf8_error();
                    op(surrogate_pair_to_unichar(first, *ibegin));
                    ++ibegin;
                }
                else
                {
                    op(zero_extend_cast<unichar>(*ibegin));
                    ++ibegin;
                }
            }
            return;
        }

        throw invalid_utf8_error();
    }

    if (std::is_same<char_type, char>::value)
    {
        while (ibegin != iend)
        {
            unichar ch;
            const int consumed = unicode_helpers::u8_to_u32(ch, ibegin, iend);
            // The range is not empty here, so anything but a positive byte
            // count means the decoder rejected the input.
            if (consumed <= 0)
                throw invalid_utf8_error();
            op(ch);
            std::advance(ibegin, consumed);
        }
    }
}
template <typename char_type, typename traits, typename alloc>
const ustring convert_to_utf32 (
    const std::basic_string<char_type, traits, alloc>& str
)
/*!
    ensures
        - returns the characters that the iterator based convert_to_utf32()
          produces for the range [str.begin(), str.end()), collected in a ustring
    throws
        - whatever the iterator based convert_to_utf32() throws
!*/
{
    ustring utf32;
    // The output never has more elements than the input.
    utf32.reserve(str.size());
    auto append = [&utf32](unichar decoded) { utf32.push_back(decoded); };
    convert_to_utf32(str.begin(), str.end(), append);
    return utf32;
}
// Non-template conversion functions.  Only their declarations are visible in this
// header; the definitions live elsewhere (presumably in unicode.cpp, which the
// bottom of this file includes when NO_MAKEFILE is defined).
// NOTE(review): the descriptions below are inferred from the names and
// signatures only -- confirm against the definitions.

// std::string (presumably UTF-8) -> ustring
const ustring convert_utf8_to_utf32(const std::string& str);
// std::wstring -> ustring
const ustring convert_wstring_to_utf32(const std::wstring& str);
// ----------------------------------------------------------------------------------------
// ustring -> std::wstring
const std::wstring convert_utf32_to_wstring (
const ustring &src
);
// std::string (presumably a multibyte string in the current locale) -> std::wstring
const std::wstring convert_mbstring_to_wstring (
const std::string &src
);
// std::wstring -> std::string (presumably a multibyte string in the current locale)
const std::string convert_wstring_to_mbstring(
const std::wstring &src
);
// ----------------------------------------------------------------------------------------
template <typename charT>
class basic_utf8_ifstream : public std::basic_istream<charT>
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is an input stream of charT that gets its characters from a file
            which is read through a unicode_helpers::basic_utf8_streambuf.

            After construction, open() and close() the state flags of this stream
            are set to the state flags of the underlying std::ifstream.
    !*/

    using istream_base = std::basic_istream<charT>;

public:

    basic_utf8_ifstream (
    ) :
        istream_base(&buf),
        buf(fin)
    {
    }

    basic_utf8_ifstream (
        const char* file_name,
        std::ios_base::openmode mode = std::ios::in
    ) :
        istream_base(&buf),
        buf(fin)
    {
        fin.open(file_name,mode);
        mirror_file_state();
    }

    basic_utf8_ifstream (
        const std::string& file_name,
        std::ios_base::openmode mode = std::ios::in
    ) :
        basic_utf8_ifstream(file_name.c_str(), mode)
    {
    }

    void open(
        const std::string& file_name,
        std::ios_base::openmode mode = std::ios::in
    )
    {
        open(file_name.c_str(),mode);
    }

    void open (
        const char* file_name,
        std::ios_base::openmode mode = std::ios::in
    )
    {
        // Close whatever is open and drop the resulting state flags so that the
        // flags seen afterwards describe the new open() only.
        fin.close();
        fin.clear();
        fin.open(file_name,mode);
        mirror_file_state();
    }

    void close (
    )
    {
        fin.close();
        mirror_file_state();
    }

private:

    // make this have the same error state as fin
    void mirror_file_state (
    )
    {
        this->clear(fin.rdstate());
    }

    // fin is declared before buf because buf is constructed with a reference
    // to it.
    std::ifstream fin;
    unicode_helpers::basic_utf8_streambuf<charT> buf;
};

// UTF-8 file streams that yield unichar and wchar_t characters respectively.
using utf8_uifstream = basic_utf8_ifstream<unichar>;
using utf8_wifstream = basic_utf8_ifstream<wchar_t>;
// ----------------------------------------------------------------------------------------
}
#ifdef NO_MAKEFILE
#include "unicode.cpp"
#endif
#endif // DLIB_UNICODe_H_