// Copyright (C) 2005  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_TOKENIZER_KERNEL_1_CPp_
#define DLIB_TOKENIZER_KERNEL_1_CPp_
#include "tokenizer_kernel_1.h"

#include <iostream>
#include <cstdio>

namespace dlib
{

// ----------------------------------------------------------------------------------------

    tokenizer_kernel_1::
    tokenizer_kernel_1 (        
    ) :
        headset(0),
        bodyset(0),
        have_peeked(false)
    {
        try
        {
            // Allocate one flag per possible unsigned char value.  Note the
            // +1: the tables are indexed with static_cast<unsigned char>(...)
            // whose result ranges over 0..UCHAR_MAX inclusive, so they need
            // UCHAR_MAX+1 elements.  (The previous UCHAR_MAX-sized allocation
            // made index UCHAR_MAX an out-of-bounds access.)  The trailing ()
            // value-initializes every element to false, so even slots a later
            // loop might not touch start out well defined.
            headset = new bool[UCHAR_MAX+1]();
            bodyset = new bool[UCHAR_MAX+1]();

            clear();
        }
        catch (...)
        {
            // clear() (string operations) or the second new[] can throw.
            // Release whatever was acquired before rethrowing so a
            // half-constructed object leaks nothing.  delete[] on a null
            // pointer is a no-op, so no guards are needed.
            delete [] headset;
            delete [] bodyset;
            throw;
        }
    }

// ----------------------------------------------------------------------------------------

    tokenizer_kernel_1::
    ~tokenizer_kernel_1 (
    )
    {
        // Release the character-class lookup tables allocated in the
        // constructor.  The two arrays are independent, so the order of
        // deletion is irrelevant.
        delete [] headset;
        delete [] bodyset;
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    clear(
    )
    {
        using namespace std;

        // Forget any stream association and any token held back by peeking.
        in = 0;
        streambuf = 0;
        have_peeked = false;

        // Restore the default identifier definition: an identifier starts
        // with a letter or an underscore and continues with letters,
        // underscores, or digits.  The body set is the head set plus digits.
        head = "_" + lowercase_letters() + uppercase_letters();
        body = head + numbers();

        // Rebuild the lookup tables from the two strings.
        // NOTE(review): this loop stops at UCHAR_MAX-1, so the slot for the
        // character value UCHAR_MAX is never reset here — this mirrors the
        // array size used by the constructor's allocation; confirm the two
        // stay in sync if either is changed.
        unsigned long c = 0;
        while (c < UCHAR_MAX)
        {
            headset[c] = false;
            bodyset[c] = false;
            ++c;
        }

        for (string::size_type k = 0; k < head.size(); ++k)
            headset[static_cast<unsigned char>(head[k])] = true;
        for (string::size_type k = 0; k < body.size(); ++k)
            bodyset[static_cast<unsigned char>(body[k])] = true;
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    set_stream (
        std::istream& in_
    )
    {
        // Bind to the new stream and cache its buffer so get_token() can do
        // unformatted reads directly; drop any token left over from peeking
        // at a previously attached stream.
        streambuf = in_.rdbuf();
        in = &in_;
        have_peeked = false;
    }

// ----------------------------------------------------------------------------------------

    bool tokenizer_kernel_1::
    stream_is_set (
    ) const
    {
        // True once set_stream() has been called; `in` is null after
        // construction and after clear().
        return (in != 0);
    }

// ----------------------------------------------------------------------------------------

    std::istream& tokenizer_kernel_1::
    get_stream (
    ) const
    {
        // Returns the stream given to set_stream().  Dereferences `in`
        // unconditionally, so callers must ensure stream_is_set() is true.
        return *in;
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    get_token (
        int& type,
        std::string& token
    )
    {
        // Reads the next token from the attached stream buffer, classifying
        // it as END_OF_FILE, END_OF_LINE, WHITE_SPACE, IDENTIFIER, NUMBER,
        // or CHAR.  If peek_type()/peek_token() already pulled a token out of
        // the stream, that token is returned instead of reading again.
        if (!have_peeked)
        {
            std::streambuf::int_type ch;
            ch = streambuf->sbumpc();

            switch (ch)
            {
            case EOF:
                type = END_OF_FILE;
                token.clear();
                return;

            case '\n':
                type = END_OF_LINE;
                token = "\n";
                return;

            case '\r':
            case ' ':
            case '\t':
                // Consume the entire run of non-newline whitespace.
                type = WHITE_SPACE;
                token = static_cast<char>(ch);
                ch = streambuf->sgetc();
                while (ch != EOF && (ch == ' ' || ch == '\t' || ch == '\r'))
                {
                    token += static_cast<char>(ch);
                    ch = streambuf->snextc();
                }
                return;

            default:
                if (headset[static_cast<unsigned char>(ch)])
                {
                    // Identifier: a head character followed by any run of
                    // body characters.  The EOF test must come FIRST: the
                    // previous code indexed bodyset before checking for EOF,
                    // and static_cast<unsigned char>(EOF) yields UCHAR_MAX,
                    // reading past the end of the table allocated in the
                    // constructor.
                    type = IDENTIFIER;
                    token = static_cast<char>(ch);
                    ch = streambuf->sgetc();
                    while (ch != EOF && bodyset[static_cast<unsigned char>(ch)])
                    {
                        token += static_cast<char>(ch);
                        ch = streambuf->snextc();
                    }
                }
                else if ('0' <= ch && ch <= '9')
                {
                    // Number: a maximal run of decimal digits.
                    type = NUMBER;
                    token = static_cast<char>(ch);
                    ch = streambuf->sgetc();
                    while (ch != EOF && '0' <= ch && ch <= '9')
                    {
                        token += static_cast<char>(ch);
                        ch = streambuf->snextc();
                    }
                }
                else
                {
                    // Anything else is returned as a single character token.
                    type = CHAR;
                    token = static_cast<char>(ch);
                }
                return;
            } // switch (ch)
        }
        
        // if we get this far it means we have peeked so we should 
        // return the peek data.
        type = next_type;
        token = next_token;
        have_peeked = false;
    }

// ----------------------------------------------------------------------------------------

    int tokenizer_kernel_1::
    peek_type (
    ) const
    {
        // Reads the next token into next_type/next_token and marks it as
        // peeked so the following get_token() call returns it.  The
        // const_cast is how this logically-const operation mutates the
        // cached-token members through the class's const interface.
        const_cast<tokenizer_kernel_1*>(this)->get_token(next_type,next_token);
        have_peeked = true;
        return next_type;
    }

// ----------------------------------------------------------------------------------------

    const std::string& tokenizer_kernel_1::
    peek_token (
    ) const
    {
        // Same mechanism as peek_type(): pull the next token into the cache
        // and flag it so get_token() hands it back without re-reading the
        // stream.  Returns a reference to the cached token text.
        const_cast<tokenizer_kernel_1*>(this)->get_token(next_type,next_token);
        have_peeked = true;
        return next_token;
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    swap (
        tokenizer_kernel_1& item
    )
    {
        // Swap every member with `item`.  The members are independent of one
        // another, so the exchanges may run in any order; they are grouped
        // here by purpose.
        // Identifier definition and its lookup tables:
        exchange(head,item.head);
        exchange(body,item.body);
        exchange(headset,item.headset);
        exchange(bodyset,item.bodyset);
        // Stream attachment:
        exchange(in,item.in);
        exchange(streambuf,item.streambuf);
        // Peek cache:
        exchange(have_peeked,item.have_peeked);
        exchange(next_type,item.next_type);
        exchange(next_token,item.next_token);
    }

// ----------------------------------------------------------------------------------------
    
    void tokenizer_kernel_1::
    set_identifier_token (
        const std::string& head_,
        const std::string& body_
    )
    {
        using namespace std;

        // Replace the identifier definition: head_ lists the characters an
        // identifier may start with, body_ the characters it may continue
        // with.  Then rebuild the boolean lookup tables to match.
        head = head_;
        body = body_;

        // NOTE(review): as in clear(), this stops at UCHAR_MAX-1 to mirror
        // the constructor's allocation size; the slot for character value
        // UCHAR_MAX is never cleared here — confirm the two stay in sync if
        // either is changed.
        unsigned long c = 0;
        while (c < UCHAR_MAX)
        {
            headset[c] = false;
            bodyset[c] = false;
            ++c;
        }

        for (string::size_type k = 0; k < head.size(); ++k)
            headset[static_cast<unsigned char>(head[k])] = true;
        for (string::size_type k = 0; k < body.size(); ++k)
            bodyset[static_cast<unsigned char>(body[k])] = true;
    }

// ----------------------------------------------------------------------------------------
    
    const std::string tokenizer_kernel_1::
    get_identifier_head (
    ) const
    {
        // Returns (by value) the set of characters an identifier may start
        // with, as set by clear() or set_identifier_token().
        return head;
    }

// ----------------------------------------------------------------------------------------
    
    const std::string tokenizer_kernel_1::
    get_identifier_body (
    ) const
    {
        // Returns (by value) the set of characters an identifier may
        // continue with, as set by clear() or set_identifier_token().
        return body;
    }

// ----------------------------------------------------------------------------------------
    
    const std::string tokenizer_kernel_1::
    lowercase_letters (
    ) const
    {
        // The 26 lowercase ASCII letters; used when building the default
        // identifier character sets.  The literal converts implicitly to
        // the std::string return value.
        return "abcdefghijklmnopqrstuvwxyz";
    }

// ----------------------------------------------------------------------------------------
    
    const std::string tokenizer_kernel_1::
    uppercase_letters (
    ) const
    {
        // The 26 uppercase ASCII letters; used when building the default
        // identifier character sets.  The literal converts implicitly to
        // the std::string return value.
        return "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    }

// ----------------------------------------------------------------------------------------
    
    const std::string tokenizer_kernel_1::
    numbers (
    ) const
    {
        // The ten decimal digits; appended to the head set to form the
        // default identifier body set.  The literal converts implicitly to
        // the std::string return value.
        return "0123456789";
    }
    
// ----------------------------------------------------------------------------------------
    
}
#endif // DLIB_TOKENIZER_KERNEL_1_CPp_