///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: TraditionalMailMessageParser.cc 97 2004-11-19 20:06:18Z brian $
//
// Copyright (C) 2000 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//    http://www.cooldevtools.com/qpl.html
//

#include "Configuration.h"
#include "AbstractMultiLineString.h"
#include "AbstractTokenizer.h"
#include "AbstractTokenReceiver.h"
#include "MessageHeader.h"
#include "MessageHeaderList.h"
#include "MailMessageList.h"
#include "SimpleTokenizer.h"
#include "PhrasingTokenizer.h"
#include "UrlOnlyHtmlTokenizer.h"
#include "TokenFilteringTokenizer.h"
#include "SimpleMultiLineStringCharReader.h"
#include "StringReader.h"
#include "MimeDecoder.h"
#include "TraditionalMailMessageParser.h"

// Prefix used by receiveTagToken() for the extra copy of each tag token.
static const string URL_PREFIX("U_");
// Pattern handed to m_ipRegex; addDerivedTerms() matches tokens against it.
static const string IP_ADDRESS_REGEX("[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+");
// Derived term added by addDerivedTerms() when a token matches the pattern above.
static const string IP_ADDRESS_TERM("IP_ADDRESS");
// Separator passed to the character reader when body text lines are tokenized.
static const string LINE_SEPARATOR(" ");

class ParserTokenReceiver : public AbstractTokenReceiver
{
public:
    ParserTokenReceiver(TraditionalMailMessageParser *target,
                        bool is_text)
    : m_target(target), m_isText(is_text)
    {
    }

    ~ParserTokenReceiver()
    {
    }

    void receiveToken(const string &token)
    {
        if (m_isText) {
            m_target->receiveTextToken(token);
        } else {
            m_target->receiveTagToken(token);
        }
    }

private:
    TraditionalMailMessageParser *m_target;
    bool m_isText;
};

// Stores the configuration pointer and initializes the IP address
// matcher from IP_ADDRESS_REGEX.  No parsing state is created here; that
// happens in parseMailMessage().
TraditionalMailMessageParser::TraditionalMailMessageParser(Configuration *config)
  : m_config(config)
  , m_ipRegex(IP_ADDRESS_REGEX)
{
}

// Intentionally empty: the destructor body performs no explicit cleanup.
TraditionalMailMessageParser::~TraditionalMailMessageParser() {}

// Builds a new tokenizer chain and returns it via release() (the OWNED
// marker signals that the caller takes the pointer).
//
// Each stage wraps the one built before it:
//   SimpleTokenizer -> TokenFilteringTokenizer -> PhrasingTokenizer
// All limits and the non-ASCII replacement come from m_config.
OWNED AbstractTokenizer *TraditionalMailMessageParser::createTokenizer()
{
  NewPtr<AbstractTokenizer> chain;

  // Innermost stage.
  chain.set(new SimpleTokenizer(m_config->getNonAsciiCharReplacement()));

  // Wrap with the term-length filter; the previous stage is released
  // into the new wrapper.
  chain.set(new TokenFilteringTokenizer(chain.release(),
                                        m_config->getMinTermLength(),
                                        m_config->getMaxTermLength(),
                                        false));

  // Outermost stage: phrase builder.
  chain.set(new PhrasingTokenizer(chain.release(),
                                  m_config->getMinPhraseTerms(),
                                  m_config->getMaxPhraseTerms(),
                                  m_config->getMaxPhraseChars()));

  return chain.release();
}

// Parses source into a newly allocated Message and returns it via
// release() (the OWNED marker signals that the caller takes the pointer).
//
// Every call rebuilds the per-message state: the result Message, the
// text and tag tokenizers, the two token receivers, and the HTML
// tokenizer.  The HTML tokenizer class is chosen by the configuration's
// getRemoveHTML() setting.
OWNED Message *TraditionalMailMessageParser::parseMailMessage(MailMessage *source)
{
    // Result object, limited to the configured number of terms.
    m_message.set(new Message());
    m_message->setMaxTokenCount(m_config->getMaxTermsPerMessage());

    // Independent tokenizer chains for text and for tags.
    m_textTokenizer.set(createTokenizer());
    m_tagTokenizer.set(createTokenizer());

    // Receivers routing tokens back to receiveTextToken()/receiveTagToken().
    m_textReceiver.set(new ParserTokenReceiver(this, true));
    m_tagReceiver.set(new ParserTokenReceiver(this, false));

    // NOTE(review): the meaning of the literal 256 is defined by the
    // HtmlTokenizer constructors, which are not visible in this file.
    if (!m_config->getRemoveHTML()) {
        m_htmlTokenizer.set(new HtmlTokenizer(m_textTokenizer.get(),
                                              m_tagTokenizer.get(),
                                              256,
                                              m_tagReceiver.get()));
    } else {
        m_htmlTokenizer.set(new UrlOnlyHtmlTokenizer(m_textTokenizer.get(),
                                                     m_tagTokenizer.get(),
                                                     256,
                                                     m_tagReceiver.get()));
    }

    parseBody(source);

    return m_message.release();
}

void TraditionalMailMessageParser::receiveTagToken(const string &token)
{
    addTerm(URL_PREFIX, token, Token::FLAG_NORMAL);
    addDerivedTerms(URL_PREFIX, token);
    addTerm(m_prefix, token, Token::FLAG_NORMAL);
    addDerivedTerms(m_prefix, token);
}

void TraditionalMailMessageParser::receiveTextToken(const string &token)
{
    addTerm(m_prefix, token, Token::FLAG_NORMAL);
    addDerivedTerms(m_prefix, token);
}

// Adds the terms derived from a single token under the given prefix.
//
// Phrases (tokens for which isPhrase() is true) produce nothing.  For any
// other token, IP_ADDRESS_TERM is added when the token matches m_ipRegex,
// and the token is then passed to addTokenParts().
void TraditionalMailMessageParser::addDerivedTerms(const string &prefix,
                                                   const string &token)
{
    if (!isPhrase(token)) {
        const bool looks_like_ip = m_ipRegex.match(token);
        if (looks_like_ip) {
            addTerm(prefix, IP_ADDRESS_TERM, Token::FLAG_DERIVED);
        }

        addTokenParts(prefix, token);
    }
}

// Splits token into runs of "word" characters and adds derived terms for
// them under prefix.  A word character is one accepted by is_alnum() or
// one with the high bit (0x80) set; everything else is a separator.
//
// For each run that contains at least one character is_digit() rejects:
//   - unless the run begins the token, the remainder of the token starting
//     at the run is added;
//   - if the run is longer than one character and is not at the very end
//     of the token, the run itself is added.
// Runs made up solely of is_digit() characters are skipped.
//
// Example (assuming is_alnum()/is_digit() classify ASCII letters and
// digits as their names suggest): "foo.bar.com" adds "foo", "bar.com",
// "bar" and "com".
void TraditionalMailMessageParser::addTokenParts(const string &prefix,
                                                 const string &token)
{
    const char * const token_begin = token.c_str();
    const char *cursor = token_begin;

    while (*cursor) {
        // Step over separators until a word character or the terminator.
        for (; *cursor; ++cursor) {
            if (is_alnum(*cursor) || (*cursor & 0x80)) {
                break;
            }
        }

        // Consume one run of word characters, noting any non-digit.
        const char * const part_begin = cursor;
        bool saw_non_digit = false;
        for (; *cursor && (is_alnum(*cursor) || (*cursor & 0x80)); ++cursor) {
            if (!saw_non_digit && !is_digit(*cursor)) {
                saw_non_digit = true;
            }
        }

        // Digit-only runs (and the empty run at end of token) add nothing.
        if (!saw_non_digit) {
            continue;
        }

        // Remainder of the token from this run onward, unless that is the
        // whole token.
        if (part_begin != token_begin) {
            addTerm(prefix, part_begin, Token::FLAG_DERIVED);
        }

        // The run by itself, when it is longer than one character and more
        // of the token follows it.
        if (*cursor && (cursor - part_begin) > 1) {
            addTerm(prefix, string(part_begin, cursor), Token::FLAG_DERIVED);
        }
    }
}

// Records one term in the message being built.
//
// prefix: prefix to store with the term.
// term:   the term text.
// flags:  token flags; callers in this file pass Token::FLAG_NORMAL or
//         Token::FLAG_DERIVED.
void TraditionalMailMessageParser::addTerm(const string &prefix,
                                           const string &term,
                                           int flags)
{
    // addToken() takes the term first and the prefix second, the reverse
    // of this method's own parameter order.
    m_message->addToken(term,
                        prefix,
                        flags);
}

bool TraditionalMailMessageParser::isPhrase(const string &token)
{
    return token.find(' ') != string::npos;
}

void TraditionalMailMessageParser::parseHtmlBodyText(const AbstractMultiLineString *text)
{
    if (is_debug) {
        cerr << "PARSING HTML BODY TEXT LINES: " << text->lineCount() << endl;
    }
    m_prefix.erase();
    SimpleMultiLineStringCharReader reader(text, LINE_SEPARATOR);
    m_htmlTokenizer->tokenize(m_textReceiver.get(), &reader);
    if (is_debug) {
        cerr << "FINISHED PARSING HTML BODY TEXT LINES: " << text->lineCount() << endl;
    }
}

void TraditionalMailMessageParser::parsePlainBodyText(const AbstractMultiLineString *text)
{
    if (is_debug) {
        cerr << "PARSING PLAIN BODY TEXT LINES: " << text->lineCount() << endl;
    }
    m_prefix.erase();
    SimpleMultiLineStringCharReader reader(text, LINE_SEPARATOR);
    m_textTokenizer->tokenize(m_textReceiver.get(), &reader);
    if (is_debug) {
        cerr << "FINISHED PARSING PLAIN BODY TEXT LINES: " << text->lineCount() << endl;
    }
}

// Tokenizes the body of a single (non-multipart) message.  The text is
// obtained from source->asText(); when that returns null nothing is
// parsed (a note goes to cerr if is_debug is set).  Otherwise the HTML or
// plain-text path is taken according to the is_html flag.
void TraditionalMailMessageParser::parseBodyText(MailMessage *source)
{
    // NOTE(review): is_html is passed to asText() uninitialized and is
    // presumably filled in by it -- confirm against MailMessage::asText.
    bool is_html;
    const AbstractMultiLineString *text = source->asText(is_html);

    if (text) {
        if (is_html) {
            parseHtmlBodyText(text);
        } else {
            parsePlainBodyText(text);
        }
        return;
    }

    if (is_debug) {
        cerr << "IGNORING NON-TEXT PART" << endl;
    }
}

// Tokenizes the charset string obtained from the message's header list.
// When that string is empty nothing happens and m_prefix is left alone;
// otherwise m_prefix is set to prefix and the string is run through
// m_textTokenizer, with tokens delivered to m_textReceiver.
void TraditionalMailMessageParser::parseCharset(MailMessage *source,
                                                const string &prefix)
{
    string charset;
    source->head()->getCharsetString(charset);

    if (charset.length() == 0) {
        return;
    }

    m_prefix = prefix;

    StringReader charset_chars(charset);
    m_textTokenizer->tokenize(m_textReceiver.get(), &charset_chars);
}

void TraditionalMailMessageParser::parseHeader(const MessageHeader *header,
                                               MimeDecoder *decoder)
{
  RCPtr<AbstractMultiLineString> lines(decoder->decodeHeaderString(header->lines()));
  SimpleMultiLineStringCharReader reader(lines.get());
  m_textTokenizer->tokenize(m_textReceiver.get(), &reader);
}

// Tokenizes the headers of source.  Header counts in the configuration
// are reset first; each header is then offered to shouldProcessHeader()
// (which is also handed m_prefix) and parsed only when that returns true.
// Finally the charset string is tokenized under the "CS_" prefix.
void TraditionalMailMessageParser::parseHeaders(MailMessage *source)
{
  MimeDecoder decoder;
  m_config->headers()->resetHeaderCounts();

  const MessageHeaderList *head = source->head();
  int index = 0;
  while (index < head->headerCount()) {
    const MessageHeader *header = head->header(index);
    ++index;

    if (!m_config->headers()->shouldProcessHeader(header->lowerName(), m_prefix)) {
      continue;
    }
    parseHeader(header, &decoder);
  }

  parseCharset(source, "CS_");
}

// Tokenizes one message: its headers first, then -- unless the
// configuration's getIgnoreBody() is set -- either every sub-message
// (recursively) when the message has parts, or the message's own body
// text otherwise.
//
// When is_debug is set, the first and last body lines are traced to
// cerr.  Both traces are guarded on the body having at least one line so
// that an empty body is never indexed at line(0) or line(-1).
void TraditionalMailMessageParser::parseBody(MailMessage *source)
{
  if (is_debug) {
    if (source->bodyText()->lineCount() > 0) {
      cerr << "parseBody: begins " << *source->bodyText()->line(0) << endl;
    } else {
      cerr << "parseBody: begins <empty body>" << endl;
    }
  }
  parseHeaders(source);

  if (!m_config->getIgnoreBody()) {
    if (source->hasParts()) {
      // Multipart: each contained message is parsed like a top-level one.
      for (int i = 0; i < source->body()->messageCount(); ++i) {
        parseBody(source->body()->message(i));
      }
    } else {
      parseBodyText(source);
    }
  }

  if (is_debug) {
    if (source->bodyText()->lineCount() > 0) {
      cerr << "parseBody: ends " << *source->bodyText()->line(source->bodyText()->lineCount() - 1) << endl;
    } else {
      cerr << "parseBody: ends <empty body>" << endl;
    }
  }
}
