/*****************************************************************
* Unipro UGENE - Integrated Bioinformatics Suite
* Copyright (C) 2008 Unipro, Russia (http://ugene.unipro.ru)
* All Rights Reserved
* 
*     This source code is distributed under the terms of the
*     GNU General Public License. See the files COPYING and LICENSE
*     for details.
*****************************************************************/

#include "EMBLGenbankAbstractDocument.h"

#include "GenbankLocationParser.h"
#include "DocumentFormatUtils.h"

#include <gobjects/DNASequenceObject.h>
#include <gobjects/GObjectUtils.h>
#include <gobjects/GObjectRelationRoles.h>

#include <core_api/IOAdapter.h>
#include <core_api/Task.h>
#include <core_api/DNAAlphabet.h>

#include <util_text/TextUtils.h>

#include <cstring>
#include <memory>

namespace GB2 {

/* TRANSLATOR GB2::EMBLGenbankAbstractDocument */    
/* TRANSLATOR GB2::IOAdapter */    

//TODO: local8bit or ascii??

// Constructs the shared EMBL/Genbank format implementation.
//   _id         - stored as the format id
//   _formatName - stored as the format name
//   mls         - stored as maxAnnotationLineLen (used when reading multi-line qualifiers)
//   p           - forwarded to the DocumentFormat base class
EMBLGenbankAbstractDocument::EMBLGenbankAbstractDocument(const DocumentFormatId& _id,
                                                         const QString& _formatName,
                                                         int mls,
                                                         QObject* p)
    : DocumentFormat(p),
      id(_id),
      formatName(_formatName),
      maxAnnotationLineLen(mls)
{
}

// Reports whether an operation on an object of type 't' is supported.
// The document and the operation kind are ignored: the answer depends only on the
// object type, and only annotation tables and DNA sequences are accepted.
bool EMBLGenbankAbstractDocument::isObjectOpSupported(const Document* d , DocumentFormat::DocObjectOp op, GObjectType t) const {
    Q_UNUSED(d);
    Q_UNUSED(op);
    if (t == GObjectTypes::ANNOTATION_TABLE) {
        return true;
    }
    return t == GObjectTypes::DNA_SEQUENCE;
}



// Checks whether this format satisfies the given constraints.
// Fails if any requested object type is something other than an annotation table or
// a DNA sequence. When the constraints ask for a raw data check, the result is whatever
// isDataFormatSupported() says about the supplied bytes; otherwise the check succeeds.
bool EMBLGenbankAbstractDocument::checkConstraints(const DocumentFormatConstraints& c) const {
    foreach (GObjectType requestedType, c.supportedObjectTypes) {
        const bool isAnnotations = (requestedType == GObjectTypes::ANNOTATION_TABLE);
        const bool isSequence = (requestedType == GObjectTypes::DNA_SEQUENCE);
        if (!isAnnotations && !isSequence) {
            return false;
        }
    }
    if (!c.checkRawData) {
        return true;
    }
    return isDataFormatSupported(c.rawData.constData(), c.rawData.size());
}


//////////////////////////////////////////////////////////////////////////
// loading

// Opens 'url' for reading through an adapter created by 'iof', parses it with load()
// and wraps the resulting objects into a new Document.
//
//   iof - factory used to create the IO adapter (also passed to the Document)
//   url - location of the document to read
//   ti  - receives the error text; its error/cancel state decides success
//   _fs - initial format settings; a private copy is updated and passed to the Document
//
// Returns the new Document, or NULL when the url cannot be opened, parsing reported
// an error, or the task was cancelled.
Document* EMBLGenbankAbstractDocument::loadExistingDocument(IOAdapterFactory* iof, const QString& url, TaskStateInfo& ti, const QVariantMap& _fs) {
    std::auto_ptr<IOAdapter> io(iof->createIOAdapter());
    if (!io->open(url, IOAdapterMode_Read)) {
        ti.error = IOAdapter::tr("error_opening_url_for_read '%1'").arg(url);
        return NULL;
    }

    QVariantMap fs = _fs;
    QList<GObject*> objects;
    QString writeLockReason;
    load(url, io.get(), objects, fs, ti, writeLockReason);
    io->close();

    if (ti.hasErrors() || ti.cancelFlag) {
        // load() may have produced objects before it failed; no Document is created
        // for them, so release them here instead of leaking.
        qDeleteAll(objects);
        return NULL;
    }

    DocumentFormatUtils::updateFormatSettings(objects, fs);
    Document* doc = new Document(this, iof, url, objects, fs, writeLockReason);
    return doc;
}


// Header tag key. load() looks it up in the header tags and, when the tag is absent,
// reports DocumentFormat::CREATED_NOT_BY_UGENE as the write lock reason.
// genObjectName() also uses the tag value as a fallback object name.
const QString EMBLGenbankAbstractDocument::UNIMARK("UNIMARK");
// Last-resort object name used by genObjectName() when the header provides no name.
const QString EMBLGenbankAbstractDocument::UNKNOWN_OBJ_NAME("_object");


void EMBLGenbankAbstractDocument::load(const QString& docUrl, IOAdapter* io, QList<GObject*>& objects, QVariantMap& fs, TaskStateInfo& si, QString& writeLockReason) {
    writeLockReason.clear();

    //get settings
    int gapSize = qBound(-1, DocumentFormatUtils::getIntSettings(fs, MERGE_MULTI_DOC_GAP_SIZE_SETTINGS, -1), 1000*1000);
    bool merge = gapSize!=-1;

    QByteArray sequence;
    QByteArray  gapSequence((merge ? gapSize : 0), 0);
    AnnotationTableObject* mergedAnnotations = NULL;
    QStringList contigs;
    QList<LRegion> mergedMapping;

    int predictedSize = qMax(100*1000, 
        DocumentFormatUtils::getIntSettings(fs, MERGE_MULTI_DOC_SEQUENCE_SIZE_SETTINGS, merge ? 0 : io->left()));

    sequence.reserve(predictedSize + (merge ? gapSize : 0));
    
	QStringList prevNames;
	
	bool toolMark = false;
    int sequenceStart = 0;
    GObjectReference sequenceRef(GObjectReference(docUrl, "", GObjectTypes::DNA_SEQUENCE));

	for (int i=0; !si.hasErrors() && !si.cancelFlag; i++) {
        if (!merge) {
            sequence.clear();
        } else if (sequence.size() > 0) {
            sequence.append(gapSequence);
            sequenceStart = sequence.size();
        }

		//reading meta info
        si.stateDesc = tr("reading_header");
		std::auto_ptr<EMBLGenbankAbstractHeader> h(readHeader(io, si));
		if (si.hasErrors() || h.get() == NULL || si.cancelFlag) {
			break;
		}
        int headerSeqLen = h->idLine->seqLen;

		toolMark = h->tags.contains(UNIMARK);
		
		// reading annotations
        si.stateDesc = tr("reading_annotations_for_%1").arg(h->idLine->name);
        QList<SharedAnnotationData> features = readAnnotations(io, si, sequence.size());
		if (si.hasErrors() || si.cancelFlag) {
			break;
		}
        AnnotationTableObject* annotationsObject  = NULL;
		if (!features.isEmpty()) {
            QString annotationName = genObjectName(prevNames, h.get(), "FT", i+1);
            if (merge && mergedAnnotations == NULL) {
                mergedAnnotations = new AnnotationTableObject(annotationName);
            }
            annotationsObject = merge ? mergedAnnotations : new AnnotationTableObject(annotationName);

            QString groupName;//TODO:
			foreach(SharedAnnotationData d, features) {
                annotationsObject->addAnnotation(new Annotation(d), groupName);
			}

            if (!merge) {
                objects.append(annotationsObject);
            }
		}
        
        //reading sequence
        si.stateDesc = tr("reading_sequence_for_%1").arg(h->idLine->name);
        sequence.reserve(sequence.size() + headerSeqLen);
        readSequence(sequence, io, si);
		if (!si.hasErrors() && !si.cancelFlag) {
            int sequenceLen = sequence.size() - sequenceStart;
            QString sequenceName = genObjectName(prevNames, h.get(), "SEQ", i+1);
            if (merge && sequenceLen == 0 && annotationsObject!=NULL) {
                si.error = tr("error_merging_no_sequence_for_annotation_object");
                break;
            } else if (merge) {
                contigs.append(sequenceName);
                mergedMapping.append(LRegion(sequenceStart, sequenceLen));
            } else if (sequenceLen!=0){
                DNASequenceObject* seqObj = DocumentFormatUtils::addSequenceObject(objects, sequenceName, sequence);
                if (annotationsObject!=NULL) {
                    sequenceRef.objName = seqObj->getGObjectName();
                    annotationsObject->addObjectRelation(GObjectRelation(sequenceRef, GObjectRelationRole::SEQUENCE));
                }
            }
        }
    }
    
    if (!si.hasErrors() && !si.cancelFlag && merge && !contigs.isEmpty()) {
        assert(sequence.size() > gapSize);
        assert(qEqual(sequence.constEnd() - gapSize, sequence.constEnd(), gapSequence.constBegin()));
        sequence.resize(sequence.size() - gapSize);//remove last gap
        DNASequenceObject* so = DocumentFormatUtils::addMergedSequenceObject(objects, docUrl, contigs, sequence, mergedMapping);
        if (mergedAnnotations!=NULL) {
            sequenceRef.objName = so->getGObjectName();
            mergedAnnotations->addObjectRelation(GObjectRelation(sequenceRef, GObjectRelationRole::SEQUENCE));
            objects.append(mergedAnnotations);
        }
    }

    if (!toolMark) {
		writeLockReason = DocumentFormat::CREATED_NOT_BY_UGENE;
    }
}

// 0-based index in a feature table line where the annotation data (location,
// '/' of a qualifier, or continuation text) starts
#define A_COL 21
// 0-based index in a feature table line where the qualifier name starts (right after '/')
#define QN_COL 22
// 0-based index in a feature table line where the annotation key starts;
// the parser expects a space here on qualifier and continuation lines
#define K_COL 5


static bool isNewQStart(const char* s, int l) {
    if (l < A_COL + 1 || s[A_COL]!='/') {
        return false;
    }
    const QBitArray& WHITES = TextUtils::WHITES;
    for (int i = QN_COL; i < l; i++) {
        char c = s[i];
        if (c == '=' && i > QN_COL) {
            return true;
        }
        if (WHITES[(uchar)c]) {
            break;
        }
    }
    return false;
}

//TODO: make it IO active -> read until the end. Otherwise qualifier is limited in size by maxSize
//
// Reads the continuation lines of a multi-line location/qualifier and appends their
// payload (everything from column A_COL on, without line breaks) to 'cbuff'.
//
// A line is a continuation when it is at least A_COL characters long, starts with the
// two fPrefix characters, has a space at K_COL and does not open a new qualifier
// (see isNewQStart). The first line that is not a continuation is pushed back to 'io'.
//
//   io                  - adapter to read from
//   cbuff               - output buffer; also used as the scratch buffer for raw lines
//   maxSize             - number of bytes available in 'cbuff'
//   _prevLineHasMaxSize - true when the preceding line was of maximum length; in that case
//                         no separating space is inserted before the first continuation
//
// Once fewer than MAX_LINE bytes remain in 'cbuff', further continuation lines are
// read and dropped, so the stored value is truncated.
// Returns the number of bytes appended to 'cbuff'.
int EMBLGenbankAbstractDocument::readMultilineQualifier(IOAdapter* io, char* cbuff, int maxSize, bool _prevLineHasMaxSize) {
    int len = 0;
    bool lineOk = true;
    static const int MAX_LINE = 256;
    int sizeToSkip = maxSize - MAX_LINE;
    const QBitArray& LINE_BREAKS = TextUtils::LINE_BREAKS;
    bool breakWords = !_prevLineHasMaxSize; //todo: create a parameter and make it depends on annotation name.
    do {
        if (len >= sizeToSkip) {
            // Not enough room left in cbuff: consume the remaining continuation lines
            // into a scratch buffer without storing them.
            QByteArray skip(MAX_LINE, 0);
            char* skipBuff = skip.data();
            do {
                int readLen = io->readUntil(skipBuff, MAX_LINE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &lineOk);
                if (!lineOk) {
                    if (readLen <= 0) {
                        // Nothing was consumed, so retrying could never make progress.
                        break;
                    }
                    continue;//todo: report error!
                }
                int lineLen = readLen;
                for (; A_COL < lineLen && LINE_BREAKS[(uchar)skipBuff[lineLen-1]]; lineLen--){}; //remove line breaks
                if (lineLen == 0 || lineLen < A_COL || skipBuff[0]!=fPrefix[0] || skipBuff[1]!=fPrefix[1] 
                    || skipBuff[K_COL]!=' ' || (skipBuff[A_COL]=='/' && isNewQStart(skipBuff, lineLen))) {
                    io->skip(-readLen);
                    break;
                }
            } while (true);
            break;
        }
        char* lineBuf = cbuff + len;
        int readLen = io->readUntil(lineBuf, maxSize-len, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &lineOk);
        int lineLen = readLen;
        for (; A_COL < lineLen && LINE_BREAKS[(uchar)lineBuf[lineLen-1]]; lineLen--){}; //remove line breaks
        if (!lineOk || lineLen == 0 || lineLen < A_COL || lineBuf[0]!=fPrefix[0]  
            || lineBuf[1]!=fPrefix[1] || lineBuf[K_COL]!=' ' || (lineBuf[A_COL]=='/' && isNewQStart(lineBuf, lineLen))) 
        {
            io->skip(-readLen);
            break;
        }
        if (breakWords && lineLen-A_COL > 0 && lineBuf[A_COL]!=' ') { //add space to separate words
            cbuff[len] = ' ';
            len++;
        }
        // Source and destination live in the same buffer and overlap whenever the payload
        // is longer than the prefix being dropped, so memmove (not memcpy) is required.
        std::memmove(cbuff + len, lineBuf + A_COL, lineLen - A_COL);
        len+=lineLen-A_COL;
        breakWords = breakWords || lineLen < maxAnnotationLineLen;
    } while (true);
    return len;
}


// Generates an object name that is not yet present in 'prevNames' and registers it there.
//
// The base name is the first non-empty value of: the "ACCESSION" tag, the ID line name,
// the UNIMARK tag, UNKNOWN_OBJ_NAME. If the base name is already taken, the candidates
// "<base>_<suffix>_<n>", "<base>_<suffix>_<n>_2", "<base>_<suffix>_<n>_3", ... are tried
// in order until a free one is found.
//
//   prevNames - names used so far; the returned name is appended to it
//   h         - header of the record the object belongs to
//   suffix    - object kind marker used for disambiguation (callers pass "FT" / "SEQ")
//   n         - record number used for disambiguation
QString EMBLGenbankAbstractDocument::genObjectName(QStringList& prevNames, const EMBLGenbankAbstractHeader* h, const QString& suffix, int n) {
    QString baseName = h->tags.value("ACCESSION");
    if (baseName.isEmpty()) {
        baseName = h->idLine->name;
    }
    if (baseName.isEmpty()) {
        baseName = h->tags.value(UNIMARK);
    }
    if (baseName.isEmpty()) {
        baseName = EMBLGenbankAbstractDocument::UNKNOWN_OBJ_NAME;
    }

    QString name = baseName;
    int n2 = 1;
    while (prevNames.contains(name)) {
        // Always derive the candidate from the base name (not from the previous candidate)
        // and advance the counter, so repeated collisions give "_2", "_3", ... instead of
        // stacking the same suffix again and again.
        name = baseName + "_" + suffix + "_" + QString::number(n) + (n2 == 1 ? QString("") : ("_"+QString::number(n2)));
        n2++;
    }
    prevNames.append(name);
    return name;
}


// Inspects a qualifier value for double-quote characters.
//
//   str          - value text (does not have to be zero terminated)
//   len          - number of characters in 'str', must be >= 0
//   outerQuotes  - set to true when the value is enclosed in quotes, i.e. it has at least
//                  two characters and both the first and the last one are '"';
//                  set to false otherwise
//   doubleQuotes - set to true when the value contains two adjacent '"' characters;
//                  left untouched otherwise, so the caller must initialize it
static void checkQuotes(const char* str, int len, bool& outerQuotes, bool& doubleQuotes) {
    const char qChar = '\"';
    assert(len>=0);
    // A value needs two distinct characters to be "enclosed": this also keeps us from
    // reading str[-1] for an empty value and from treating a lone '"' as a quoted string.
    outerQuotes = len >= 2 && str[0] == qChar && str[len-1] == qChar;
    for (int i = 1; i < len; i++) {
        if (str[i-1] == qChar && str[i] == qChar) {
            doubleQuotes = true;
            break;
        }
    }
}


// Name of the qualifier that readAnnotation() does not store as a regular qualifier
// but converts into the annotation's aminoStrand flag.
const QByteArray EMBLGenbankAbstractDocument::AMINO_STRAND_QNAME("uni_amino");
// Value of the qualifier above that readAnnotation() maps to TriState_Yes;
// any other value is mapped to TriState_No.
const QByteArray EMBLGenbankAbstractDocument::AMINO_STRAND_QVAL_YES("yes");
// Counterpart "no" value (not referenced by the reading code in this file).
const QByteArray EMBLGenbankAbstractDocument::AMINO_STRAND_QVAL_NO("no");

// Parses one feature (key, location and all its qualifiers).
//
// On entry 'cbuff' holds the first line of the feature ('len' characters): the key is
// taken from columns 5..19 and the location starts at column 21. Continuation lines of
// the location and the qualifier lines that follow are read from 'io'; the first line
// that does not belong to this feature is pushed back.
//
//   io             - adapter positioned right after the first feature line
//   cbuff          - working buffer of READ_BUFF_SIZE bytes, overwritten while parsing
//   len            - length of the feature line already stored in 'cbuff'
//   READ_BUFF_SIZE - capacity of 'cbuff'
//   si             - receives the error text on failure
//   offset         - value added to every location region when it is not 0
//
// Qualifiers are stored as name/value pairs: enclosing quotes are removed from the value
// and doubled quotes are replaced by single ones. A qualifier without '=' (a flag such as
// "/pseudo") is stored with an empty value. The AMINO_STRAND_QNAME qualifier is not stored;
// it sets the aminoStrand flag instead.
//
// Returns the parsed annotation, or an empty SharedAnnotationData when the key or the
// location is empty. If a qualifier line is reported as not ok by the adapter, si.error
// is set and the annotation parsed so far is returned.
SharedAnnotationData EMBLGenbankAbstractDocument::readAnnotation(IOAdapter* io, char* cbuff, int len, 
                                                                 int READ_BUFF_SIZE, TaskStateInfo& si, int offset) 
{
    AnnotationData* a = new AnnotationData();
    SharedAnnotationData f(a); // takes care of 'a' on every return path
    a->name = QString::fromAscii(cbuff+5, 15).trimmed();
    if (a->name.isEmpty()) {
        si.error = EMBLGenbankAbstractDocument::tr("annotation_name_is_empty");
        return SharedAnnotationData();
    }

    //location starts on offset 21 and may span several lines
    int qlen = len + readMultilineQualifier(io, cbuff+len, READ_BUFF_SIZE - len, true);
    if (qlen < 21) {
        si.error = EMBLGenbankAbstractDocument::tr("annotation_location_is_empty");
        return SharedAnnotationData();
    }

    Genbank::LocationParser::parseLocation(cbuff+21, qlen-21, a->complement, a->location);
    if (a->location.empty()) {
        si.error = EMBLGenbankAbstractDocument::tr("annotation_location_is_empty");
        return SharedAnnotationData();
    } else if (a->location.size() > 1) {
        qSort(a->location);
    }
    if (offset!=0) {
        DocumentFormatUtils::addOffset(a->location, offset);
    }

    const QBitArray& LINE_BREAKS = TextUtils::LINE_BREAKS;
    const QBitArray& WHITE_SPACES = TextUtils::WHITES;

    //here we have valid key and location;
    //reading qualifiers
    bool lineOk = true;
    while ((len = io->readUntil(cbuff, READ_BUFF_SIZE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &lineOk) ) > 0)  {
        // Anything that is not a "/qualifier" line of this feature table ends the feature.
        if (len < QN_COL+1 || cbuff[K_COL]!=' ' || cbuff[A_COL]!='/' || cbuff[0]!=fPrefix[0] || cbuff[1]!=fPrefix[1]) {
            io->skip(-len);
            break;
        }
        if (!lineOk) {
            si.error = EMBLGenbankAbstractDocument::tr("line_is_too_long_or_unexpected_oef");
            break;
        }
        for (; QN_COL < len && LINE_BREAKS[(uchar)cbuff[len-1]]; len--){}; //remove line breaks
        int flen = len + readMultilineQualifier(io, cbuff+len, READ_BUFF_SIZE-len, len == maxAnnotationLineLen);
        //now the whole feature is in cbuff

        // Find the end of the qualifier name: either the '=' or, for flag qualifiers
        // that have no value, the end of the text.
        int nameEnd = QN_COL;
        for (; nameEnd < flen && cbuff[nameEnd] != '='; nameEnd++){};
        const bool hasValue = nameEnd < flen;
        int valStart = hasValue ? nameEnd + 1 : flen; //skip '=' char
        for (; valStart < flen && WHITE_SPACES[(uchar)cbuff[flen-1]]; flen--){}; //trim value
        if (!hasValue) {
            // No '=': the whole text is the name. Trim it and use an empty value, instead of
            // treating the last name character as if it were the '=' separator.
            for (; QN_COL < nameEnd && WHITE_SPACES[(uchar)cbuff[nameEnd-1]]; nameEnd--){};
            valStart = nameEnd;
            flen = nameEnd;
        }
        const char* qname = cbuff + QN_COL;
        int qnameLen = nameEnd - QN_COL;
        const char* qval = cbuff + valStart;
        int qvalLen = flen - valStart;
        bool removeQuotes = false;
        bool containsDoubleQuotes = false;
        if (qvalLen > 0) {
            checkQuotes(qval, qvalLen, removeQuotes, containsDoubleQuotes);
        }
        if (removeQuotes && qvalLen >= 2) { // never let the length go negative
            qval++;
            qvalLen-=2;
        }
        if (qnameLen == AMINO_STRAND_QNAME.length() && TextUtils::equals(qname, AMINO_STRAND_QNAME.constData(), qnameLen)) {
            a->aminoStrand = qvalLen == AMINO_STRAND_QVAL_YES.length() && TextUtils::equals(qval, AMINO_STRAND_QVAL_YES.constData(), qvalLen) 
                ? TriState_Yes : TriState_No;
        } else {
            QString nameQStr = QString::fromAscii(qname, qnameLen);
            QString valQStr = QString::fromAscii(qval, qvalLen);
            if (containsDoubleQuotes) {
                valQStr = valQStr.replace("\"\"", "\"");
            }
            a->qualifiers.append(Qualifier(nameQStr, valQStr));
        }
    }
    return f;
}

// Reads the sequence section of a record and appends its characters to 'res'.
//
// The first line read must be at least 6 characters long and begin with
// sequenceStartPrefix; otherwise the function returns without adding anything (the line
// stays consumed). After that, lines are read until one starting with '/' is met.
// Every data line carries a position number: at the beginning of the line when the
// format id is PLAIN_GENBANK, at the end of the line otherwise. The number and all
// spaces/tabs are dropped, everything else is appended to 'res'.
//
//   res - output buffer, data is appended; cleared if the task is cancelled
//   io  - adapter to read from
//   si  - cancelFlag is observed; error and progress are written
void EMBLGenbankAbstractDocument::readSequence(QByteArray& res, IOAdapter* io, TaskStateInfo& si) {
    static int READ_BUFF_SIZE = 4096;
    QByteArray readBuffer(READ_BUFF_SIZE, '\0');
    char* buff  = readBuffer.data();

    //reading sequence
    const QBitArray& LINE_BREAKS = TextUtils::LINE_BREAKS;

    // the line that opens the sequence section
    int len = io->readUntil(buff, READ_BUFF_SIZE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include);
    if (len < 6 || !TextUtils::equals(buff, sequenceStartPrefix.constData(), sequenceStartPrefix.size())) {
        return;
    }
    // the buffer writes straight into 'res', appending to what is already there
    QBuffer writer(&res);
    writer.open( QIODevice::WriteOnly | QIODevice::Append );
    bool ok = true;
    int dataOffset = 0;
    // true: position number leads the line; false: position number trails the line
    bool numIsPrefix = getFormatId() == BaseDocumentFormats::PLAIN_GENBANK;
    while (ok && (len = io->readUntil(buff, READ_BUFF_SIZE, LINE_BREAKS, IOAdapter::Term_Include)) > 0) {
        if (si.cancelFlag) {
            res.clear();
            break;
        }
        
        while(len > 0 && LINE_BREAKS[(uchar)buff[len-1]])  { //remove line breaks from sequence
            len--;
        }

        // a line consisting of line breaks only is not allowed inside the sequence section
        if (len <= 0)  {
            si.error = tr("error_reading_sequence_abnormal_term");
            break;
        }

        if (buff[0] == '/') { //end of the sequence
            break;
        }
        
        //compute data offset
        // Walk from the numbered side of the line (front or back, see numIsPrefix) over
        // optional spaces, the digits of the number and the spaces after it. The walk stops
        // at the first non-space character met after "number + space"; dataOffset is then
        // the count of characters to cut from that side. Any other character met earlier
        // (or a line with nothing but digits/spaces) leaves dataOffset == len, which is
        // reported as an invalid format below.
        bool foundNum = false;
        bool foundSpaceAfterNum = false;
        for(dataOffset = 0 ; dataOffset < len; dataOffset++) {
            char c = numIsPrefix ? buff[dataOffset] : buff[len - dataOffset - 1];
            bool isNum = c >= '0' && c <= '9';
            bool isSpace = c == ' ' || c == '\t';
            if (!isSpace && (!isNum || foundSpaceAfterNum)) {
                if (!foundSpaceAfterNum) {
                    //unknown character -> stop iteration
                    dataOffset = len;   
                }
                break;
            }
            foundNum = foundNum || isNum;
            foundSpaceAfterNum = foundSpaceAfterNum || (isSpace && foundNum);
        }
        
        if (dataOffset == len) {
            si.error=tr("error_reading_sequence_invalid_format");    
            break;
        }
        
        //add buffer to result
        // copy the part of the line without the number, skipping spaces and tabs
        for (int i= (numIsPrefix ? dataOffset : 0), n = (numIsPrefix ? len : len -  dataOffset) ; i < n; i++) {
            char c = buff[i];
            if (c != ' ' && c != '\t') {
                ok = writer.putChar(c);
                if (!ok) {
                    break;
                }
            }
        }
        if (!ok) {
            si.error = tr("erorr_reading_sequence_enable_to_add_more_data");
            break;
        }

		si.progress = io->getProgress();
    }
    writer.close();
}


}//namespace
