// indexRWIEntry.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 20.05.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
// 
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package de.anomic.index;

import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;

/**
 * Stores the attributes of one URL reference inside an RWI collection.
 *
 * All attributes live in a single {@link kelondroRow.Entry} laid out according to
 * {@link #urlEntryRow}; the accessor methods of this class decode individual columns
 * of that row, and the combine/min/max operations modify the row in place.
 */
public class indexRWIEntry implements Cloneable {

    // row layout of one RWI entry; the column order here must match the col_* indexes below
    public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
            new kelondroColumn("h", kelondroColumn.celltype_string,    kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
            new kelondroColumn("a", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  2, "lastModified"),
            new kelondroColumn("s", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  2, "freshUntil"),
            new kelondroColumn("u", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "wordsInTitle"),
            new kelondroColumn("w", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  2, "wordsInText"),
            new kelondroColumn("p", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  2, "phrasesInText"),
            new kelondroColumn("d", kelondroColumn.celltype_binary,    kelondroColumn.encoder_bytes, 1, "doctype"),
            new kelondroColumn("l", kelondroColumn.celltype_string,    kelondroColumn.encoder_bytes, 2, "language"),
            new kelondroColumn("x", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "llocal"),
            new kelondroColumn("y", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "lother"),
            new kelondroColumn("m", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "urlLength"),
            new kelondroColumn("n", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "urlComps"),
            new kelondroColumn("g", kelondroColumn.celltype_binary,    kelondroColumn.encoder_bytes, 1, "typeofword"),
            new kelondroColumn("z", kelondroColumn.celltype_bitfield,  kelondroColumn.encoder_bytes, 4, "flags"),
            new kelondroColumn("c", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "hitcount"),
            new kelondroColumn("t", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  2, "posintext"),
            new kelondroColumn("r", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "posinphrase"),
            new kelondroColumn("o", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "posofphrase"),
            new kelondroColumn("i", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "worddistance"),
            new kelondroColumn("k", kelondroColumn.celltype_cardinal,  kelondroColumn.encoder_b256,  1, "reserve")
    },
    kelondroBase64Order.enhancedCoder,
    0);
    // column nicknames still available: b,e,j,q

    // static properties (column index; nickname; width in bytes; meaning)
    private static final int col_urlhash       =  0; // h 12 the url hash b64-encoded
    private static final int col_lastModified  =  1; // a  2 last-modified time of the document where word appears
    private static final int col_freshUntil    =  2; // s  2 TTL for the word, so it can be removed easily if the TTL is short
    private static final int col_wordsInTitle  =  3; // u  1 words in description/length (longer are better?)
    private static final int col_wordsInText   =  4; // w  2 total number of words in document
    private static final int col_phrasesInText =  5; // p  2 total number of phrases in document
    private static final int col_doctype       =  6; // d  1 type of document
    private static final int col_language      =  7; // l  2 (guessed) language of document
    private static final int col_llocal        =  8; // x  1 outlinks to same domain
    private static final int col_lother        =  9; // y  1 outlinks to other domain
    private static final int col_urlLength     = 10; // m  1 byte-length of complete URL
    private static final int col_urlComps      = 11; // n  1 number of path components

    // dynamic properties
    private static final int col_typeofword    = 12; // g  1 grammatical classification
    private static final int col_flags         = 13; // z  4 b64-encoded appearance flags (24 bit, see definition below)
    private static final int col_hitcount      = 14; // c  1 number of occurrences of this word in text
    private static final int col_posintext     = 15; // t  2 first appearance of word in text
    private static final int col_posinphrase   = 16; // r  1 position of word in its phrase
    private static final int col_posofphrase   = 17; // o  1 number of the phrase where word appears
    private static final int col_worddistance  = 18; // i  1 initial zero; may be used as reserve: is filled during search
    private static final int col_reserve       = 19; // k  1 reserve

    // appearance flags, used in RWI entry
    // the flags 0..23 are identical to the category flags in plasmaCondenser
    public  static final int flag_app_url           = 24; // word appears in url
    public  static final int flag_app_descr         = 25; // word appears in headline (or any description part)
    public  static final int flag_app_author        = 26; // word appears in author
    public  static final int flag_app_tags          = 27; // word appears in header tags
    public  static final int flag_app_reference     = 28; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
    public  static final int flag_app_emphasized    = 29; // word is emphasized in text (i.e. bold, italics, special size)

    // the row that backs this entry; every accessor below reads from or writes to it
    private kelondroRow.Entry entry;

    /**
     * Creates a new entry from individual attribute values.
     *
     * @param urlHash       the url hash; asserted to be exactly 12 characters long
     * @param urlLength     byte-length of complete URL
     * @param urlComps      number of path components
     * @param titleLength   length of the description; stored divided by 6 as a word count estimation
     * @param hitcount      how often this word appears in the text
     * @param wordcount     total number of words
     * @param phrasecount   total number of phrases
     * @param posintext     position of word in all words
     * @param posinphrase   position of word in its phrase
     * @param posofphrase   number of the phrase where word appears
     * @param worddistance  word distance; this is 0 by default, and set to the difference of posintext
     *                      from two indexes if these are combined (simultaneous search). If stored, this
     *                      shows that the result was obtained by remote search
     * @param sizeOfPage    number of bytes of the page; not needed any more and not stored
     * @param lastmodified  last-modified time of the document where word appears
     * @param updatetime    update time; used together with lastmodified to compute the freshUntil value
     * @param language      (guessed) language of document; replaced by "uk" if null or if its length
     *                      differs from the width of the language column
     * @param doctype       type of document; stored as a single byte
     * @param outlinksSame  outlinks to same domain
     * @param outlinksOther outlinks to other domain
     * @param flags         attributes to the url and to the word according the url
     */
    public indexRWIEntry(String  urlHash,
            int      urlLength,
            int      urlComps,
            int      titleLength,
            int      hitcount,
            int      wordcount,
            int      phrasecount,
            int      posintext,
            int      posinphrase,
            int      posofphrase,
            int      worddistance,
            int      sizeOfPage,
            long     lastmodified,
            long     updatetime,
            String   language,
            char     doctype,
            int      outlinksSame,
            int      outlinksOther,
            kelondroBitfield flags
    ) {

        assert (urlHash.length() == 12) : "urlhash = " + urlHash;
        if ((language == null) || (language.length() != urlEntryRow.width(col_language))) {
            language = "uk"; // fallback value that fits the language column
        }
        this.entry = urlEntryRow.newEntry();
        int mddlm = plasmaWordIndex.microDateDays(lastmodified);
        int mddct = plasmaWordIndex.microDateDays(updatetime);
        this.entry.setCol(col_urlhash, urlHash, null);
        this.entry.setCol(col_lastModified, mddlm);
        // TTL computation: last-modified plus twice the span between last-modified and update, never negative
        this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2));
        this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
        this.entry.setCol(col_wordsInText, wordcount);
        this.entry.setCol(col_phrasesInText, phrasecount);
        this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
        this.entry.setCol(col_language, language, null);
        this.entry.setCol(col_llocal, outlinksSame);
        this.entry.setCol(col_lother, outlinksOther);
        this.entry.setCol(col_urlLength, urlLength);
        this.entry.setCol(col_urlComps, urlComps);
        this.entry.setCol(col_typeofword, new byte[]{(byte) 0}); // TODO: grammatical classification
        this.entry.setCol(col_flags, flags.bytes());
        this.entry.setCol(col_hitcount, hitcount);
        this.entry.setCol(col_posintext, posintext);
        this.entry.setCol(col_posinphrase, posinphrase);
        this.entry.setCol(col_posofphrase, posofphrase);
        this.entry.setCol(col_worddistance, worddistance);
        this.entry.setCol(col_reserve, 0);
    }

    /**
     * Creates an entry from a url hash and the remaining row content.
     *
     * @param urlHash the url hash, i.e. the leading column of the row
     * @param code    the external form of the row minus the leading urlHash entry
     */
    public indexRWIEntry(String urlHash, String code) {
        // NOTE(review): getBytes() uses the platform default charset; presumably both
        // strings are plain ASCII so this is harmless - confirm against callers
        this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
    }

    /**
     * Creates an entry from the external string form of a complete row.
     *
     * @param external the string handed to {@code urlEntryRow.newEntry(external, true)}
     */
    public indexRWIEntry(String external) {
        this.entry = urlEntryRow.newEntry(external, true);
    }

    /**
     * Creates an entry from the raw bytes of a complete row.
     *
     * @param row the row bytes handed to {@code urlEntryRow.newEntry(row)}
     */
    public indexRWIEntry(byte[] row) {
        this.entry = urlEntryRow.newEntry(row);
    }

    /**
     * Wraps an existing row entry. The given object is used directly, not copied,
     * so later modifications through this instance are visible in {@code rentry}.
     *
     * @param rentry the row entry to wrap
     */
    public indexRWIEntry(kelondroRow.Entry rentry) {
        // FIXME: see if cloning is necessary
        this.entry = rentry;
    }

    /**
     * Converts a time in milliseconds into a number of whole days
     * (integer division by 86400000, the number of milliseconds per day).
     *
     * @param time time in milliseconds, e.g. milliseconds since 1.1.1970
     * @return the number of days, truncated to an int
     */
    public static int days(long time) {
        return (int) (time / 86400000);
    }

    /**
     * Creates an independent copy of this entry by copying the first
     * {@code urlEntryRow.objectsize()} bytes of the backing row into a new array.
     *
     * @return a new indexRWIEntry built from the copied bytes
     */
    public Object clone() {
        int size = urlEntryRow.objectsize();
        byte[] copy = new byte[size];
        System.arraycopy(entry.bytes(), 0, copy, 0, size);
        return new indexRWIEntry(copy);
    }

    /**
     * @return the property form of the backing row, as produced by
     *         {@code entry.toPropertyForm(true, true, false)}
     */
    public String toPropertyForm() {
        return entry.toPropertyForm(true, true, false);
    }

    /**
     * @return the backing row entry itself (not a copy)
     */
    public Entry toKelondroEntry() {
        return this.entry;
    }

    /**
     * @return the url hash column as a string
     */
    public String urlHash() {
        return this.entry.getColString(col_urlhash, null);
    }

    /**
     * @return always 0; the quality attribute is not used any more
     */
    public int quality() {
        return 0; // not used any more
    }

    /**
     * @return the raw content of the lastModified column, i.e. the time in MicroDateDays format
     */
    public int virtualAge() {
        return (int) this.entry.getColLong(col_lastModified);
    }

    /**
     * @return the lastModified column decoded with {@code plasmaWordIndex.reverseMicroDateDays}
     */
    public long lastModified() {
        return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
    }

    /**
     * @return the freshUntil column decoded with {@code plasmaWordIndex.reverseMicroDateDays}
     */
    public long freshUntil() {
        return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil));
    }

    /**
     * @return number of occurrences of this word in the text
     */
    public int hitcount() {
        return (int) this.entry.getColLong(col_hitcount);
    }

    /**
     * @return first appearance of the word in the text
     */
    public int posintext() {
        return (int) this.entry.getColLong(col_posintext);
    }

    /**
     * @return position of the word in its phrase
     */
    public int posinphrase() {
        return (int) this.entry.getColLong(col_posinphrase);
    }

    /**
     * @return number of the phrase where the word appears
     */
    public int posofphrase() {
        return (int) this.entry.getColLong(col_posofphrase);
    }

    /**
     * @return total number of words in the document
     */
    public int wordsintext() {
        return (int) this.entry.getColLong(col_wordsInText);
    }

    /**
     * @return total number of phrases in the document
     */
    public int phrasesintext() {
        return (int) this.entry.getColLong(col_phrasesInText);
    }

    /**
     * @return the content of the language column as a string
     */
    public String getLanguage() {
        return this.entry.getColString(col_language, null);
    }

    /**
     * @return the doctype column byte, cast to a char
     */
    public char getType() {
        return (char) this.entry.getColByte(col_doctype);
    }

    /**
     * @return the stored wordsInTitle value (the constructor stores titleLength / 6 here)
     */
    public int wordsintitle() {
        return (int) this.entry.getColLong(col_wordsInTitle);
    }

    /**
     * @return number of outlinks to the same domain
     */
    public int llocal() {
        return (int) this.entry.getColLong(col_llocal);
    }

    /**
     * @return number of outlinks to other domains
     */
    public int lother() {
        return (int) this.entry.getColLong(col_lother);
    }

    /**
     * @return byte-length of the complete URL
     */
    public int urllength() {
        return (int) this.entry.getColLong(col_urlLength);
    }

    /**
     * @return number of path components of the URL
     */
    public int urlcomps() {
        return (int) this.entry.getColLong(col_urlComps);
    }

    /**
     * @return a new bitfield constructed from the bytes of the flags column
     */
    public kelondroBitfield flags() {
        return new kelondroBitfield(this.entry.getColBytes(col_flags));
    }

    /**
     * @return the same string as {@link #toPropertyForm()}
     */
    public String toString() {
        return toPropertyForm();
    }

    /**
     * Combines the positional attributes of two entries into the first one.
     * The first argument is modified in place; the second is only read.
     *
     * After the call, ie1 holds:
     * worddistance = sum of both word distances plus the absolute difference of both posintext values;
     * posintext    = the smaller posintext;
     * posinphrase  = the smaller posinphrase if both words are in the same phrase, otherwise 0 (unknown);
     * posofphrase  = the smaller posofphrase;
     * wordsInText  = the integer average of both wordsintext values.
     *
     * @param ie1 the entry that is modified and returned
     * @param ie2 the entry whose values are merged into ie1
     * @return ie1
     */
    public static indexRWIEntry combineDistance(indexRWIEntry ie1, indexRWIEntry ie2) {
        // the statement order matters: each line reads columns of ie1 that later lines overwrite
        ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext()));
        ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
        // store a position inside the phrase here, not the phrase number
        ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? Math.min(ie1.posinphrase(), ie2.posinphrase()) : 0 /*unknown*/);
        ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
        ie1.entry.setCol(col_wordsInText, (ie1.wordsintext() + ie2.wordsintext()) / 2);
        return ie1;
    }

    /**
     * Instance form of {@link #combineDistance(indexRWIEntry, indexRWIEntry)}:
     * merges the positional attributes of oe into this entry.
     *
     * @param oe the other entry; only read
     */
    public void combineDistance(indexRWIEntry oe) {
        combineDistance(this, oe);
    }

    /**
     * @return the content of the worddistance column
     */
    public int worddistance() {
        return (int) this.entry.getColLong(col_worddistance);
    }

    /**
     * Lowers each numeric attribute of t to the value found in other wherever other's value
     * is smaller, so that t becomes the column-wise minimum of both entries.
     * Only t is modified.
     *
     * @param t     the entry that is updated in place
     * @param other the entry to compare against; only read
     */
    public static final void min(indexRWIEntry t, indexRWIEntry other) {
        int v;
        if (t.hitcount() > (v = other.hitcount())) t.entry.setCol(col_hitcount, v);
        if (t.wordsintext() > (v = other.wordsintext())) t.entry.setCol(col_wordsInText, v);
        if (t.phrasesintext() > (v = other.phrasesintext())) t.entry.setCol(col_phrasesInText, v);
        if (t.posintext() > (v = other.posintext())) t.entry.setCol(col_posintext, v);
        if (t.posinphrase() > (v = other.posinphrase())) t.entry.setCol(col_posinphrase, v);
        if (t.posofphrase() > (v = other.posofphrase())) t.entry.setCol(col_posofphrase, v);
        if (t.worddistance() > (v = other.worddistance())) t.entry.setCol(col_worddistance, v);
        // the lastModified column holds the MicroDateDays value (see constructor), so the raw
        // column value of other is copied; the decoded timestamp is used for the comparison only
        if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.virtualAge());
        if (t.urllength() > (v = other.urllength())) t.entry.setCol(col_urlLength, v);
        if (t.urlcomps() > (v = other.urlcomps())) t.entry.setCol(col_urlComps, v);
        if (t.wordsintitle() > (v = other.wordsintitle())) t.entry.setCol(col_wordsInTitle, v);
    }

    /**
     * Raises each numeric attribute of t to the value found in other wherever other's value
     * is larger, so that t becomes the column-wise maximum of both entries.
     * Only t is modified.
     *
     * @param t     the entry that is updated in place
     * @param other the entry to compare against; only read
     */
    public static final void max(indexRWIEntry t, indexRWIEntry other) {
        int v;
        if (t.hitcount() < (v = other.hitcount())) t.entry.setCol(col_hitcount, v);
        if (t.wordsintext() < (v = other.wordsintext())) t.entry.setCol(col_wordsInText, v);
        if (t.phrasesintext() < (v = other.phrasesintext())) t.entry.setCol(col_phrasesInText, v);
        if (t.posintext() < (v = other.posintext())) t.entry.setCol(col_posintext, v);
        if (t.posinphrase() < (v = other.posinphrase())) t.entry.setCol(col_posinphrase, v);
        if (t.posofphrase() < (v = other.posofphrase())) t.entry.setCol(col_posofphrase, v);
        if (t.worddistance() < (v = other.worddistance())) t.entry.setCol(col_worddistance, v);
        // the lastModified column holds the MicroDateDays value (see constructor), so the raw
        // column value of other is copied; the decoded timestamp is used for the comparison only
        if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.virtualAge());
        if (t.urllength() < (v = other.urllength())) t.entry.setCol(col_urlLength, v);
        if (t.urlcomps() < (v = other.urlcomps())) t.entry.setCol(col_urlComps, v);
        if (t.wordsintitle() < (v = other.wordsintitle())) t.entry.setCol(col_wordsInTitle, v);
    }

    /**
     * Instance form of {@link #min(indexRWIEntry, indexRWIEntry)}: updates this entry.
     *
     * @param other the entry to compare against; only read
     */
    public void min(indexRWIEntry other) {
        min(this, other);
    }

    /**
     * Instance form of {@link #max(indexRWIEntry, indexRWIEntry)}: updates this entry.
     *
     * @param other the entry to compare against; only read
     */
    public void max(indexRWIEntry other) {
        max(this, other);
    }

    /**
     * Tells whether this entry is newer than another one.
     *
     * @param other the entry to compare with; may be null
     * @return true if other is null or if this entry has a later lastModified time.
     *         On equal times the quality values decide, but since {@link #quality()}
     *         always returns 0 that tie-break currently never yields true.
     */
    public boolean isNewer(indexRWIEntry other) {
        if (other == null) {
            return true;
        }
        if (this.lastModified() > other.lastModified()) {
            return true;
        }
        if (this.lastModified() == other.lastModified()) {
            if (this.quality() > other.quality()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether this entry is older than another one.
     *
     * @param other the entry to compare with; may be null
     * @return false if other is null; true if this entry has an earlier lastModified time.
     *         On equal times the quality values decide, but since {@link #quality()}
     *         always returns 0 that tie-break currently never yields true.
     */
    public boolean isOlder(indexRWIEntry other) {
        if (other == null) {
            return false;
        }
        if (this.lastModified() < other.lastModified()) {
            return true;
        }
        if (this.lastModified() == other.lastModified()) {
            if (this.quality() < other.quality()) {
                return true;
            }
        }
        return false;
    }

}