diff --git a/source/de/anomic/index/indexURLEntryNew.java b/source/de/anomic/index/indexURLEntryNew.java new file mode 100644 index 000000000..30a4a0d0a --- /dev/null +++ b/source/de/anomic/index/indexURLEntryNew.java @@ -0,0 +1,294 @@ +// indexURLEntryNew.java +// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 21.07.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +package de.anomic.index; + +import de.anomic.kelondro.kelondroColumn; +import de.anomic.kelondro.kelondroRow; +import de.anomic.kelondro.kelondroRow.Entry; +import de.anomic.plasma.plasmaWordIndex; + +public class indexURLEntryNew implements Cloneable, indexEntry { + + public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{ + new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_none, indexURL.urlHashLength, "urlhash"), + new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, indexURL.urlQualityLength, "quality"), + new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"), + new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"), + new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_none, indexURL.urlLanguageLength, "language"), + new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_none, 1, "doctype"), + new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_none, 1, "localflag"), + new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"), + new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posinphrase"), + new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posofphrase"), + new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "worddistance"), + new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "wordcount"), + new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "phrasecount") + }); + + private static final int col_urlhash = 0; + private static final int col_quality = 1; + private static final int col_lastModified = 2; + private static final int col_hitcount = 3; + private static final int col_language = 4; + private static final int col_doctype = 5; + private static final int col_localflag = 6; + private static final int col_posintext = 7; + private static final int col_posinphrase = 8; + private static final int col_posofphrase = 9; + private static final int col_worddistance = 10; + private static final int col_wordcount = 11; + private static final int col_phrasecount = 12; + + + private kelondroRow.Entry entry; + + public indexURLEntryNew(String urlHash, + int urlLength, // byte-length of complete URL + int urlComps, // number of path components + int titleLength, // length of description/length (longer are better?) + int hitcount, //*how often appears this word in the text + int wordcount, //*total number of words + int phrasecount, //*total number of phrases + int posintext, //*position of word in all words + int posinphrase, //*position of word in its phrase + int posofphrase, //*number of the phrase where word appears + int worddistance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search + int sizeOfPage, // # of bytes of the page + long lastmodified, //*last-modified time of the document where word appears + long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short + int quality, //*the entropy value + String language, //*(guessed) language of document + char doctype, //*type of document + int outlinksSame, // outlinks to same domain + int outlinksOther,// outlinks to other domain + boolean local //*flag shows that this index was generated locally; othervise its from a remote peer + ) { + + // more needed attributes: + // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc + // - boolean: URL attributes + + if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk"; + this.entry.setColString(col_urlhash, urlHash, null); + this.entry.setColLong(col_quality, quality); + this.entry.setColLong(col_lastModified, lastmodified); + this.entry.setColLong(col_hitcount, hitcount); + this.entry.setColString(col_language, language, null); + this.entry.setColByte(col_doctype, (byte) doctype); + this.entry.setColByte(col_localflag, (byte) ((local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL)); + this.entry.setColLong(col_posintext, posintext); + this.entry.setColLong(col_posinphrase, posinphrase); + this.entry.setColLong(col_posofphrase, posofphrase); + this.entry.setColLong(col_worddistance, worddistance); + this.entry.setColLong(col_wordcount, wordcount); + this.entry.setColLong(col_phrasecount, phrasecount); + } + + public indexURLEntryNew(String urlHash, String code) { + // the code is the external form of the row minus the leading urlHash entry + this.entry = urlEntryRow.newEntry((urlHash + code).getBytes()); + } + + public indexURLEntryNew(String external) { + + } + + /* + public indexURLEntryNew(kelondroRow.Entry entry) { + this.entry = entry; + } + */ + + public indexURLEntryNew(byte[] row) { + this.entry = urlEntryRow.newEntry(row); + } + + public Object clone() { + return new indexURLEntryNew(toEncodedByteArrayForm()); + } + + public String toEncodedStringForm() { + return new String(toEncodedByteArrayForm()); + } + + public byte[] toEncodedByteArrayForm() { + return entry.bytes(); + } + + public String toPropertyForm() { + return entry.toPropertyForm(); + } + + public Entry toKelondroEntry() { + return this.entry; + } + + public String urlHash() { + return this.entry.getColString(col_urlhash, null); + } + + public int quality() { + return (int) this.entry.getColLong(col_quality); + } + + public int virtualAge() { + return plasmaWordIndex.microDateDays(lastModified()); + } + + public long lastModified() { + return (int) this.entry.getColLong(col_lastModified); + } + + public int hitcount() { + return (int) this.entry.getColLong(col_hitcount); + } + + public int posintext() { + return (int) this.entry.getColLong(col_posintext); + } + + public int posinphrase() { + return (int) this.entry.getColLong(col_posinphrase); + } + + public int posofphrase() { + return (int) this.entry.getColLong(col_posofphrase); + } + + public int wordcount() { + return (int) this.entry.getColLong(col_wordcount); + } + + public int phrasecount() { + return (int) this.entry.getColLong(col_phrasecount); + } + + public String getLanguage() { + return this.entry.getColString(col_language, null); + } + + public char getType() { + return (char) this.entry.getColByte(col_doctype); + } + + public boolean isLocal() { + return this.entry.getColByte(col_localflag) == indexEntryAttribute.LT_LOCAL; + } + + public static indexURLEntryNew combineDistance(indexURLEntryNew ie1, indexEntry ie2) { + // returns a modified entry of the first argument + ie1.entry.setColLong(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext())); + ie1.entry.setColLong(col_posintext, Math.min(ie1.posintext(), ie2.posintext())); + ie1.entry.setColLong(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/); + ie1.entry.setColLong(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase())); + ie1.entry.setColLong(col_wordcount, (ie1.wordcount() + ie2.wordcount()) / 2); + return ie1; + } + + public void combineDistance(indexEntry oe) { + combineDistance(this, oe); + } + + public int worddistance() { + return (int) this.entry.getColLong(col_worddistance); + } + + public static final void min(indexURLEntryNew t, indexEntry other) { + if (t.hitcount() > other.hitcount()) t.entry.setColLong(col_hitcount, other.hitcount()); + if (t.wordcount() > other.wordcount()) t.entry.setColLong(col_wordcount, other.wordcount()); + if (t.phrasecount() > other.phrasecount()) t.entry.setColLong(col_phrasecount, other.phrasecount()); + if (t.posintext() > other.posintext()) t.entry.setColLong(col_posintext, other.posintext()); + if (t.posinphrase() > other.posinphrase()) t.entry.setColLong(col_posinphrase, other.posinphrase()); + if (t.posofphrase() > other.posofphrase()) t.entry.setColLong(col_posofphrase, other.posofphrase()); + if (t.worddistance() > other.worddistance()) t.entry.setColLong(col_worddistance, other.worddistance()); + if (t.lastModified() > other.lastModified()) t.entry.setColLong(col_lastModified, other.lastModified()); + if (t.quality() > other.quality()) t.entry.setColLong(col_quality, other.quality()); + } + + public static final void max(indexURLEntryNew t, indexEntry other) { + if (t.hitcount() < other.hitcount()) t.entry.setColLong(col_hitcount, other.hitcount()); + if (t.wordcount() < other.wordcount()) t.entry.setColLong(col_wordcount, other.wordcount()); + if (t.phrasecount() < other.phrasecount()) t.entry.setColLong(col_phrasecount, other.phrasecount()); + if (t.posintext() < other.posintext()) t.entry.setColLong(col_posintext, other.posintext()); + if (t.posinphrase() < other.posinphrase()) t.entry.setColLong(col_posinphrase, other.posinphrase()); + if (t.posofphrase() < other.posofphrase()) t.entry.setColLong(col_posofphrase, other.posofphrase()); + if (t.worddistance() < other.worddistance()) t.entry.setColLong(col_worddistance, other.worddistance()); + if (t.lastModified() < other.lastModified()) t.entry.setColLong(col_lastModified, other.lastModified()); + if (t.quality() < other.quality()) t.entry.setColLong(col_quality, other.quality()); + } + + + public void min(indexEntry other) { + min(this, other); + } + + public void max(indexEntry other) { + max(this, other); + } + + static void normalize(indexURLEntryNew t, indexEntry min, indexEntry max) { + t.entry.setColLong(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount())); + t.entry.setColLong(col_wordcount , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount())); + t.entry.setColLong(col_phrasecount , (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount())); + t.entry.setColLong(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext())); + t.entry.setColLong(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase())); + t.entry.setColLong(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase())); + t.entry.setColLong(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); + t.entry.setColLong(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified())); + t.entry.setColLong(col_quality , (t.quality() == 0) ? 0 : 1 + 255 * (t.quality() - min.quality() ) / (1 + max.quality() - min.quality())); + } + + public void normalize(indexEntry min, indexEntry max) { + normalize(this, min, max); + } + + public indexEntry generateNormalized(indexEntry min, indexEntry max) { + indexURLEntryNew e = (indexURLEntryNew) this.clone(); + e.normalize(min, max); + return e; + } + + public boolean isNewer(indexEntry other) { + if (other == null) return true; + if (this.lastModified() > other.lastModified()) return true; + if (this.lastModified() == other.lastModified()) { + if (this.quality() > other.quality()) return true; + } + return false; + } + + public boolean isOlder(indexEntry other) { + if (other == null) return false; + if (this.lastModified() < ((indexAbstractEntry) other).lastModified()) return true; + if (this.lastModified() == ((indexAbstractEntry) other).lastModified()) { + if (this.quality() < ((indexAbstractEntry) other).quality) return true; + } + return false; + } + +}