diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index f0fbf3b14..c140706df 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -62,7 +62,7 @@ import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaWordIndex; -import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -153,7 +153,7 @@ public class IndexControl_p { int i = 0; urlx = new String[index.size()]; while (en.hasNext()) { - urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash(); + urlx[i++] = ((plasmaWordIndexEntryInstance) en.next()).getUrlHash(); } index = null; } @@ -254,10 +254,10 @@ public class IndexControl_p { Iterator urlIter = index.entries(); HashMap knownURLs = new HashMap(); HashSet unknownURLEntries = new HashSet(); - plasmaWordIndexEntry indexEntry; + plasmaWordIndexEntryInstance indexEntry; plasmaCrawlLURL.Entry lurl; while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntry) urlIter.next(); + indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); try { lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null); if (lurl.toString() == null) { @@ -437,9 +437,9 @@ public class IndexControl_p { int i = 0; final TreeMap tm = new TreeMap(); - plasmaWordIndexEntry xi; + plasmaWordIndexEntryInstance xi; while (en.hasNext()) { - xi = (plasmaWordIndexEntry) en.next(); + xi = (plasmaWordIndexEntryInstance) en.next(); uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())}; try { us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString(); diff --git a/htroot/index.html b/htroot/index.html index 373ade029..f388059c2 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -38,7 +38,7 @@ - more options... + more options... :: diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 03f18497e..eb8e255b5 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -52,7 +52,7 @@ import java.util.LinkedList; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -123,7 +123,7 @@ public final class transferRWI { int p; String wordHash; String urlHash; - plasmaWordIndexEntry entry; + plasmaWordIndexEntryInstance entry; int wordhashesSize = v.size(); final HashSet unknownURL = new HashSet(); String[] wordhashes = new String[v.size()]; @@ -136,7 +136,7 @@ public final class transferRWI { if (p > 0) { wordHash = estring.substring(0, p); wordhashes[received] = wordHash; - entry = new plasmaWordIndexEntry(estring.substring(p)); + entry = new plasmaWordIndexEntryInstance(estring.substring(p)); sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true); serverCore.checkInterruption(); diff --git a/source/de/anomic/index/indexEntry.java b/source/de/anomic/index/indexEntry.java new file mode 100644 index 000000000..4d3876720 --- /dev/null +++ b/source/de/anomic/index/indexEntry.java @@ -0,0 +1,44 @@ +// indexEntry.java +// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 20.05.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.index; + +public interface indexEntry { + + public Object clone(); + public String toEncodedStringForm(); + public byte[] toEncodedByteArrayForm(); + public String toPropertyForm(); + + public void combineDistance(indexEntry oe); + public void min(indexEntry other); + public void max(indexEntry other); + public void normalize(indexEntry min, indexEntry max); + public indexEntry generateNormalized(indexEntry min, indexEntry max); + public boolean isNewer(indexEntry other); + public boolean isOlder(indexEntry other); + +} diff --git a/source/de/anomic/index/indexEntryPrototype.java b/source/de/anomic/index/indexEntryPrototype.java new file mode 100644 index 000000000..89b51917d --- /dev/null +++ b/source/de/anomic/index/indexEntryPrototype.java @@ -0,0 +1,154 @@ +// indexEntryPrototype.java +// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 20.05.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.index; + +import de.anomic.plasma.plasmaURL; +import de.anomic.plasma.plasmaWordIndex; + +public abstract class indexEntryPrototype implements indexEntry { + + // the associated hash + protected String urlHash; + + // discrete values + protected int hitcount; // number of this words in file + protected int wordcount; // number of all words in the file + protected int phrasecount; // number of all phrases in the file + protected int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position + protected int posinphrase; // position within a phrase of the word + protected int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text + protected int worddistance;// distance between the words, only used if the index is artificial (from a conjunction) + protected long lastModified;// calculated by using last-modified + protected int quality; // result of a heuristic on the source file + protected byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only + protected char doctype; // type of source + protected char localflag; // indicates if the index was created locally + + public abstract Object clone(); + + public abstract String toEncodedStringForm(); + + public abstract byte[] toEncodedByteArrayForm(); + + public abstract String toPropertyForm(); + + public void combineDistance(indexEntry oe) { + this.worddistance = this.worddistance + ((indexEntryPrototype) oe).worddistance + Math.abs(this.posintext - ((indexEntryPrototype) oe).posintext); + this.posintext = Math.min(this.posintext, ((indexEntryPrototype) oe).posintext); + if (this.posofphrase != ((indexEntryPrototype) oe).posofphrase) this.posinphrase = 0; // (unknown) + this.posofphrase = Math.min(this.posofphrase, ((indexEntryPrototype) oe).posofphrase); + this.wordcount = (this.wordcount + ((indexEntryPrototype) oe).wordcount) / 2; + } + + public void min(indexEntry other) { + if (this.hitcount > ((indexEntryPrototype) other).hitcount) this.hitcount = ((indexEntryPrototype) other).hitcount; + if (this.wordcount > ((indexEntryPrototype) other).wordcount) this.wordcount = ((indexEntryPrototype) other).wordcount; + if (this.phrasecount > ((indexEntryPrototype) other).phrasecount) this.phrasecount = ((indexEntryPrototype) other).phrasecount; + if (this.posintext > ((indexEntryPrototype) other).posintext) this.posintext = ((indexEntryPrototype) other).posintext; + if (this.posinphrase > ((indexEntryPrototype) other).posinphrase) this.posinphrase = ((indexEntryPrototype) other).posinphrase; + if (this.posofphrase > ((indexEntryPrototype) other).posofphrase) this.posofphrase = ((indexEntryPrototype) other).posofphrase; + if (this.worddistance > ((indexEntryPrototype) other).worddistance) this.worddistance = ((indexEntryPrototype) other).worddistance; + if (this.lastModified > ((indexEntryPrototype) other).lastModified) this.lastModified = ((indexEntryPrototype) other).lastModified; + if (this.quality > ((indexEntryPrototype) other).quality) this.quality = ((indexEntryPrototype) other).quality; + } + + public void max(indexEntry other) { + if (this.hitcount < ((indexEntryPrototype) other).hitcount) this.hitcount = ((indexEntryPrototype) other).hitcount; + if (this.wordcount < ((indexEntryPrototype) other).wordcount) this.wordcount = ((indexEntryPrototype) other).wordcount; + if (this.phrasecount < ((indexEntryPrototype) other).phrasecount) this.phrasecount = ((indexEntryPrototype) other).phrasecount; + if (this.posintext < ((indexEntryPrototype) other).posintext) this.posintext = ((indexEntryPrototype) other).posintext; + if (this.posinphrase < ((indexEntryPrototype) other).posinphrase) this.posinphrase = ((indexEntryPrototype) other).posinphrase; + if (this.posofphrase < ((indexEntryPrototype) other).posofphrase) this.posofphrase = ((indexEntryPrototype) other).posofphrase; + if (this.worddistance < ((indexEntryPrototype) other).worddistance) this.worddistance = ((indexEntryPrototype) other).worddistance; + if (this.lastModified < ((indexEntryPrototype) other).lastModified) this.lastModified = ((indexEntryPrototype) other).lastModified; + if (this.quality < ((indexEntryPrototype) other).quality) this.quality = ((indexEntryPrototype) other).quality; + } + + public void normalize(indexEntry mi, indexEntry ma) { + indexEntryPrototype min = (indexEntryPrototype) mi; + indexEntryPrototype max = (indexEntryPrototype) ma; + this.hitcount = (this.hitcount == 0) ? 0 : 1 + 255 * (this.hitcount - min.hitcount ) / (1 + max.hitcount - min.hitcount); + this.wordcount = (this.wordcount == 0) ? 0 : 1 + 255 * (this.wordcount - min.wordcount ) / (1 + max.wordcount - min.wordcount); + this.phrasecount = (this.phrasecount == 0) ? 0 : 1 + 255 * (this.phrasecount - min.phrasecount ) / (1 + max.phrasecount - min.phrasecount); + this.posintext = (this.posintext == 0) ? 0 : 1 + 255 * (this.posintext - min.posintext ) / (1 + max.posintext - min.posintext); + this.posinphrase = (this.posinphrase == 0) ? 0 : 1 + 255 * (this.posinphrase - min.posinphrase ) / (1 + max.posinphrase - min.posinphrase); + this.posofphrase = (this.posofphrase == 0) ? 0 : 1 + 255 * (this.posofphrase - min.posofphrase ) / (1 + max.posofphrase - min.posofphrase); + this.worddistance = (this.worddistance == 0) ? 0 : 1 + 255 * (this.worddistance - min.worddistance) / (1 + max.worddistance - min.worddistance); + this.lastModified = (this.lastModified == 0) ? 0 : 1 + 255 * (this.lastModified - min.lastModified) / (1 + max.lastModified - min.lastModified); + this.quality = (this.quality == 0) ? 0 : 1 + 255 * (this.quality - min.quality ) / (1 + max.quality - min.quality); + } + + public indexEntry generateNormalized(indexEntry min, indexEntry max) { + indexEntry e = (indexEntryPrototype) this.clone(); + e.normalize(min, max); + return e; + } + + public String getUrlHash() { return urlHash; } + public int getQuality() { return quality; } + public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); } + public long getLastModified() { return lastModified; } + public int hitcount() { return hitcount; } + public int posintext() { return posintext; } + public int posinphrase() { return posinphrase; } + public int posofphrase() { return posofphrase; } + public int worddistance() { return worddistance; } + public int wordcount() { return wordcount; } + public int phrasecount() { return phrasecount; } + public String getLanguage() { return new String(language); } + public char getType() { return doctype; } + public boolean isLocal() { return localflag == indexEntryAttribute.LT_LOCAL; } + + public boolean isNewer(indexEntry other) { + if (other == null) return true; + if (this.lastModified > ((indexEntryPrototype) other).lastModified) return true; + if (this.lastModified == ((indexEntryPrototype) other).getLastModified()) { + if (this.quality > ((indexEntryPrototype) other).quality) return true; + } + return false; + } + + public boolean isOlder(indexEntry other) { + if (other == null) return false; + if (this.lastModified < ((indexEntryPrototype) other).getLastModified()) return true; + if (this.lastModified == ((indexEntryPrototype) other).getLastModified()) { + if (this.quality < ((indexEntryPrototype) other).quality) return true; + } + return false; + } + + public int domlengthNormalized() { + return 255 * plasmaURL.domLengthEstimation(this.urlHash) / 30; + } + + public static void main(String[] args) { + // outputs the word hash to a given word + if (args.length != 1) System.exit(0); + System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0])); + } + +} diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 85ebf22dc..6a4b05ad1 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -10,7 +10,7 @@ import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; -import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverDate; @@ -128,13 +128,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { // loop throug the entities of the container and get the // urlhash Iterator importWordIdxEntries = newContainer.entries(); - plasmaWordIndexEntry importWordIdxEntry; + plasmaWordIndexEntryInstance importWordIdxEntry; while (importWordIdxEntries.hasNext()) { // testing if import process was aborted if (isAborted()) break; // getting next word index entry - importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next(); + importWordIdxEntry = (plasmaWordIndexEntryInstance) importWordIdxEntries.next(); String urlHash = importWordIdxEntry.getUrlHash(); entityUrls.add(urlHash); } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 7512ea5b5..3b55a34f2 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -179,7 +179,7 @@ public final class plasmaCrawlLURL extends plasmaURL { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } - public Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException { + public Entry getEntry(String hash, plasmaWordIndexEntryInstance searchedWord) throws IOException { return new Entry(hash, searchedWord); } @@ -410,7 +410,7 @@ public final class plasmaCrawlLURL extends plasmaURL { private int size; private int wordCount; private String snippet; - private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests + private plasmaWordIndexEntryInstance word; // this is only used if the url is transported via remote search requests private boolean stored; // more needed attributes: @@ -445,7 +445,7 @@ public final class plasmaCrawlLURL extends plasmaURL { this.stored = false; } - public Entry(String urlHash, plasmaWordIndexEntry searchedWord) throws IOException { + public Entry(String urlHash, plasmaWordIndexEntryInstance searchedWord) throws IOException { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. @@ -506,7 +506,7 @@ public final class plasmaCrawlLURL extends plasmaURL { this.wordCount = Integer.parseInt(prop.getProperty("wc", "0")); this.snippet = prop.getProperty("snippet", ""); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); - this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; + this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntryInstance(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; this.stored = false; //} } catch (Exception e) { @@ -646,7 +646,7 @@ public final class plasmaCrawlLURL extends plasmaURL { return snippet; } - public plasmaWordIndexEntry word() { + public plasmaWordIndexEntryInstance word() { return word; } @@ -683,7 +683,7 @@ public final class plasmaCrawlLURL extends plasmaURL { if (this.word != null) { // append also word properties - corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toExternalForm())); + corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm())); } return corePropStr; diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index 7ef16a899..6b34dcce5 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -188,7 +188,7 @@ public class plasmaDHTChunk { Iterator wordHashIterator = wordIndex.wordHashSet(hash, resourceLevel, true, maxcount).iterator(); plasmaWordIndexEntryContainer indexContainer; Iterator urlIter; - plasmaWordIndexEntry indexEntry; + plasmaWordIndexEntryInstance indexEntry; plasmaCrawlLURL.Entry lurl; int refcount = 0; @@ -204,7 +204,7 @@ public class plasmaDHTChunk { urlIter = indexContainer.entries(); // iterate over indexes to fetch url entries and store them in the urlCache while ((urlIter.hasNext()) && (maxcount > refcount)) { - indexEntry = (plasmaWordIndexEntry) urlIter.next(); + indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); try { lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry); if ((lurl == null) || (lurl.url() == null)) { @@ -224,7 +224,7 @@ public class plasmaDHTChunk { // remove all remaining; we have enough while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntry) urlIter.next(); + indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); urlIter.remove(); } @@ -266,7 +266,7 @@ public class plasmaDHTChunk { public int deleteTransferIndexes() { Iterator urlIter; - plasmaWordIndexEntry indexEntry; + plasmaWordIndexEntryInstance indexEntry; String[] urlHashes; int count = 0; for (int i = 0; i < this.indexContainers.length; i++) { @@ -275,7 +275,7 @@ public class plasmaDHTChunk { urlHashes = new String[this.indexContainers[i].size()]; urlIter = this.indexContainers[i].entries(); while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntry) urlIter.next(); + indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); urlHashes[c++] = indexEntry.getUrlHash(); } count += wordIndex.removeEntries(this.indexContainers[i].wordHash(), urlHashes, true); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 1927210c6..b4a38b975 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -239,7 +239,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { //if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty //if (searchResult.size() == 0) return acc; // case that we have nothing to do - plasmaWordIndexEntry entry; + plasmaWordIndexEntryInstance entry; plasmaCrawlLURL.Entry page; int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); try { diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index dbfac54d0..9ea5de0c3 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -56,7 +56,7 @@ public final class plasmaSearchPreOrder { public static kelondroBinSearch[] ybrTables = null; // block-rank tables private static boolean useYBR = true; - private plasmaWordIndexEntry entryMin, entryMax; + private plasmaWordIndexEntryInstance entryMin, entryMax; private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private plasmaSearchQuery query; private plasmaSearchRankingProfile ranking; @@ -116,36 +116,36 @@ public final class plasmaSearchPreOrder { return pageAcc.size() > 0; } - public plasmaWordIndexEntry next() { + public plasmaWordIndexEntryInstance next() { Object top = pageAcc.lastKey(); - return (plasmaWordIndexEntry) pageAcc.remove(top); + return (plasmaWordIndexEntryInstance) pageAcc.remove(top); } public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) { long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; - plasmaWordIndexEntry indexEntry; + plasmaWordIndexEntryInstance indexEntry; // first pass: find min/max to obtain limits for normalization Iterator i = container.entries(); int count = 0; while (i.hasNext()) { if (System.currentTimeMillis() > limitTime) break; - indexEntry = (plasmaWordIndexEntry) i.next(); - if (entryMin == null) entryMin = (plasmaWordIndexEntry) indexEntry.clone(); else entryMin.min(indexEntry); - if (entryMax == null) entryMax = (plasmaWordIndexEntry) indexEntry.clone(); else entryMax.max(indexEntry); + indexEntry = (plasmaWordIndexEntryInstance) i.next(); + if (entryMin == null) entryMin = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMin.min(indexEntry); + if (entryMax == null) entryMax = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMax.max(indexEntry); count++; } // second pass: normalize entries and get ranking i = container.entries(); for (int j = 0; j < count; j++) { - indexEntry = (plasmaWordIndexEntry) i.next(); + indexEntry = (plasmaWordIndexEntryInstance) i.next(); pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry); } } - public plasmaWordIndexEntry[] getNormalizer() { - return new plasmaWordIndexEntry[] {entryMin, entryMax}; + public plasmaWordIndexEntryInstance[] getNormalizer() { + return new plasmaWordIndexEntryInstance[] {entryMin, entryMax}; } public static int ybr_p(String urlHash) { diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index b898bd0bb..b6608d7c6 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -46,6 +46,8 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; +import de.anomic.index.indexEntry; + public class plasmaSearchRankingProfile { // old parameters for ordering @@ -161,21 +163,23 @@ public class plasmaSearchRankingProfile { return new String(ext); } - public long preRanking(plasmaWordIndexEntry normalizedEntry) { + public long preRanking(indexEntry entry) { long ranking = 0; - - ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue(); - ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue(); - ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue(); - ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue(); - ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue(); - ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue(); - ranking += (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue(); + if (entry instanceof plasmaWordIndexEntryInstance) { + plasmaWordIndexEntryInstance normalizedEntry = (plasmaWordIndexEntryInstance) entry; + ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue(); + ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue(); + ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue(); + ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue(); + ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue(); + ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue(); + ranking += (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue(); + } return ranking; } public long postRanking( - plasmaWordIndexEntry normalizedEntry, + indexEntry normalizedEntry, plasmaSearchQuery query, Set topwords, String[] urlcomps, diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index a8353db42..bc91d9b9b 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -59,7 +59,7 @@ import de.anomic.index.indexEntryAttribute; public final class plasmaSearchResult { - private plasmaWordIndexEntry entryMin, entryMax; + private plasmaWordIndexEntryInstance entryMin, entryMax; private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects @@ -106,11 +106,11 @@ public final class plasmaSearchResult { return (plasmaCrawlLURL.Entry) pageAcc.remove(top); } - protected void addResult(plasmaWordIndexEntry indexEntry, plasmaCrawlLURL.Entry page) { + protected void addResult(plasmaWordIndexEntryInstance indexEntry, plasmaCrawlLURL.Entry page) { // make min/max for normalization - if (entryMin == null) entryMin = (plasmaWordIndexEntry) indexEntry.clone(); else entryMin.min(indexEntry); - if (entryMax == null) entryMax = (plasmaWordIndexEntry) indexEntry.clone(); else entryMax.max(indexEntry); + if (entryMin == null) entryMin = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMin.min(indexEntry); + if (entryMax == null) entryMax = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMax.max(indexEntry); // take out relevant information for reference computation URL url = page.url(); @@ -138,13 +138,13 @@ public final class plasmaSearchResult { for (int i = 0; i < references.length; i++) commonSense.add(references[i]); Object[] resultVector; - plasmaWordIndexEntry indexEntry; + plasmaWordIndexEntryInstance indexEntry; plasmaCrawlLURL.Entry page; long ranking; for (int i = 0; i < results.size(); i++) { // take out values from result array resultVector = (Object[]) results.get(i); - indexEntry = (plasmaWordIndexEntry) resultVector[0]; + indexEntry = (plasmaWordIndexEntryInstance) resultVector[0]; page = (plasmaCrawlLURL.Entry) resultVector[1]; // calculate ranking diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index f01b33d6d..27f1777a7 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1486,7 +1486,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); String wordHash = indexEntryAttribute.word2hash(word); plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash); - plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash, + plasmaWordIndexEntryInstance wordIdxEntry = new plasmaWordIndexEntryInstance(urlHash, urlLength, urlComps, wordStat.count, document.longTitle.length(), diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 71c59c209..a3ed0b87d 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -155,7 +155,7 @@ public final class plasmaWordIndex { } } - public boolean addEntry(String wordHash, plasmaWordIndexEntry entry, long updateTime, boolean dhtCase) { + public boolean addEntry(String wordHash, plasmaWordIndexEntryInstance entry, long updateTime, boolean dhtCase) { if (ramCache.addEntry(wordHash, entry, updateTime, dhtCase)) { if (!dhtCase) flushControl(); return true; @@ -237,7 +237,7 @@ public final class plasmaWordIndex { Iterator i = condenser.words(); Map.Entry wentry; String word; - plasmaWordIndexEntry ientry; + plasmaWordIndexEntryInstance ientry; plasmaCondenser.wordStatProp wprop; String wordHash; int urlLength = url.toString().length(); @@ -249,7 +249,7 @@ public final class plasmaWordIndex { wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); wordHash = indexEntryAttribute.word2hash(word); - ientry = new plasmaWordIndexEntry(urlHash, + ientry = new plasmaWordIndexEntryInstance(urlHash, urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(), wprop.count, condenser.RESULT_SIMI_WORDS, @@ -503,11 +503,11 @@ public final class plasmaWordIndex { // the combined container will fit, read the container try { Iterator entries = entity.elements(true); - plasmaWordIndexEntry entry; + plasmaWordIndexEntryInstance entry; while (entries.hasNext()) { - entry = (plasmaWordIndexEntry) entries.next(); + entry = (plasmaWordIndexEntryInstance) entries.next(); // System.out.println("ENTRY = " + entry.getUrlHash()); - container.add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis()); + container.add(new plasmaWordIndexEntryInstance[]{entry}, System.currentTimeMillis()); } // we have read all elements, now delete the entity entity.deleteComplete(); @@ -555,7 +555,7 @@ public final class plasmaWordIndex { serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); String wordHash = ""; plasmaWordIndexEntryContainer wordContainer = null; - plasmaWordIndexEntry entry = null; + plasmaWordIndexEntryInstance entry = null; URL url = null; HashSet urlHashs = new HashSet(); try { @@ -568,7 +568,7 @@ public final class plasmaWordIndex { wordHashNow = wordHash; while (containerIterator.hasNext() && run) { waiter(); - entry = (plasmaWordIndexEntry) containerIterator.next(); + entry = (plasmaWordIndexEntryInstance) containerIterator.next(); // System.out.println("Wordhash: "+wordHash+" UrlHash: // "+entry.getUrlHash()); try { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 2dd481a51..1be8b0357 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -71,7 +71,7 @@ public final class plasmaWordIndexAssortment { 4, // occurrence counter 8, // timestamp of last access indexEntryAttribute.urlHashLength, // corresponding URL hash - plasmaWordIndexEntry.attrSpace // URL attributes + plasmaWordIndexEntryInstance.encodedStringFormLength() // URL attributes }; // class variables @@ -135,11 +135,11 @@ public final class plasmaWordIndexAssortment { row[1] = kelondroRecords.long2bytes(1, 4); row[2] = kelondroRecords.long2bytes(newContainer.updated(), 8); Iterator entries = newContainer.entries(); - plasmaWordIndexEntry entry; + plasmaWordIndexEntryInstance entry; for (int i = 0; i < assortmentLength; i++) { - entry = (plasmaWordIndexEntry) entries.next(); + entry = (plasmaWordIndexEntryInstance) entries.next(); row[3 + 2 * i] = entry.getUrlHash().getBytes(); - row[4 + 2 * i] = entry.toEncodedForm().getBytes(); + row[4 + 2 * i] = entry.toEncodedStringForm().getBytes(); } byte[][] oldrow = null; try { @@ -217,7 +217,7 @@ public final class plasmaWordIndexAssortment { plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); for (int i = 0; i < assortmentLength; i++) { container.add( - new plasmaWordIndexEntry[] { new plasmaWordIndexEntry( + new plasmaWordIndexEntryInstance[] { new plasmaWordIndexEntryInstance( new String(row[3 + 2 * i]), new String(row[4 + 2 * i])) }, updateTime); } return container; diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index 73c98be6c..5ca622dc9 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -147,7 +147,7 @@ public final class plasmaWordIndexAssortmentCluster { c = new plasmaWordIndexEntryContainer(wordHash); for (int k = 0; k < j; k++) { if (i.hasNext()) { - c.add((plasmaWordIndexEntry) i.next(), newContainer.updated()); + c.add((plasmaWordIndexEntryInstance) i.next(), newContainer.updated()); } else { storeForced(wordHash, c); return; @@ -190,7 +190,7 @@ public final class plasmaWordIndexAssortmentCluster { c = new plasmaWordIndexEntryContainer(wordHash); for (int k = 0; k <= j; k++) { assert (i.hasNext()); - c.add((plasmaWordIndexEntry) i.next(), newContainer.updated()); + c.add((plasmaWordIndexEntryInstance) i.next(), newContainer.updated()); } storeForced(wordHash, c); } diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 209fb835a..39d2811cb 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -117,7 +117,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { String wordHash; plasmaWordIndexEntryContainer container; long updateTime; - plasmaWordIndexEntry wordEntry; + plasmaWordIndexEntryInstance wordEntry; byte[][] row = new byte[5][]; // write kCache, this will be melted with the wCache upon load @@ -130,12 +130,12 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if (container != null) { Iterator ci = container.entries(); while (ci.hasNext()) { - wordEntry = (plasmaWordIndexEntry) ci.next(); + wordEntry = (plasmaWordIndexEntryInstance) ci.next(); row[0] = container.wordHash().getBytes(); row[1] = kelondroRecords.long2bytes(container.size(), 4); row[2] = kelondroRecords.long2bytes(container.updated(), 8); row[3] = wordEntry.getUrlHash().getBytes(); - row[4] = wordEntry.toEncodedForm().getBytes(); + row[4] = wordEntry.toEncodedStringForm().getBytes(); dumpArray.set((int) urlcount++, row); } } @@ -159,12 +159,12 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if (container != null) { Iterator ci = container.entries(); while (ci.hasNext()) { - wordEntry = (plasmaWordIndexEntry) ci.next(); + wordEntry = (plasmaWordIndexEntryInstance) ci.next(); row[0] = wordHash.getBytes(); row[1] = kelondroRecords.long2bytes(container.size(), 4); row[2] = kelondroRecords.long2bytes(updateTime, 8); row[3] = wordEntry.getUrlHash().getBytes(); - row[4] = wordEntry.toEncodedForm().getBytes(); + row[4] = wordEntry.toEncodedStringForm().getBytes(); dumpArray.set((int) urlcount++, row); } } @@ -198,7 +198,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { int i = dumpArray.size(); String wordHash; //long creationTime; - plasmaWordIndexEntry wordEntry; + plasmaWordIndexEntryInstance wordEntry; byte[][] row; //Runtime rt = Runtime.getRuntime(); while (i-- > 0) { @@ -207,7 +207,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if ((row[0] == null) || (row[1] == null) || (row[2] == null) || (row[3] == null) || (row[4] == null)) continue; wordHash = new String(row[0], "UTF-8"); //creationTime = kelondroRecords.bytes2long(row[2]); - wordEntry = new plasmaWordIndexEntry(new String(row[3], "UTF-8"), new String(row[4], "UTF-8")); + wordEntry = new plasmaWordIndexEntryInstance(new String(row[3], "UTF-8"), new String(row[4], "UTF-8")); // store to cache addEntry(wordHash, wordEntry, startTime, false); urlCount++; @@ -450,7 +450,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { return added; } - public boolean addEntry(String wordHash, plasmaWordIndexEntry newEntry, long updateTime, boolean dhtCase) { + public boolean addEntry(String wordHash, plasmaWordIndexEntryInstance newEntry, long updateTime, boolean dhtCase) { if (dhtCase) synchronized (kCache) { // put container into kCache plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); @@ -462,7 +462,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { } else synchronized (wCache) { plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.get(wordHash); if (container == null) container = new plasmaWordIndexEntryContainer(wordHash); - plasmaWordIndexEntry[] entries = new plasmaWordIndexEntry[] { newEntry }; + plasmaWordIndexEntryInstance[] entries = new plasmaWordIndexEntryInstance[] { newEntry }; if (container.add(entries, updateTime) > 0) { wCache.put(wordHash, container); hashScore.incScore(wordHash); diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java index 02446b090..20ff95c65 100644 --- a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java +++ b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java @@ -187,10 +187,10 @@ public class plasmaWordIndexClassicDB { if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) { plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10); plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); - plasmaWordIndexEntry entry; + plasmaWordIndexEntryInstance entry; Iterator i = entity.elements(true); while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) { - entry = (plasmaWordIndexEntry) i.next(); + entry = (plasmaWordIndexEntryInstance) i.next(); container.add(entry); } return container; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index 564f9862a..4ad314f06 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -90,10 +90,10 @@ public final class plasmaWordIndexEntity { kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent); } catch (IOException e) { theLocation.delete(); - kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpace, false); + kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, plasmaURL.urlHashLength, plasmaWordIndexEntryInstance.encodedStringFormLength(), false); } else { // create new index file - kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpace, false); + kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, plasmaURL.urlHashLength, plasmaWordIndexEntryInstance.encodedStringFormLength(), false); } return kt; // everyone who get this should close it when finished! } @@ -132,27 +132,27 @@ public final class plasmaWordIndexEntity { } catch (IOException e) {} } - public plasmaWordIndexEntry getEntry(String urlhash) throws IOException { + public plasmaWordIndexEntryInstance getEntry(String urlhash) throws IOException { byte[][] n = theIndex.get(urlhash.getBytes()); if (n == null) return null; - return new plasmaWordIndexEntry(new String(n[0]), new String(n[1])); + return new plasmaWordIndexEntryInstance(new String(n[0]), new String(n[1])); } public boolean contains(String urlhash) throws IOException { return (theIndex.get(urlhash.getBytes()) != null); } - public boolean contains(plasmaWordIndexEntry entry) throws IOException { + public boolean contains(plasmaWordIndexEntryInstance entry) throws IOException { return (theIndex.get(entry.getUrlHash().getBytes()) != null); } - public boolean addEntry(plasmaWordIndexEntry entry) throws IOException { + public boolean addEntry(plasmaWordIndexEntryInstance entry) throws IOException { if (entry == null) return false; - plasmaWordIndexEntry oldEntry = getEntry(entry.getUrlHash()); + plasmaWordIndexEntryInstance oldEntry = getEntry(entry.getUrlHash()); if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity return false; } - return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm().getBytes()) == null); + return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedStringForm().getBytes()) == null); } public int addEntries(plasmaWordIndexEntryContainer container) throws IOException { @@ -167,7 +167,7 @@ public final class plasmaWordIndexEntity { if (container != null) { Iterator i = container.entries(); while (i.hasNext()) { - if (addEntry((plasmaWordIndexEntry) i.next())) count++; + if (addEntry((plasmaWordIndexEntryInstance) i.next())) count++; } } @@ -231,7 +231,7 @@ public final class plasmaWordIndexEntity { public Object next() { if (i == null) return null; byte[][] n = (byte[][]) i.next(); - return new plasmaWordIndexEntry(new String(n[0]), new String(n[1])); + return new plasmaWordIndexEntryInstance(new String(n[0]), new String(n[1])); } public void remove() { throw new UnsupportedOperationException(); @@ -251,7 +251,7 @@ public final class plasmaWordIndexEntity { long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time; try { while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) { - addEntry((plasmaWordIndexEntry) i.next()); + addEntry((plasmaWordIndexEntryInstance) i.next()); } } catch (kelondroException e) { serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage()); diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java index 62e2cbd30..9e73fbf59 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java @@ -100,16 +100,16 @@ public final class plasmaWordIndexEntryContainer { return wordHash; } - public int add(plasmaWordIndexEntry entry) { + public int add(plasmaWordIndexEntryInstance entry) { return add(entry, System.currentTimeMillis()); } - public int add(plasmaWordIndexEntry entry, long updateTime) { + public int add(plasmaWordIndexEntryInstance entry, long updateTime) { this.updateTime = java.lang.Math.max(this.updateTime, updateTime); return (addi(entry)) ? 1 : 0; } - public int add(plasmaWordIndexEntry[] entries, long updateTime) { + public int add(plasmaWordIndexEntryInstance[] entries, long updateTime) { int c = 0; for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++; this.updateTime = java.lang.Math.max(this.updateTime, updateTime); @@ -124,16 +124,16 @@ public final class plasmaWordIndexEntryContainer { int x = 0; while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) { try { - if (addi((plasmaWordIndexEntry) i.next())) x++; + if (addi((plasmaWordIndexEntryInstance) i.next())) x++; } catch (ConcurrentModificationException e) {} } this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime); return x; } - private boolean addi(plasmaWordIndexEntry entry) { + private boolean addi(plasmaWordIndexEntryInstance entry) { // returns true if the new entry was added, false if it already existed - plasmaWordIndexEntry oldEntry = (plasmaWordIndexEntry) container.put(entry.getUrlHash(), entry); + plasmaWordIndexEntryInstance oldEntry = (plasmaWordIndexEntryInstance) container.put(entry.getUrlHash(), entry); if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container container.put(entry.getUrlHash(), oldEntry); // put it back return false; @@ -145,16 +145,16 @@ public final class plasmaWordIndexEntryContainer { return container.containsKey(urlHash); } - public plasmaWordIndexEntry get(String urlHash) { - return (plasmaWordIndexEntry) container.get(urlHash); + public plasmaWordIndexEntryInstance get(String urlHash) { + return (plasmaWordIndexEntryInstance) container.get(urlHash); } - public plasmaWordIndexEntry[] getEntryArray() { - return (plasmaWordIndexEntry[]) container.values().toArray(); + public plasmaWordIndexEntryInstance[] getEntryArray() { + return (plasmaWordIndexEntryInstance[]) container.values().toArray(); } - public plasmaWordIndexEntry remove(String urlHash) { - return (plasmaWordIndexEntry) container.remove(urlHash); + public plasmaWordIndexEntryInstance remove(String urlHash) { + return (plasmaWordIndexEntryInstance) container.remove(urlHash); } public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { @@ -254,10 +254,10 @@ public final class plasmaWordIndexEntryContainer { System.out.println("DEBUG: JOIN METHOD BY TEST"); plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result Iterator se = small.entries(); - plasmaWordIndexEntry ie0, ie1; + plasmaWordIndexEntryInstance ie0, ie1; long stamp = System.currentTimeMillis(); while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { - ie0 = (plasmaWordIndexEntry) se.next(); + ie0 = (plasmaWordIndexEntryInstance) se.next(); ie1 = large.get(ie0.getUrlHash()); if (ie1 != null) { // this is a hit. Calculate word distance: @@ -276,25 +276,25 @@ public final class plasmaWordIndexEntryContainer { Iterator e2 = i2.entries(); int c; if ((e1.hasNext()) && (e2.hasNext())) { - plasmaWordIndexEntry ie1; - plasmaWordIndexEntry ie2; - ie1 = (plasmaWordIndexEntry) e1.next(); - ie2 = (plasmaWordIndexEntry) e2.next(); + plasmaWordIndexEntryInstance ie1; + plasmaWordIndexEntryInstance ie2; + ie1 = (plasmaWordIndexEntryInstance) e1.next(); + ie2 = (plasmaWordIndexEntryInstance) e2.next(); long stamp = System.currentTimeMillis(); while ((System.currentTimeMillis() - stamp) < time) { c = i1.ordering.compare(ie1.getUrlHash(), ie2.getUrlHash()); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); if (c < 0) { - if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break; + if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break; } else if (c > 0) { - if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break; + if (e2.hasNext()) ie2 = (plasmaWordIndexEntryInstance) e2.next(); else break; } else { // we have found the same urls in different searches! ie1.combineDistance(ie2); if (ie1.worddistance() <= maxDistance) conj.add(ie1); - if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break; - if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break; + if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break; + if (e2.hasNext()) ie2 = (plasmaWordIndexEntryInstance) e2.next(); else break; } } } diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntryInstance.java similarity index 61% rename from source/de/anomic/plasma/plasmaWordIndexEntry.java rename to source/de/anomic/plasma/plasmaWordIndexEntryInstance.java index 0f2834269..9dc680d7c 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntryInstance.java @@ -49,38 +49,21 @@ package de.anomic.plasma; import java.util.Properties; + +import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; +import de.anomic.index.indexEntryPrototype; import de.anomic.kelondro.kelondroBase64Order; -public final class plasmaWordIndexEntry implements Cloneable { +public final class plasmaWordIndexEntryInstance extends indexEntryPrototype implements Cloneable, indexEntry { // an wordEntry can be filled in either of two ways: // by the discrete values of the entry // or by the encoded entry-string - // the size of the index entry attributes - public static final int attrSpace = 24; - - // the associated hash - private final String urlHash; - - // discrete values - private int hitcount; // number of this words in file - private int wordcount; // number of all words in the file - private int phrasecount; // number of all phrases in the file - private int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position - private int posinphrase; // position within a phrase of the word - private int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text - private int worddistance;// distance between the words, only used if the index is artificial (from a conjunction) - private long lastModified;// calculated by using last-modified - private int quality; // result of a heuristic on the source file - private byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only - private char doctype; // type of source - private char localflag; // indicates if the index was created locally - // the class instantiation can only be done by a plasmaStore method // therefore they are all public - public plasmaWordIndexEntry(String urlHash, + public plasmaWordIndexEntryInstance(String urlHash, int urlLength, // byte-length of complete URL int urlComps, // number of path components int titleLength, // length of description/length (longer are better?) @@ -122,7 +105,7 @@ public final class plasmaWordIndexEntry implements Cloneable { this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL; } - public plasmaWordIndexEntry(String urlHash, String code) { + public plasmaWordIndexEntryInstance(String urlHash, String code) { // the code is not parsed but used later on this.urlHash = urlHash; this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8)); @@ -142,7 +125,7 @@ public final class plasmaWordIndexEntry implements Cloneable { if (phrasecount == 0) phrasecount = 100; } - public plasmaWordIndexEntry(String external) { + public plasmaWordIndexEntryInstance(String external) { // parse external form String[] elts = external.substring(1, external.length() - 1).split(","); Properties pr = new Properties(); @@ -167,13 +150,18 @@ public final class plasmaWordIndexEntry implements Cloneable { } public Object clone() { - return new plasmaWordIndexEntry(this.toExternalForm()); + return new plasmaWordIndexEntryInstance(this.toPropertyForm()); } - public String toEncodedForm() { + public static int encodedStringFormLength() { + // the size of the index entry attributes when encoded to string + return 24; + } + + public String toEncodedStringForm() { // attention: this integrates NOT the URL hash into the encoding // if you need a complete dump, use toExternalForm() - StringBuffer buf = new StringBuffer(attrSpace); + StringBuffer buf = new StringBuffer(encodedStringFormLength()); buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength)) .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) @@ -191,7 +179,16 @@ public final class plasmaWordIndexEntry implements Cloneable { return buf.toString(); } - public String toExternalForm() { + public static int encodedByteArrayFormLength() { + // the size of the index entry attributes when encoded to string + return encodedStringFormLength(); + } + + public byte[] toEncodedByteArrayForm() { + return toEncodedStringForm().getBytes(); + } + + public String toPropertyForm() { StringBuffer str = new StringBuffer(61); str.append("{") @@ -213,93 +210,6 @@ public final class plasmaWordIndexEntry implements Cloneable { return str.toString(); } - public void combineDistance(plasmaWordIndexEntry oe) { - this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext); - this.posintext = Math.min(this.posintext, oe.posintext); - if (this.posofphrase != oe.posofphrase) this.posinphrase = 0; // (unknown) - this.posofphrase = Math.min(this.posofphrase, oe.posofphrase); - this.wordcount = (this.wordcount + oe.wordcount) / 2; - } - - public void min(plasmaWordIndexEntry other) { - if (this.hitcount > other.hitcount) this.hitcount = other.hitcount; - if (this.wordcount > other.wordcount) this.wordcount = other.wordcount; - if (this.phrasecount > other.phrasecount) this.phrasecount = other.phrasecount; - if (this.posintext > other.posintext) this.posintext = other.posintext; - if (this.posinphrase > other.posinphrase) this.posinphrase = other.posinphrase; - if (this.posofphrase > other.posofphrase) this.posofphrase = other.posofphrase; - if (this.worddistance > other.worddistance) this.worddistance = other.worddistance; - if (this.lastModified > other.lastModified) this.lastModified = other.lastModified; - if (this.quality > other.quality) this.quality = other.quality; - } - - public void max(plasmaWordIndexEntry other) { - if (this.hitcount < other.hitcount) this.hitcount = other.hitcount; - if (this.wordcount < other.wordcount) this.wordcount = other.wordcount; - if (this.phrasecount < other.phrasecount) this.phrasecount = other.phrasecount; - if (this.posintext < other.posintext) this.posintext = other.posintext; - if (this.posinphrase < other.posinphrase) this.posinphrase = other.posinphrase; - if (this.posofphrase < other.posofphrase) this.posofphrase = other.posofphrase; - if (this.worddistance < other.worddistance) this.worddistance = other.worddistance; - if (this.lastModified < other.lastModified) this.lastModified = other.lastModified; - if (this.quality < other.quality) this.quality = other.quality; - } - - public void normalize(plasmaWordIndexEntry min, plasmaWordIndexEntry max) { - this.hitcount = (this.hitcount == 0) ? 0 : 1 + 255 * (this.hitcount - min.hitcount ) / (1 + max.hitcount - min.hitcount); - this.wordcount = (this.wordcount == 0) ? 0 : 1 + 255 * (this.wordcount - min.wordcount ) / (1 + max.wordcount - min.wordcount); - this.phrasecount = (this.phrasecount == 0) ? 0 : 1 + 255 * (this.phrasecount - min.phrasecount ) / (1 + max.phrasecount - min.phrasecount); - this.posintext = (this.posintext == 0) ? 0 : 1 + 255 * (this.posintext - min.posintext ) / (1 + max.posintext - min.posintext); - this.posinphrase = (this.posinphrase == 0) ? 0 : 1 + 255 * (this.posinphrase - min.posinphrase ) / (1 + max.posinphrase - min.posinphrase); - this.posofphrase = (this.posofphrase == 0) ? 0 : 1 + 255 * (this.posofphrase - min.posofphrase ) / (1 + max.posofphrase - min.posofphrase); - this.worddistance = (this.worddistance == 0) ? 0 : 1 + 255 * (this.worddistance - min.worddistance) / (1 + max.worddistance - min.worddistance); - this.lastModified = (this.lastModified == 0) ? 0 : 1 + 255 * (this.lastModified - min.lastModified) / (1 + max.lastModified - min.lastModified); - this.quality = (this.quality == 0) ? 0 : 1 + 255 * (this.quality - min.quality ) / (1 + max.quality - min.quality); - } - - public plasmaWordIndexEntry generateNormalized(plasmaWordIndexEntry min, plasmaWordIndexEntry max) { - plasmaWordIndexEntry e = (plasmaWordIndexEntry) this.clone(); - e.normalize(min, max); - return e; - } - - public String getUrlHash() { return urlHash; } - public int getQuality() { return quality; } - public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); } - public long getLastModified() { return lastModified; } - public int hitcount() { return hitcount; } - public int posintext() { return posintext; } - public int posinphrase() { return posinphrase; } - public int posofphrase() { return posofphrase; } - public int worddistance() { return worddistance; } - public int wordcount() { return wordcount; } - public int phrasecount() { return phrasecount; } - public String getLanguage() { return new String(language); } - public char getType() { return doctype; } - public boolean isLocal() { return localflag == indexEntryAttribute.LT_LOCAL; } - - public boolean isNewer(plasmaWordIndexEntry other) { - if (other == null) return true; - if (this.lastModified > other.lastModified) return true; - if (this.lastModified == other.getLastModified()) { - if (this.quality > other.quality) return true; - } - return false; - } - - public boolean isOlder(plasmaWordIndexEntry other) { - if (other == null) return false; - if (this.lastModified < other.getLastModified()) return true; - if (this.lastModified == other.getLastModified()) { - if (this.quality < other.quality) return true; - } - return false; - } - - public int domlengthNormalized() { - return 255 * plasmaURL.domLengthEstimation(this.urlHash) / 30; - } - public static void main(String[] args) { // outputs the word hash to a given word if (args.length != 1) System.exit(0); diff --git a/source/de/anomic/plasma/plasmaWordIndexInterface.java b/source/de/anomic/plasma/plasmaWordIndexInterface.java index dc47838a3..7471a6d31 100644 --- a/source/de/anomic/plasma/plasmaWordIndexInterface.java +++ b/source/de/anomic/plasma/plasmaWordIndexInterface.java @@ -55,7 +55,7 @@ public interface plasmaWordIndexInterface { public plasmaWordIndexEntryContainer deleteContainer(String wordHash); public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete); - public boolean addEntry(String wordHash, plasmaWordIndexEntry entry, long updateTime, boolean dhtCase); + public boolean addEntry(String wordHash, plasmaWordIndexEntryInstance entry, long updateTime, boolean dhtCase); public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime, boolean dhtCase); public void close(int waitingSeconds); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 7afa347b2..fc856c3c2 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -59,7 +59,7 @@ import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaSearchTimingProfile; @@ -483,10 +483,10 @@ public final class yacyClient { urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); // save the url entry - final plasmaWordIndexEntry entry; + final plasmaWordIndexEntryInstance entry; if (urlEntry.word() == null) { // the old way to define words - entry = new plasmaWordIndexEntry( + entry = new plasmaWordIndexEntryInstance( urlEntry.hash(), urlLength, urlComps, urlEntry.descr().length(), @@ -513,7 +513,7 @@ public final class yacyClient { } // add the url entry to the word indexes for (int m = 0; m < words; m++) { - container[m].add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis()); + container[m].add(new plasmaWordIndexEntryInstance[]{entry}, System.currentTimeMillis()); } } @@ -881,11 +881,11 @@ public final class yacyClient { // check if we got all necessary urls in the urlCache (only for debugging) Iterator eenum; - plasmaWordIndexEntry entry; + plasmaWordIndexEntryInstance entry; for (int i = 0; i < indexes.length; i++) { eenum = indexes[i].entries(); while (eenum.hasNext()) { - entry = (plasmaWordIndexEntry) eenum.next(); + entry = (plasmaWordIndexEntryInstance) eenum.next(); if (urlCache.get(entry.getUrlHash()) == null) { yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache"); } @@ -961,13 +961,13 @@ public final class yacyClient { int indexcount = 0; final StringBuffer entrypost = new StringBuffer(indexes.length*73); Iterator eenum; - plasmaWordIndexEntry entry; + plasmaWordIndexEntryInstance entry; for (int i = 0; i < indexes.length; i++) { eenum = indexes[i].entries(); while (eenum.hasNext()) { - entry = (plasmaWordIndexEntry) eenum.next(); + entry = (plasmaWordIndexEntryInstance) eenum.next(); entrypost.append(indexes[i].wordHash()) - .append(entry.toExternalForm()) + .append(entry.toPropertyForm()) .append(serverCore.crlfString); indexcount++; } diff --git a/source/yacy.java b/source/yacy.java index 9394c5bc7..4f54f22d3 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -85,7 +85,7 @@ import de.anomic.plasma.plasmaWordIndexAssortment; import de.anomic.plasma.plasmaWordIndexAssortmentCluster; import de.anomic.plasma.plasmaWordIndexClassicDB; import de.anomic.plasma.plasmaWordIndexEntity; -import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverCore; import de.anomic.server.serverDate; @@ -859,7 +859,7 @@ public final class yacy { // the combined container will fit, read the container Iterator importWordIdxEntries = newContainer.entries(); - plasmaWordIndexEntry importWordIdxEntry; + plasmaWordIndexEntryInstance importWordIdxEntry; while (importWordIdxEntries.hasNext()) { // testing if import process was aborted @@ -867,7 +867,7 @@ public final class yacy { // getting next word index entry entryCounter++; - importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next(); + importWordIdxEntry = (plasmaWordIndexEntryInstance) importWordIdxEntries.next(); String urlHash = importWordIdxEntry.getUrlHash(); if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try { // importing the new url @@ -970,9 +970,9 @@ public final class yacy { // the combined container will fit, read the container Iterator wordIdxEntries = wordIdxContainer.entries(); - plasmaWordIndexEntry wordIdxEntry; + plasmaWordIndexEntryInstance wordIdxEntry; while (wordIdxEntries.hasNext()) { - wordIdxEntry = (plasmaWordIndexEntry) wordIdxEntries.next(); + wordIdxEntry = (plasmaWordIndexEntryInstance) wordIdxEntries.next(); String urlHash = wordIdxEntry.getUrlHash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);