diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index f0fbf3b14..c140706df 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -62,7 +62,7 @@ import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndex;
-import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryInstance;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -153,7 +153,7 @@ public class IndexControl_p {
int i = 0;
urlx = new String[index.size()];
while (en.hasNext()) {
- urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash();
+ urlx[i++] = ((plasmaWordIndexEntryInstance) en.next()).getUrlHash();
}
index = null;
}
@@ -254,10 +254,10 @@ public class IndexControl_p {
Iterator urlIter = index.entries();
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
- plasmaWordIndexEntry indexEntry;
+ plasmaWordIndexEntryInstance indexEntry;
plasmaCrawlLURL.Entry lurl;
while (urlIter.hasNext()) {
- indexEntry = (plasmaWordIndexEntry) urlIter.next();
+ indexEntry = (plasmaWordIndexEntryInstance) urlIter.next();
try {
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null);
if (lurl.toString() == null) {
@@ -437,9 +437,9 @@ public class IndexControl_p {
int i = 0;
final TreeMap tm = new TreeMap();
- plasmaWordIndexEntry xi;
+ plasmaWordIndexEntryInstance xi;
while (en.hasNext()) {
- xi = (plasmaWordIndexEntry) en.next();
+ xi = (plasmaWordIndexEntryInstance) en.next();
uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())};
try {
us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString();
diff --git a/htroot/index.html b/htroot/index.html
index 373ade029..f388059c2 100644
--- a/htroot/index.html
+++ b/htroot/index.html
@@ -38,7 +38,7 @@
|
- more options... |
+ more options... |
::
diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java
index 03f18497e..eb8e255b5 100644
--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@@ -52,7 +52,7 @@ import java.util.LinkedList;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryInstance;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -123,7 +123,7 @@ public final class transferRWI {
int p;
String wordHash;
String urlHash;
- plasmaWordIndexEntry entry;
+ plasmaWordIndexEntryInstance entry;
int wordhashesSize = v.size();
final HashSet unknownURL = new HashSet();
String[] wordhashes = new String[v.size()];
@@ -136,7 +136,7 @@ public final class transferRWI {
if (p > 0) {
wordHash = estring.substring(0, p);
wordhashes[received] = wordHash;
- entry = new plasmaWordIndexEntry(estring.substring(p));
+ entry = new plasmaWordIndexEntryInstance(estring.substring(p));
sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true);
serverCore.checkInterruption();
diff --git a/source/de/anomic/index/indexEntry.java b/source/de/anomic/index/indexEntry.java
new file mode 100644
index 000000000..4d3876720
--- /dev/null
+++ b/source/de/anomic/index/indexEntry.java
@@ -0,0 +1,44 @@
+// indexEntry.java
+// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
+// first published 20.05.2006 on http://www.anomic.de
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.index;
+
+public interface indexEntry {
+ // one entry of the word index; the behavior notes below describe the shared implementation in indexEntryPrototype
+ public Object clone(); // callers cast the result back to their entry class and then modify it (min/max/normalize)
+ public String toEncodedStringForm(); // plasmaWordIndexAssortment stores getBytes() of this string in its rows
+ public byte[] toEncodedByteArrayForm(); // NOTE(review): presumably the byte[] counterpart of toEncodedStringForm() - no caller visible here, confirm
+ public String toPropertyForm(); // plasmaCrawlLURL base64-encodes this string into its ",word=" property
+ // mutators and comparisons; indexEntryPrototype casts every indexEntry argument to indexEntryPrototype
+ public void combineDistance(indexEntry oe); // merges the position and distance attributes of oe into this entry
+ public void min(indexEntry other); // lowers each numeric attribute to other's value where that one is smaller
+ public void max(indexEntry other); // raises each numeric attribute to other's value where that one is larger
+ public void normalize(indexEntry min, indexEntry max); // rescales the non-zero numeric attributes in place, using min and max as limits
+ public indexEntry generateNormalized(indexEntry min, indexEntry max); // same scaling as normalize(), applied to a clone() that is returned
+ public boolean isNewer(indexEntry other); // true for other == null, a later lastModified, or an equal lastModified with higher quality
+ public boolean isOlder(indexEntry other); // false for other == null; true for an earlier lastModified, or an equal one with lower quality
+
+}
diff --git a/source/de/anomic/index/indexEntryPrototype.java b/source/de/anomic/index/indexEntryPrototype.java
new file mode 100644
index 000000000..89b51917d
--- /dev/null
+++ b/source/de/anomic/index/indexEntryPrototype.java
@@ -0,0 +1,154 @@
+// indexEntryPrototype.java
+// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
+// first published 20.05.2006 on http://www.anomic.de
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.index;
+
+import de.anomic.plasma.plasmaURL;
+import de.anomic.plasma.plasmaWordIndex;
+
+public abstract class indexEntryPrototype implements indexEntry {
+    // shared base for index entries: keeps the attribute fields and implements combining, min/max folding, normalizing and age comparison
+
+    protected String urlHash;      // the associated url hash
+    protected int    hitcount;     // how often this word occurs in the file
+    protected int    wordcount;    // number of all words in the file
+    protected int    phrasecount;  // number of all phrases in the file
+    protected int    posintext;    // first position of the word in the text, counted in words; 0=unknown or irrelevant position
+    protected int    posinphrase;  // position of the word within its phrase
+    protected int    posofphrase;  // position of the phrase in the text, counted in sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text
+    protected int    worddistance; // distance between the words, only used if the index is artificial (from a conjunction)
+    protected long   lastModified; // calculated by using last-modified
+    protected int    quality;      // result of a heuristic on the source file
+    protected byte[] language;     // essentially the country code (the TLD as heuristic), two letters lowercase only
+    protected char   doctype;      // type of source
+    protected char   localflag;    // indicates if the index was created locally
+
+    // copying and the encoded/property representations are supplied by the concrete entry class
+    public abstract Object clone();
+    public abstract String toEncodedStringForm();
+    public abstract byte[] toEncodedByteArrayForm();
+    public abstract String toPropertyForm();
+
+    // folds oe (must be an indexEntryPrototype) into this entry: accumulates worddistance, keeps the smaller positions, averages wordcount
+    public void combineDistance(indexEntry oe) {
+        final indexEntryPrototype that = (indexEntryPrototype) oe;
+        this.worddistance += that.worddistance + Math.abs(this.posintext - that.posintext); // reads posintext before it is replaced below
+        this.posintext = Math.min(this.posintext, that.posintext);
+        if (this.posofphrase != that.posofphrase) this.posinphrase = 0; // (unknown)
+        this.posofphrase = Math.min(this.posofphrase, that.posofphrase);
+        this.wordcount = (this.wordcount + that.wordcount) / 2;
+    }
+
+    // lowers each numeric attribute (hitcount ... quality) to the value in other whenever other's value is smaller
+    public void min(indexEntry other) {
+        final indexEntryPrototype o = (indexEntryPrototype) other;
+        this.hitcount     = Math.min(this.hitcount,     o.hitcount);
+        this.wordcount    = Math.min(this.wordcount,    o.wordcount);
+        this.phrasecount  = Math.min(this.phrasecount,  o.phrasecount);
+        this.posintext    = Math.min(this.posintext,    o.posintext);
+        this.posinphrase  = Math.min(this.posinphrase,  o.posinphrase);
+        this.posofphrase  = Math.min(this.posofphrase,  o.posofphrase);
+        this.worddistance = Math.min(this.worddistance, o.worddistance);
+        this.lastModified = Math.min(this.lastModified, o.lastModified);
+        this.quality      = Math.min(this.quality,      o.quality);
+    }
+
+    // raises each numeric attribute (hitcount ... quality) to the value in other whenever other's value is larger
+    public void max(indexEntry other) {
+        final indexEntryPrototype o = (indexEntryPrototype) other;
+        this.hitcount     = Math.max(this.hitcount,     o.hitcount);
+        this.wordcount    = Math.max(this.wordcount,    o.wordcount);
+        this.phrasecount  = Math.max(this.phrasecount,  o.phrasecount);
+        this.posintext    = Math.max(this.posintext,    o.posintext);
+        this.posinphrase  = Math.max(this.posinphrase,  o.posinphrase);
+        this.posofphrase  = Math.max(this.posofphrase,  o.posofphrase);
+        this.worddistance = Math.max(this.worddistance, o.worddistance);
+        this.lastModified = Math.max(this.lastModified, o.lastModified);
+        this.quality      = Math.max(this.quality,      o.quality);
+    }
+
+    // maps every non-zero numeric attribute x to 1 + 255 * (x - mi.x) / (1 + ma.x - mi.x); attributes that are 0 stay 0
+    public void normalize(indexEntry mi, indexEntry ma) {
+        final indexEntryPrototype lo = (indexEntryPrototype) mi;
+        final indexEntryPrototype hi = (indexEntryPrototype) ma;
+        if (this.hitcount != 0)     this.hitcount     = 1 + 255 * (this.hitcount     - lo.hitcount)     / (1 + hi.hitcount     - lo.hitcount);
+        if (this.wordcount != 0)    this.wordcount    = 1 + 255 * (this.wordcount    - lo.wordcount)    / (1 + hi.wordcount    - lo.wordcount);
+        if (this.phrasecount != 0)  this.phrasecount  = 1 + 255 * (this.phrasecount  - lo.phrasecount)  / (1 + hi.phrasecount  - lo.phrasecount);
+        if (this.posintext != 0)    this.posintext    = 1 + 255 * (this.posintext    - lo.posintext)    / (1 + hi.posintext    - lo.posintext);
+        if (this.posinphrase != 0)  this.posinphrase  = 1 + 255 * (this.posinphrase  - lo.posinphrase)  / (1 + hi.posinphrase  - lo.posinphrase);
+        if (this.posofphrase != 0)  this.posofphrase  = 1 + 255 * (this.posofphrase  - lo.posofphrase)  / (1 + hi.posofphrase  - lo.posofphrase);
+        if (this.worddistance != 0) this.worddistance = 1 + 255 * (this.worddistance - lo.worddistance) / (1 + hi.worddistance - lo.worddistance);
+        if (this.lastModified != 0) this.lastModified = 1 + 255 * (this.lastModified - lo.lastModified) / (1 + hi.lastModified - lo.lastModified);
+        if (this.quality != 0)      this.quality      = 1 + 255 * (this.quality      - lo.quality)      / (1 + hi.quality      - lo.quality);
+    }
+
+    // returns a clone() of this entry on which normalize(min, max) has been applied; this entry itself is not passed to normalize
+    public indexEntry generateNormalized(indexEntry min, indexEntry max) {
+        final indexEntry copy = (indexEntryPrototype) this.clone();
+        copy.normalize(min, max);
+        return copy;
+    }
+
+    public String getUrlHash()      { return this.urlHash; }
+    public int    getQuality()      { return this.quality; }
+    public int    getVirtualAge()   { return plasmaWordIndex.microDateDays(this.lastModified); }
+    public long   getLastModified() { return this.lastModified; }
+    public int    hitcount()        { return this.hitcount; }
+    public int    posintext()       { return this.posintext; }
+    public int    posinphrase()     { return this.posinphrase; }
+    public int    posofphrase()     { return this.posofphrase; }
+    public int    worddistance()    { return this.worddistance; }
+    public int    wordcount()       { return this.wordcount; }
+    public int    phrasecount()     { return this.phrasecount; }
+    public String getLanguage()     { return new String(this.language); }
+    public char   getType()         { return this.doctype; }
+    public boolean isLocal()        { return this.localflag == indexEntryAttribute.LT_LOCAL; }
+
+    // true if other is null, if this entry has the later lastModified, or if both are equal and this entry has the higher quality
+    public boolean isNewer(indexEntry other) {
+        if (other == null) return true;
+        final indexEntryPrototype o = (indexEntryPrototype) other;
+        return (this.lastModified > o.lastModified) || ((this.lastModified == o.getLastModified()) && (this.quality > o.quality));
+    }
+
+    // false if other is null; true if this entry has the earlier lastModified, or if both are equal and this entry has the lower quality
+    public boolean isOlder(indexEntry other) {
+        if (other == null) return false;
+        final indexEntryPrototype o = (indexEntryPrototype) other;
+        return (this.lastModified < o.getLastModified()) || ((this.lastModified == o.getLastModified()) && (this.quality < o.quality));
+    }
+
+    public int domlengthNormalized() {
+        return 255 * plasmaURL.domLengthEstimation(this.urlHash) / 30; // estimation for the url hash scaled by 255/30 in integer arithmetic
+    }
+
+    // outputs the word hash to a given word; any argument count other than one exits without output
+    public static void main(String[] args) {
+        if (args.length != 1) System.exit(0);
+        System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0]));
+    }
+
+}
diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
index 85ebf22dc..6a4b05ad1 100644
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@@ -10,7 +10,7 @@ import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
-import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryInstance;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverDate;
@@ -128,13 +128,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// loop throug the entities of the container and get the
// urlhash
Iterator importWordIdxEntries = newContainer.entries();
- plasmaWordIndexEntry importWordIdxEntry;
+ plasmaWordIndexEntryInstance importWordIdxEntry;
while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted
if (isAborted()) break;
// getting next word index entry
- importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
+ importWordIdxEntry = (plasmaWordIndexEntryInstance) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.getUrlHash();
entityUrls.add(urlHash);
}
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index 7512ea5b5..3b55a34f2 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -179,7 +179,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
- public Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
+ public Entry getEntry(String hash, plasmaWordIndexEntryInstance searchedWord) throws IOException { // wraps new Entry(hash, searchedWord); callers elsewhere in this patch also pass null as searchedWord
return new Entry(hash, searchedWord);
}
@@ -410,7 +410,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int size;
private int wordCount;
private String snippet;
- private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests
+ private plasmaWordIndexEntryInstance word; // this is only used if the url is transported via remote search requests
private boolean stored;
// more needed attributes:
@@ -445,7 +445,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.stored = false;
}
- public Entry(String urlHash, plasmaWordIndexEntry searchedWord) throws IOException {
+ public Entry(String urlHash, plasmaWordIndexEntryInstance searchedWord) throws IOException {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@@ -506,7 +506,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
- this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
+ this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntryInstance(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
this.stored = false;
//}
} catch (Exception e) {
@@ -646,7 +646,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
return snippet;
}
- public plasmaWordIndexEntry word() {
+ public plasmaWordIndexEntryInstance word() { // plain accessor for the word field; the property-parsing constructor leaves it null when no "word" key is present
return word;
}
@@ -683,7 +683,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
if (this.word != null) {
// append also word properties
- corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toExternalForm()));
+ corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm()));
}
return corePropStr;
diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java
index 7ef16a899..6b34dcce5 100644
--- a/source/de/anomic/plasma/plasmaDHTChunk.java
+++ b/source/de/anomic/plasma/plasmaDHTChunk.java
@@ -188,7 +188,7 @@ public class plasmaDHTChunk {
Iterator wordHashIterator = wordIndex.wordHashSet(hash, resourceLevel, true, maxcount).iterator();
plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter;
- plasmaWordIndexEntry indexEntry;
+ plasmaWordIndexEntryInstance indexEntry;
plasmaCrawlLURL.Entry lurl;
int refcount = 0;
@@ -204,7 +204,7 @@ public class plasmaDHTChunk {
urlIter = indexContainer.entries();
// iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (maxcount > refcount)) {
- indexEntry = (plasmaWordIndexEntry) urlIter.next();
+ indexEntry = (plasmaWordIndexEntryInstance) urlIter.next();
try {
lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
@@ -224,7 +224,7 @@ public class plasmaDHTChunk {
// remove all remaining; we have enough
while (urlIter.hasNext()) {
- indexEntry = (plasmaWordIndexEntry) urlIter.next();
+ indexEntry = (plasmaWordIndexEntryInstance) urlIter.next();
urlIter.remove();
}
@@ -266,7 +266,7 @@ public class plasmaDHTChunk {
public int deleteTransferIndexes() {
Iterator urlIter;
- plasmaWordIndexEntry indexEntry;
+ plasmaWordIndexEntryInstance indexEntry;
String[] urlHashes;
int count = 0;
for (int i = 0; i < this.indexContainers.length; i++) {
@@ -275,7 +275,7 @@ public class plasmaDHTChunk {
urlHashes = new String[this.indexContainers[i].size()];
urlIter = this.indexContainers[i].entries();
while (urlIter.hasNext()) {
- indexEntry = (plasmaWordIndexEntry) urlIter.next();
+ indexEntry = (plasmaWordIndexEntryInstance) urlIter.next();
urlHashes[c++] = indexEntry.getUrlHash();
}
count += wordIndex.removeEntries(this.indexContainers[i].wordHash(), urlHashes, true);
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 1927210c6..b4a38b975 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -239,7 +239,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
//if (searchResult.size() == 0) return acc; // case that we have nothing to do
- plasmaWordIndexEntry entry;
+ plasmaWordIndexEntryInstance entry;
plasmaCrawlLURL.Entry page;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try {
diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java
index dbfac54d0..9ea5de0c3 100644
--- a/source/de/anomic/plasma/plasmaSearchPreOrder.java
+++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java
@@ -56,7 +56,7 @@ public final class plasmaSearchPreOrder {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true;
- private plasmaWordIndexEntry entryMin, entryMax;
+ private plasmaWordIndexEntryInstance entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
@@ -116,36 +116,36 @@ public final class plasmaSearchPreOrder {
return pageAcc.size() > 0;
}
- public plasmaWordIndexEntry next() {
+ public plasmaWordIndexEntryInstance next() { // takes the entry stored under the last (highest) key out of the pageAcc TreeMap
Object top = pageAcc.lastKey();
- return (plasmaWordIndexEntry) pageAcc.remove(top);
+ return (plasmaWordIndexEntryInstance) pageAcc.remove(top); // the entry is removed from pageAcc, so each call yields a different one
}
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
- plasmaWordIndexEntry indexEntry;
+ plasmaWordIndexEntryInstance indexEntry; // reused as the cursor of both passes below
// first pass: find min/max to obtain limits for normalization
Iterator i = container.entries();
int count = 0;
while (i.hasNext()) {
if (System.currentTimeMillis() > limitTime) break;
- indexEntry = (plasmaWordIndexEntry) i.next();
- if (entryMin == null) entryMin = (plasmaWordIndexEntry) indexEntry.clone(); else entryMin.min(indexEntry);
- if (entryMax == null) entryMax = (plasmaWordIndexEntry) indexEntry.clone(); else entryMax.max(indexEntry);
+ indexEntry = (plasmaWordIndexEntryInstance) i.next(); // the container iterator is expected to deliver plasmaWordIndexEntryInstance objects
+ if (entryMin == null) entryMin = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMin.min(indexEntry); // the first entry ever seen seeds the minimum, later ones can only lower it
+ if (entryMax == null) entryMax = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMax.max(indexEntry); // same for the maximum; both accumulators are fields and persist across calls
count++;
}
// second pass: normalize entries and get ranking
i = container.entries();
for (int j = 0; j < count; j++) {
- indexEntry = (plasmaWordIndexEntry) i.next();
+ indexEntry = (plasmaWordIndexEntryInstance) i.next(); // fresh iterator; only the count entries handled in the first pass are read again
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry);
}
}
- public plasmaWordIndexEntry[] getNormalizer() {
- return new plasmaWordIndexEntry[] {entryMin, entryMax};
+ public plasmaWordIndexEntryInstance[] getNormalizer() { // exposes the min/max accumulators that addContainer maintains
+ return new plasmaWordIndexEntryInstance[] {entryMin, entryMax}; // new two-element array: index 0 = entryMin, index 1 = entryMax (the entries themselves are not copied)
}
public static int ybr_p(String urlHash) {
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
index b898bd0bb..b6608d7c6 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
@@ -46,6 +46,8 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Set;
+import de.anomic.index.indexEntry;
+
public class plasmaSearchRankingProfile {
// old parameters for ordering
@@ -161,21 +163,23 @@ public class plasmaSearchRankingProfile {
return new String(ext);
}
- public long preRanking(plasmaWordIndexEntry normalizedEntry) {
+ public long preRanking(indexEntry entry) { // sums the attributes of entry, each one weighted by a left shift with the matching coeff value
long ranking = 0;
-
- ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue();
- ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue();
- ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue();
- ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
- ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
- ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
- ranking += (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue();
+ if (entry instanceof plasmaWordIndexEntryInstance) { // any other indexEntry implementation contributes nothing, so 0 is returned for it
+ plasmaWordIndexEntryInstance normalizedEntry = (plasmaWordIndexEntryInstance) entry;
+ ranking += (long) normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue(); // every term is widened to long before the shift so a large coefficient cannot wrap it at 32 bits
+ ranking += (long) normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue();
+ ranking += (long) plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue();
+ ranking += (normalizedEntry.posintext() == 0) ? 0 : (long) (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue(); // a value of 0 adds nothing; otherwise smaller values weigh more
+ ranking += (normalizedEntry.worddistance() == 0) ? 0 : (long) (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
+ ranking += (normalizedEntry.hitcount() == 0) ? 0 : (long) normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
+ ranking += (long) (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue();
+ }
return ranking;
}
public long postRanking(
- plasmaWordIndexEntry normalizedEntry,
+ indexEntry normalizedEntry,
plasmaSearchQuery query,
Set topwords,
String[] urlcomps,
diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java
index a8353db42..bc91d9b9b 100644
--- a/source/de/anomic/plasma/plasmaSearchResult.java
+++ b/source/de/anomic/plasma/plasmaSearchResult.java
@@ -59,7 +59,7 @@ import de.anomic.index.indexEntryAttribute;
public final class plasmaSearchResult {
- private plasmaWordIndexEntry entryMin, entryMax;
+ private plasmaWordIndexEntryInstance entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@@ -106,11 +106,11 @@ public final class plasmaSearchResult {
return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
}
- protected void addResult(plasmaWordIndexEntry indexEntry, plasmaCrawlLURL.Entry page) {
+ protected void addResult(plasmaWordIndexEntryInstance indexEntry, plasmaCrawlLURL.Entry page) {
// make min/max for normalization
- if (entryMin == null) entryMin = (plasmaWordIndexEntry) indexEntry.clone(); else entryMin.min(indexEntry);
- if (entryMax == null) entryMax = (plasmaWordIndexEntry) indexEntry.clone(); else entryMax.max(indexEntry);
+ if (entryMin == null) entryMin = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMin.min(indexEntry);
+ if (entryMax == null) entryMax = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMax.max(indexEntry);
// take out relevant information for reference computation
URL url = page.url();
@@ -138,13 +138,13 @@ public final class plasmaSearchResult {
for (int i = 0; i < references.length; i++) commonSense.add(references[i]);
Object[] resultVector;
- plasmaWordIndexEntry indexEntry;
+ plasmaWordIndexEntryInstance indexEntry;
plasmaCrawlLURL.Entry page;
long ranking;
for (int i = 0; i < results.size(); i++) {
// take out values from result array
resultVector = (Object[]) results.get(i);
- indexEntry = (plasmaWordIndexEntry) resultVector[0];
+ indexEntry = (plasmaWordIndexEntryInstance) resultVector[0];
page = (plasmaCrawlLURL.Entry) resultVector[1];
// calculate ranking
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index f01b33d6d..27f1777a7 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1486,7 +1486,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = indexEntryAttribute.word2hash(word);
plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash);
- plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
+ plasmaWordIndexEntryInstance wordIdxEntry = new plasmaWordIndexEntryInstance(urlHash,
urlLength, urlComps,
wordStat.count,
document.longTitle.length(),
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 71c59c209..a3ed0b87d 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -155,7 +155,7 @@ public final class plasmaWordIndex {
}
}
- public boolean addEntry(String wordHash, plasmaWordIndexEntry entry, long updateTime, boolean dhtCase) {
+ public boolean addEntry(String wordHash, plasmaWordIndexEntryInstance entry, long updateTime, boolean dhtCase) {
if (ramCache.addEntry(wordHash, entry, updateTime, dhtCase)) {
if (!dhtCase) flushControl();
return true;
@@ -237,7 +237,7 @@ public final class plasmaWordIndex {
Iterator i = condenser.words();
Map.Entry wentry;
String word;
- plasmaWordIndexEntry ientry;
+ plasmaWordIndexEntryInstance ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
int urlLength = url.toString().length();
@@ -249,7 +249,7 @@ public final class plasmaWordIndex {
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = indexEntryAttribute.word2hash(word);
- ientry = new plasmaWordIndexEntry(urlHash,
+ ientry = new plasmaWordIndexEntryInstance(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(),
wprop.count,
condenser.RESULT_SIMI_WORDS,
@@ -503,11 +503,11 @@ public final class plasmaWordIndex {
// the combined container will fit, read the container
try {
Iterator entries = entity.elements(true);
- plasmaWordIndexEntry entry;
+ plasmaWordIndexEntryInstance entry;
while (entries.hasNext()) {
- entry = (plasmaWordIndexEntry) entries.next();
+ entry = (plasmaWordIndexEntryInstance) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
- container.add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis());
+ container.add(new plasmaWordIndexEntryInstance[]{entry}, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
@@ -555,7 +555,7 @@ public final class plasmaWordIndex {
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
String wordHash = "";
plasmaWordIndexEntryContainer wordContainer = null;
- plasmaWordIndexEntry entry = null;
+ plasmaWordIndexEntryInstance entry = null;
URL url = null;
HashSet urlHashs = new HashSet();
try {
@@ -568,7 +568,7 @@ public final class plasmaWordIndex {
wordHashNow = wordHash;
while (containerIterator.hasNext() && run) {
waiter();
- entry = (plasmaWordIndexEntry) containerIterator.next();
+ entry = (plasmaWordIndexEntryInstance) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
try {
diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java
index 2dd481a51..1be8b0357 100644
--- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java
+++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java
@@ -71,7 +71,7 @@ public final class plasmaWordIndexAssortment {
4, // occurrence counter
8, // timestamp of last access
indexEntryAttribute.urlHashLength, // corresponding URL hash
- plasmaWordIndexEntry.attrSpace // URL attributes
+ plasmaWordIndexEntryInstance.encodedStringFormLength() // URL attributes
};
// class variables
@@ -135,11 +135,11 @@ public final class plasmaWordIndexAssortment {
row[1] = kelondroRecords.long2bytes(1, 4);
row[2] = kelondroRecords.long2bytes(newContainer.updated(), 8);
Iterator entries = newContainer.entries();
- plasmaWordIndexEntry entry;
+ plasmaWordIndexEntryInstance entry;
for (int i = 0; i < assortmentLength; i++) {
- entry = (plasmaWordIndexEntry) entries.next();
+ entry = (plasmaWordIndexEntryInstance) entries.next();
row[3 + 2 * i] = entry.getUrlHash().getBytes();
- row[4 + 2 * i] = entry.toEncodedForm().getBytes();
+ row[4 + 2 * i] = entry.toEncodedStringForm().getBytes();
}
byte[][] oldrow = null;
try {
@@ -217,7 +217,7 @@ public final class plasmaWordIndexAssortment {
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash);
for (int i = 0; i < assortmentLength; i++) {
container.add(
- new plasmaWordIndexEntry[] { new plasmaWordIndexEntry(
+ new plasmaWordIndexEntryInstance[] { new plasmaWordIndexEntryInstance(
new String(row[3 + 2 * i]), new String(row[4 + 2 * i])) }, updateTime);
}
return container;
diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
index 73c98be6c..5ca622dc9 100644
--- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
+++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
@@ -147,7 +147,7 @@ public final class plasmaWordIndexAssortmentCluster {
c = new plasmaWordIndexEntryContainer(wordHash);
for (int k = 0; k < j; k++) {
if (i.hasNext()) {
- c.add((plasmaWordIndexEntry) i.next(), newContainer.updated());
+ c.add((plasmaWordIndexEntryInstance) i.next(), newContainer.updated());
} else {
storeForced(wordHash, c);
return;
@@ -190,7 +190,7 @@ public final class plasmaWordIndexAssortmentCluster {
c = new plasmaWordIndexEntryContainer(wordHash);
for (int k = 0; k <= j; k++) {
assert (i.hasNext());
- c.add((plasmaWordIndexEntry) i.next(), newContainer.updated());
+ c.add((plasmaWordIndexEntryInstance) i.next(), newContainer.updated());
}
storeForced(wordHash, c);
}
diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java
index 209fb835a..39d2811cb 100644
--- a/source/de/anomic/plasma/plasmaWordIndexCache.java
+++ b/source/de/anomic/plasma/plasmaWordIndexCache.java
@@ -117,7 +117,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
String wordHash;
plasmaWordIndexEntryContainer container;
long updateTime;
- plasmaWordIndexEntry wordEntry;
+ plasmaWordIndexEntryInstance wordEntry;
byte[][] row = new byte[5][];
// write kCache, this will be melted with the wCache upon load
@@ -130,12 +130,12 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
if (container != null) {
Iterator ci = container.entries();
while (ci.hasNext()) {
- wordEntry = (plasmaWordIndexEntry) ci.next();
+ wordEntry = (plasmaWordIndexEntryInstance) ci.next();
row[0] = container.wordHash().getBytes();
row[1] = kelondroRecords.long2bytes(container.size(), 4);
row[2] = kelondroRecords.long2bytes(container.updated(), 8);
row[3] = wordEntry.getUrlHash().getBytes();
- row[4] = wordEntry.toEncodedForm().getBytes();
+ row[4] = wordEntry.toEncodedStringForm().getBytes();
dumpArray.set((int) urlcount++, row);
}
}
@@ -159,12 +159,12 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
if (container != null) {
Iterator ci = container.entries();
while (ci.hasNext()) {
- wordEntry = (plasmaWordIndexEntry) ci.next();
+ wordEntry = (plasmaWordIndexEntryInstance) ci.next();
row[0] = wordHash.getBytes();
row[1] = kelondroRecords.long2bytes(container.size(), 4);
row[2] = kelondroRecords.long2bytes(updateTime, 8);
row[3] = wordEntry.getUrlHash().getBytes();
- row[4] = wordEntry.toEncodedForm().getBytes();
+ row[4] = wordEntry.toEncodedStringForm().getBytes();
dumpArray.set((int) urlcount++, row);
}
}
@@ -198,7 +198,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
int i = dumpArray.size();
String wordHash;
//long creationTime;
- plasmaWordIndexEntry wordEntry;
+ plasmaWordIndexEntryInstance wordEntry;
byte[][] row;
//Runtime rt = Runtime.getRuntime();
while (i-- > 0) {
@@ -207,7 +207,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
if ((row[0] == null) || (row[1] == null) || (row[2] == null) || (row[3] == null) || (row[4] == null)) continue;
wordHash = new String(row[0], "UTF-8");
//creationTime = kelondroRecords.bytes2long(row[2]);
- wordEntry = new plasmaWordIndexEntry(new String(row[3], "UTF-8"), new String(row[4], "UTF-8"));
+ wordEntry = new plasmaWordIndexEntryInstance(new String(row[3], "UTF-8"), new String(row[4], "UTF-8"));
// store to cache
addEntry(wordHash, wordEntry, startTime, false);
urlCount++;
@@ -450,7 +450,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
return added;
}
- public boolean addEntry(String wordHash, plasmaWordIndexEntry newEntry, long updateTime, boolean dhtCase) {
+ public boolean addEntry(String wordHash, plasmaWordIndexEntryInstance newEntry, long updateTime, boolean dhtCase) {
if (dhtCase) synchronized (kCache) {
// put container into kCache
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash);
@@ -462,7 +462,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
} else synchronized (wCache) {
plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.get(wordHash);
if (container == null) container = new plasmaWordIndexEntryContainer(wordHash);
- plasmaWordIndexEntry[] entries = new plasmaWordIndexEntry[] { newEntry };
+ plasmaWordIndexEntryInstance[] entries = new plasmaWordIndexEntryInstance[] { newEntry };
if (container.add(entries, updateTime) > 0) {
wCache.put(wordHash, container);
hashScore.incScore(wordHash);
diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java
index 02446b090..20ff95c65 100644
--- a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java
+++ b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java
@@ -187,10 +187,10 @@ public class plasmaWordIndexClassicDB {
if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) {
plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash);
- plasmaWordIndexEntry entry;
+ plasmaWordIndexEntryInstance entry;
Iterator i = entity.elements(true);
while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
- entry = (plasmaWordIndexEntry) i.next();
+ entry = (plasmaWordIndexEntryInstance) i.next();
container.add(entry);
}
return container;
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java
index 564f9862a..4ad314f06 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntity.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java
@@ -90,10 +90,10 @@ public final class plasmaWordIndexEntity {
kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent);
} catch (IOException e) {
theLocation.delete();
- kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpace, false);
+ kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, plasmaURL.urlHashLength, plasmaWordIndexEntryInstance.encodedStringFormLength(), false);
} else {
// create new index file
- kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpace, false);
+ kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, plasmaURL.urlHashLength, plasmaWordIndexEntryInstance.encodedStringFormLength(), false);
}
return kt; // everyone who get this should close it when finished!
}
@@ -132,27 +132,27 @@ public final class plasmaWordIndexEntity {
} catch (IOException e) {}
}
- public plasmaWordIndexEntry getEntry(String urlhash) throws IOException {
+ public plasmaWordIndexEntryInstance getEntry(String urlhash) throws IOException {
byte[][] n = theIndex.get(urlhash.getBytes());
if (n == null) return null;
- return new plasmaWordIndexEntry(new String(n[0]), new String(n[1]));
+ return new plasmaWordIndexEntryInstance(new String(n[0]), new String(n[1]));
}
public boolean contains(String urlhash) throws IOException {
return (theIndex.get(urlhash.getBytes()) != null);
}
- public boolean contains(plasmaWordIndexEntry entry) throws IOException {
+ public boolean contains(plasmaWordIndexEntryInstance entry) throws IOException {
return (theIndex.get(entry.getUrlHash().getBytes()) != null);
}
- public boolean addEntry(plasmaWordIndexEntry entry) throws IOException {
+ public boolean addEntry(plasmaWordIndexEntryInstance entry) throws IOException {
if (entry == null) return false;
- plasmaWordIndexEntry oldEntry = getEntry(entry.getUrlHash());
+ plasmaWordIndexEntryInstance oldEntry = getEntry(entry.getUrlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
return false;
}
- return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm().getBytes()) == null);
+ return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedStringForm().getBytes()) == null);
}
public int addEntries(plasmaWordIndexEntryContainer container) throws IOException {
@@ -167,7 +167,7 @@ public final class plasmaWordIndexEntity {
if (container != null) {
Iterator i = container.entries();
while (i.hasNext()) {
- if (addEntry((plasmaWordIndexEntry) i.next())) count++;
+ if (addEntry((plasmaWordIndexEntryInstance) i.next())) count++;
}
}
@@ -231,7 +231,7 @@ public final class plasmaWordIndexEntity {
public Object next() {
if (i == null) return null;
byte[][] n = (byte[][]) i.next();
- return new plasmaWordIndexEntry(new String(n[0]), new String(n[1]));
+ return new plasmaWordIndexEntryInstance(new String(n[0]), new String(n[1]));
}
public void remove() {
throw new UnsupportedOperationException();
@@ -251,7 +251,7 @@ public final class plasmaWordIndexEntity {
long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time;
try {
while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) {
- addEntry((plasmaWordIndexEntry) i.next());
+ addEntry((plasmaWordIndexEntryInstance) i.next());
}
} catch (kelondroException e) {
serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage());
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
index 62e2cbd30..9e73fbf59 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
@@ -100,16 +100,16 @@ public final class plasmaWordIndexEntryContainer {
return wordHash;
}
- public int add(plasmaWordIndexEntry entry) {
+ public int add(plasmaWordIndexEntryInstance entry) {
return add(entry, System.currentTimeMillis());
}
- public int add(plasmaWordIndexEntry entry, long updateTime) {
+ public int add(plasmaWordIndexEntryInstance entry, long updateTime) {
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return (addi(entry)) ? 1 : 0;
}
- public int add(plasmaWordIndexEntry[] entries, long updateTime) {
+ public int add(plasmaWordIndexEntryInstance[] entries, long updateTime) {
int c = 0;
for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++;
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
@@ -124,16 +124,16 @@ public final class plasmaWordIndexEntryContainer {
int x = 0;
while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) {
try {
- if (addi((plasmaWordIndexEntry) i.next())) x++;
+ if (addi((plasmaWordIndexEntryInstance) i.next())) x++;
} catch (ConcurrentModificationException e) {}
}
this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime);
return x;
}
- private boolean addi(plasmaWordIndexEntry entry) {
+ private boolean addi(plasmaWordIndexEntryInstance entry) {
// returns true if the new entry was added, false if it already existed
- plasmaWordIndexEntry oldEntry = (plasmaWordIndexEntry) container.put(entry.getUrlHash(), entry);
+ plasmaWordIndexEntryInstance oldEntry = (plasmaWordIndexEntryInstance) container.put(entry.getUrlHash(), entry);
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container
container.put(entry.getUrlHash(), oldEntry); // put it back
return false;
@@ -145,16 +145,16 @@ public final class plasmaWordIndexEntryContainer {
return container.containsKey(urlHash);
}
- public plasmaWordIndexEntry get(String urlHash) {
- return (plasmaWordIndexEntry) container.get(urlHash);
+ public plasmaWordIndexEntryInstance get(String urlHash) {
+ return (plasmaWordIndexEntryInstance) container.get(urlHash);
}
- public plasmaWordIndexEntry[] getEntryArray() {
- return (plasmaWordIndexEntry[]) container.values().toArray();
+ public plasmaWordIndexEntryInstance[] getEntryArray() {
+ return (plasmaWordIndexEntryInstance[]) container.values().toArray(new plasmaWordIndexEntryInstance[container.size()]); // typed toArray: the no-arg toArray() yields Object[], and casting that to an entry array throws ClassCastException
}
- public plasmaWordIndexEntry remove(String urlHash) {
- return (plasmaWordIndexEntry) container.remove(urlHash);
+ public plasmaWordIndexEntryInstance remove(String urlHash) {
+ return (plasmaWordIndexEntryInstance) container.remove(urlHash);
}
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
@@ -254,10 +254,10 @@ public final class plasmaWordIndexEntryContainer {
System.out.println("DEBUG: JOIN METHOD BY TEST");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator se = small.entries();
- plasmaWordIndexEntry ie0, ie1;
+ plasmaWordIndexEntryInstance ie0, ie1;
long stamp = System.currentTimeMillis();
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
- ie0 = (plasmaWordIndexEntry) se.next();
+ ie0 = (plasmaWordIndexEntryInstance) se.next();
ie1 = large.get(ie0.getUrlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
@@ -276,25 +276,25 @@ public final class plasmaWordIndexEntryContainer {
Iterator e2 = i2.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
- plasmaWordIndexEntry ie1;
- plasmaWordIndexEntry ie2;
- ie1 = (plasmaWordIndexEntry) e1.next();
- ie2 = (plasmaWordIndexEntry) e2.next();
+ plasmaWordIndexEntryInstance ie1;
+ plasmaWordIndexEntryInstance ie2;
+ ie1 = (plasmaWordIndexEntryInstance) e1.next();
+ ie2 = (plasmaWordIndexEntryInstance) e2.next();
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = i1.ordering.compare(ie1.getUrlHash(), ie2.getUrlHash());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) {
- if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
+ if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break;
} else if (c > 0) {
- if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
+ if (e2.hasNext()) ie2 = (plasmaWordIndexEntryInstance) e2.next(); else break;
} else {
// we have found the same urls in different searches!
ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1);
- if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
- if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
+ if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break;
+ if (e2.hasNext()) ie2 = (plasmaWordIndexEntryInstance) e2.next(); else break;
}
}
}
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntryInstance.java
similarity index 61%
rename from source/de/anomic/plasma/plasmaWordIndexEntry.java
rename to source/de/anomic/plasma/plasmaWordIndexEntryInstance.java
index 0f2834269..9dc680d7c 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntry.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntryInstance.java
@@ -49,38 +49,21 @@
package de.anomic.plasma;
import java.util.Properties;
+
+import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
+import de.anomic.index.indexEntryPrototype;
import de.anomic.kelondro.kelondroBase64Order;
-public final class plasmaWordIndexEntry implements Cloneable {
+public final class plasmaWordIndexEntryInstance extends indexEntryPrototype implements Cloneable, indexEntry {
// an wordEntry can be filled in either of two ways:
// by the discrete values of the entry
// or by the encoded entry-string
- // the size of the index entry attributes
- public static final int attrSpace = 24;
-
- // the associated hash
- private final String urlHash;
-
- // discrete values
- private int hitcount; // number of this words in file
- private int wordcount; // number of all words in the file
- private int phrasecount; // number of all phrases in the file
- private int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position
- private int posinphrase; // position within a phrase of the word
- private int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text
- private int worddistance;// distance between the words, only used if the index is artificial (from a conjunction)
- private long lastModified;// calculated by using last-modified
- private int quality; // result of a heuristic on the source file
- private byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only
- private char doctype; // type of source
- private char localflag; // indicates if the index was created locally
-
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
- public plasmaWordIndexEntry(String urlHash,
+ public plasmaWordIndexEntryInstance(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int titleLength, // length of description/length (longer are better?)
@@ -122,7 +105,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL;
}
- public plasmaWordIndexEntry(String urlHash, String code) {
+ public plasmaWordIndexEntryInstance(String urlHash, String code) {
// the code is not parsed but used later on
this.urlHash = urlHash;
this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8));
@@ -142,7 +125,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
if (phrasecount == 0) phrasecount = 100;
}
- public plasmaWordIndexEntry(String external) {
+ public plasmaWordIndexEntryInstance(String external) {
// parse external form
String[] elts = external.substring(1, external.length() - 1).split(",");
Properties pr = new Properties();
@@ -167,13 +150,18 @@ public final class plasmaWordIndexEntry implements Cloneable {
}
public Object clone() {
- return new plasmaWordIndexEntry(this.toExternalForm());
+ return new plasmaWordIndexEntryInstance(this.toPropertyForm());
}
- public String toEncodedForm() {
+ public static int encodedStringFormLength() {
+ // the size of the index entry attributes when encoded to string
+ return 24;
+ }
+
+ public String toEncodedStringForm() {
// attention: this integrates NOT the URL hash into the encoding
// if you need a complete dump, use toExternalForm()
- StringBuffer buf = new StringBuffer(attrSpace);
+ StringBuffer buf = new StringBuffer(encodedStringFormLength());
buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
@@ -191,7 +179,16 @@ public final class plasmaWordIndexEntry implements Cloneable {
return buf.toString();
}
- public String toExternalForm() {
+ public static int encodedByteArrayFormLength() {
+ // the size of the index entry attributes when encoded to a byte array
+ return encodedStringFormLength();
+ }
+
+ public byte[] toEncodedByteArrayForm() {
+ return toEncodedStringForm().getBytes();
+ }
+
+ public String toPropertyForm() {
StringBuffer str = new StringBuffer(61);
str.append("{")
@@ -213,93 +210,6 @@ public final class plasmaWordIndexEntry implements Cloneable {
return str.toString();
}
- public void combineDistance(plasmaWordIndexEntry oe) {
- this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
- this.posintext = Math.min(this.posintext, oe.posintext);
- if (this.posofphrase != oe.posofphrase) this.posinphrase = 0; // (unknown)
- this.posofphrase = Math.min(this.posofphrase, oe.posofphrase);
- this.wordcount = (this.wordcount + oe.wordcount) / 2;
- }
-
- public void min(plasmaWordIndexEntry other) {
- if (this.hitcount > other.hitcount) this.hitcount = other.hitcount;
- if (this.wordcount > other.wordcount) this.wordcount = other.wordcount;
- if (this.phrasecount > other.phrasecount) this.phrasecount = other.phrasecount;
- if (this.posintext > other.posintext) this.posintext = other.posintext;
- if (this.posinphrase > other.posinphrase) this.posinphrase = other.posinphrase;
- if (this.posofphrase > other.posofphrase) this.posofphrase = other.posofphrase;
- if (this.worddistance > other.worddistance) this.worddistance = other.worddistance;
- if (this.lastModified > other.lastModified) this.lastModified = other.lastModified;
- if (this.quality > other.quality) this.quality = other.quality;
- }
-
- public void max(plasmaWordIndexEntry other) {
- if (this.hitcount < other.hitcount) this.hitcount = other.hitcount;
- if (this.wordcount < other.wordcount) this.wordcount = other.wordcount;
- if (this.phrasecount < other.phrasecount) this.phrasecount = other.phrasecount;
- if (this.posintext < other.posintext) this.posintext = other.posintext;
- if (this.posinphrase < other.posinphrase) this.posinphrase = other.posinphrase;
- if (this.posofphrase < other.posofphrase) this.posofphrase = other.posofphrase;
- if (this.worddistance < other.worddistance) this.worddistance = other.worddistance;
- if (this.lastModified < other.lastModified) this.lastModified = other.lastModified;
- if (this.quality < other.quality) this.quality = other.quality;
- }
-
- public void normalize(plasmaWordIndexEntry min, plasmaWordIndexEntry max) {
- this.hitcount = (this.hitcount == 0) ? 0 : 1 + 255 * (this.hitcount - min.hitcount ) / (1 + max.hitcount - min.hitcount);
- this.wordcount = (this.wordcount == 0) ? 0 : 1 + 255 * (this.wordcount - min.wordcount ) / (1 + max.wordcount - min.wordcount);
- this.phrasecount = (this.phrasecount == 0) ? 0 : 1 + 255 * (this.phrasecount - min.phrasecount ) / (1 + max.phrasecount - min.phrasecount);
- this.posintext = (this.posintext == 0) ? 0 : 1 + 255 * (this.posintext - min.posintext ) / (1 + max.posintext - min.posintext);
- this.posinphrase = (this.posinphrase == 0) ? 0 : 1 + 255 * (this.posinphrase - min.posinphrase ) / (1 + max.posinphrase - min.posinphrase);
- this.posofphrase = (this.posofphrase == 0) ? 0 : 1 + 255 * (this.posofphrase - min.posofphrase ) / (1 + max.posofphrase - min.posofphrase);
- this.worddistance = (this.worddistance == 0) ? 0 : 1 + 255 * (this.worddistance - min.worddistance) / (1 + max.worddistance - min.worddistance);
- this.lastModified = (this.lastModified == 0) ? 0 : 1 + 255 * (this.lastModified - min.lastModified) / (1 + max.lastModified - min.lastModified);
- this.quality = (this.quality == 0) ? 0 : 1 + 255 * (this.quality - min.quality ) / (1 + max.quality - min.quality);
- }
-
- public plasmaWordIndexEntry generateNormalized(plasmaWordIndexEntry min, plasmaWordIndexEntry max) {
- plasmaWordIndexEntry e = (plasmaWordIndexEntry) this.clone();
- e.normalize(min, max);
- return e;
- }
-
- public String getUrlHash() { return urlHash; }
- public int getQuality() { return quality; }
- public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); }
- public long getLastModified() { return lastModified; }
- public int hitcount() { return hitcount; }
- public int posintext() { return posintext; }
- public int posinphrase() { return posinphrase; }
- public int posofphrase() { return posofphrase; }
- public int worddistance() { return worddistance; }
- public int wordcount() { return wordcount; }
- public int phrasecount() { return phrasecount; }
- public String getLanguage() { return new String(language); }
- public char getType() { return doctype; }
- public boolean isLocal() { return localflag == indexEntryAttribute.LT_LOCAL; }
-
- public boolean isNewer(plasmaWordIndexEntry other) {
- if (other == null) return true;
- if (this.lastModified > other.lastModified) return true;
- if (this.lastModified == other.getLastModified()) {
- if (this.quality > other.quality) return true;
- }
- return false;
- }
-
- public boolean isOlder(plasmaWordIndexEntry other) {
- if (other == null) return false;
- if (this.lastModified < other.getLastModified()) return true;
- if (this.lastModified == other.getLastModified()) {
- if (this.quality < other.quality) return true;
- }
- return false;
- }
-
- public int domlengthNormalized() {
- return 255 * plasmaURL.domLengthEstimation(this.urlHash) / 30;
- }
-
public static void main(String[] args) {
// outputs the word hash to a given word
if (args.length != 1) System.exit(0);
diff --git a/source/de/anomic/plasma/plasmaWordIndexInterface.java b/source/de/anomic/plasma/plasmaWordIndexInterface.java
index dc47838a3..7471a6d31 100644
--- a/source/de/anomic/plasma/plasmaWordIndexInterface.java
+++ b/source/de/anomic/plasma/plasmaWordIndexInterface.java
@@ -55,7 +55,7 @@ public interface plasmaWordIndexInterface {
public plasmaWordIndexEntryContainer deleteContainer(String wordHash);
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete);
- public boolean addEntry(String wordHash, plasmaWordIndexEntry entry, long updateTime, boolean dhtCase);
+ public boolean addEntry(String wordHash, plasmaWordIndexEntryInstance entry, long updateTime, boolean dhtCase);
public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime, boolean dhtCase);
public void close(int waitingSeconds);
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 7afa347b2..fc856c3c2 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -59,7 +59,7 @@ import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryInstance;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSearchTimingProfile;
@@ -483,10 +483,10 @@ public final class yacyClient {
urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
- final plasmaWordIndexEntry entry;
+ final plasmaWordIndexEntryInstance entry;
if (urlEntry.word() == null) {
// the old way to define words
- entry = new plasmaWordIndexEntry(
+ entry = new plasmaWordIndexEntryInstance(
urlEntry.hash(),
urlLength, urlComps,
urlEntry.descr().length(),
@@ -513,7 +513,7 @@ public final class yacyClient {
}
// add the url entry to the word indexes
for (int m = 0; m < words; m++) {
- container[m].add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis());
+ container[m].add(new plasmaWordIndexEntryInstance[]{entry}, System.currentTimeMillis());
}
}
@@ -881,11 +881,11 @@ public final class yacyClient {
// check if we got all necessary urls in the urlCache (only for debugging)
Iterator eenum;
- plasmaWordIndexEntry entry;
+ plasmaWordIndexEntryInstance entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
while (eenum.hasNext()) {
- entry = (plasmaWordIndexEntry) eenum.next();
+ entry = (plasmaWordIndexEntryInstance) eenum.next();
if (urlCache.get(entry.getUrlHash()) == null) {
yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache");
}
@@ -961,13 +961,13 @@ public final class yacyClient {
int indexcount = 0;
final StringBuffer entrypost = new StringBuffer(indexes.length*73);
Iterator eenum;
- plasmaWordIndexEntry entry;
+ plasmaWordIndexEntryInstance entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
while (eenum.hasNext()) {
- entry = (plasmaWordIndexEntry) eenum.next();
+ entry = (plasmaWordIndexEntryInstance) eenum.next();
entrypost.append(indexes[i].wordHash())
- .append(entry.toExternalForm())
+ .append(entry.toPropertyForm())
.append(serverCore.crlfString);
indexcount++;
}
diff --git a/source/yacy.java b/source/yacy.java
index 9394c5bc7..4f54f22d3 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -85,7 +85,7 @@ import de.anomic.plasma.plasmaWordIndexAssortment;
import de.anomic.plasma.plasmaWordIndexAssortmentCluster;
import de.anomic.plasma.plasmaWordIndexClassicDB;
import de.anomic.plasma.plasmaWordIndexEntity;
-import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryInstance;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
@@ -859,7 +859,7 @@ public final class yacy {
// the combined container will fit, read the container
Iterator importWordIdxEntries = newContainer.entries();
- plasmaWordIndexEntry importWordIdxEntry;
+ plasmaWordIndexEntryInstance importWordIdxEntry;
while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted
@@ -867,7 +867,7 @@ public final class yacy {
// getting next word index entry
entryCounter++;
- importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
+ importWordIdxEntry = (plasmaWordIndexEntryInstance) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.getUrlHash();
if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try {
// importing the new url
@@ -970,9 +970,9 @@ public final class yacy {
// the combined container will fit, read the container
Iterator wordIdxEntries = wordIdxContainer.entries();
- plasmaWordIndexEntry wordIdxEntry;
+ plasmaWordIndexEntryInstance wordIdxEntry;
while (wordIdxEntries.hasNext()) {
- wordIdxEntry = (plasmaWordIndexEntry) wordIdxEntries.next();
+ wordIdxEntry = (plasmaWordIndexEntryInstance) wordIdxEntries.next();
String urlHash = wordIdxEntry.getUrlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);