Replaced indexURLEntry with a new class that uses a kelondroRow.Entry object

to store the index entry. This is another step to move to the new database structure.
A side effect of this change is that index storage uses much less RAM space,
which affects the index RAM cache.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2341 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 0b7112f8b2
commit c4e922885a

@ -59,12 +59,12 @@ import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.index.indexURLEntry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@ -154,7 +154,7 @@ public class IndexControl_p {
int i = 0;
urlx = new String[index.size()];
while (en.hasNext()) {
urlx[i++] = ((indexURLEntry) en.next()).urlHash();
urlx[i++] = ((indexEntry) en.next()).urlHash();
}
index = null;
}
@ -257,20 +257,20 @@ public class IndexControl_p {
Iterator urlIter = index.entries();
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
indexURLEntry indexEntry;
indexEntry iEntry;
plasmaCrawlLURL.Entry lurl;
while (urlIter.hasNext()) {
indexEntry = (indexURLEntry) urlIter.next();
iEntry = (indexEntry) urlIter.next();
try {
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.urlHash(), null);
lurl = switchboard.urlPool.loadedURL.getEntry(iEntry.urlHash(), null);
if (lurl.toString() == null) {
unknownURLEntries.add(indexEntry.urlHash());
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
} else {
knownURLs.put(indexEntry.urlHash(), lurl);
knownURLs.put(iEntry.urlHash(), lurl);
}
} catch (IOException e) {
unknownURLEntries.add(indexEntry.urlHash());
unknownURLEntries.add(iEntry.urlHash());
}
}
// use whats remaining
@ -439,9 +439,9 @@ public class IndexControl_p {
int i = 0;
final TreeMap tm = new TreeMap();
indexURLEntry xi;
indexEntry xi;
while (en.hasNext()) {
xi = (indexURLEntry) en.next();
xi = (indexEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
try {
us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString();

@ -51,8 +51,9 @@ import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.index.indexURLEntry;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -132,7 +133,7 @@ public final class transferRWI {
int p;
String wordHash;
String urlHash;
indexURLEntry entry;
indexEntry iEntry;
int wordhashesSize = v.size();
final HashSet unknownURL = new HashSet();
String[] wordhashes = new String[v.size()];
@ -145,11 +146,11 @@ public final class transferRWI {
if (p > 0) {
wordHash = estring.substring(0, p);
wordhashes[received] = wordHash;
entry = new indexURLEntry(estring.substring(p));
sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true);
iEntry = new indexURLEntryNew(estring.substring(p));
sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true);
serverCore.checkInterruption();
urlHash = entry.urlHash();
urlHash = iEntry.urlHash();
try {
if ((!(unknownURL.contains(urlHash))) &&
(!(sb.urlPool.loadedURL.exists(urlHash)))) {

@ -1,153 +0,0 @@
// indexAbstractEntry.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 20.05.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.index;
import de.anomic.plasma.plasmaWordIndex;
// Base implementation of indexEntry that keeps all ranking attributes of a
// single index entry in plain fields and provides the arithmetic on them
// (combination, min/max folding, normalization) and the attribute getters.
// The encoding methods are left to the concrete subclass.
public abstract class indexAbstractEntry implements indexEntry {

    // the associated hash
    protected String urlHash;

    // discrete values
    protected int    hitcount;    // number of this words in file
    protected int    wordcount;   // number of all words in the file
    protected int    phrasecount; // number of all phrases in the file
    protected int    posintext;   // first position of the word in text as number of word; 0=unknown or irrelevant position
    protected int    posinphrase; // position within a phrase of the word
    protected int    posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text
    protected int    worddistance;// distance between the words, only used if the index is artificial (from a conjunction)
    protected long   lastModified;// calculated by using last-modified
    protected int    quality;     // result of a heuristic on the source file
    protected byte[] language;    // essentially the country code (the TLD as heuristic), two letters lowercase only
    protected char   doctype;     // type of source
    protected char   localflag;   // indicates if the index was created locally

    public abstract Object clone();

    public abstract String toEncodedStringForm();

    public abstract byte[] toEncodedByteArrayForm();

    public abstract String toPropertyForm();

    // Merges the position attributes of another entry into this one:
    // the word distance grows by the other entry's distance plus the gap
    // between both text positions, positions are set to the smaller value
    // and the word count becomes the (integer) mean of both counts.
    // The argument is cast and must therefore be an indexAbstractEntry.
    public void combineDistance(indexEntry oe) {
        indexAbstractEntry o = (indexAbstractEntry) oe;
        this.worddistance = this.worddistance + o.worddistance + Math.abs(this.posintext - o.posintext);
        this.posintext = Math.min(this.posintext, o.posintext);
        if (this.posofphrase != o.posofphrase) this.posinphrase = 0; // (unknown)
        this.posofphrase = Math.min(this.posofphrase, o.posofphrase);
        this.wordcount = (this.wordcount + o.wordcount) / 2;
    }

    // Lowers every numeric attribute of this entry to the other entry's value
    // where the other value is smaller.
    // The argument is cast and must therefore be an indexAbstractEntry.
    public void min(indexEntry other) {
        indexAbstractEntry o = (indexAbstractEntry) other;
        if (this.hitcount     > o.hitcount)     this.hitcount     = o.hitcount;
        if (this.wordcount    > o.wordcount)    this.wordcount    = o.wordcount;
        if (this.phrasecount  > o.phrasecount)  this.phrasecount  = o.phrasecount;
        if (this.posintext    > o.posintext)    this.posintext    = o.posintext;
        if (this.posinphrase  > o.posinphrase)  this.posinphrase  = o.posinphrase;
        if (this.posofphrase  > o.posofphrase)  this.posofphrase  = o.posofphrase;
        if (this.worddistance > o.worddistance) this.worddistance = o.worddistance;
        if (this.lastModified > o.lastModified) this.lastModified = o.lastModified;
        if (this.quality      > o.quality)      this.quality      = o.quality;
    }

    // Raises every numeric attribute of this entry to the other entry's value
    // where the other value is larger.
    // The argument is cast and must therefore be an indexAbstractEntry.
    public void max(indexEntry other) {
        indexAbstractEntry o = (indexAbstractEntry) other;
        if (this.hitcount     < o.hitcount)     this.hitcount     = o.hitcount;
        if (this.wordcount    < o.wordcount)    this.wordcount    = o.wordcount;
        if (this.phrasecount  < o.phrasecount)  this.phrasecount  = o.phrasecount;
        if (this.posintext    < o.posintext)    this.posintext    = o.posintext;
        if (this.posinphrase  < o.posinphrase)  this.posinphrase  = o.posinphrase;
        if (this.posofphrase  < o.posofphrase)  this.posofphrase  = o.posofphrase;
        if (this.worddistance < o.worddistance) this.worddistance = o.worddistance;
        if (this.lastModified < o.lastModified) this.lastModified = o.lastModified;
        if (this.quality      < o.quality)      this.quality      = o.quality;
    }

    // Rescales every attribute in place relative to the given bounds:
    // a value of 0 stays 0, any other value v becomes
    // 1 + 255 * (v - min) / (1 + max - min), computed with integer division.
    // Both arguments are cast and must therefore be indexAbstractEntry objects.
    public void normalize(indexEntry mi, indexEntry ma) {
        indexAbstractEntry min = (indexAbstractEntry) mi;
        indexAbstractEntry max = (indexAbstractEntry) ma;
        this.hitcount     = (this.hitcount     == 0) ? 0 : 1 + 255 * (this.hitcount     - min.hitcount    ) / (1 + max.hitcount     - min.hitcount);
        this.wordcount    = (this.wordcount    == 0) ? 0 : 1 + 255 * (this.wordcount    - min.wordcount   ) / (1 + max.wordcount    - min.wordcount);
        this.phrasecount  = (this.phrasecount  == 0) ? 0 : 1 + 255 * (this.phrasecount  - min.phrasecount ) / (1 + max.phrasecount  - min.phrasecount);
        this.posintext    = (this.posintext    == 0) ? 0 : 1 + 255 * (this.posintext    - min.posintext   ) / (1 + max.posintext    - min.posintext);
        this.posinphrase  = (this.posinphrase  == 0) ? 0 : 1 + 255 * (this.posinphrase  - min.posinphrase ) / (1 + max.posinphrase  - min.posinphrase);
        this.posofphrase  = (this.posofphrase  == 0) ? 0 : 1 + 255 * (this.posofphrase  - min.posofphrase ) / (1 + max.posofphrase  - min.posofphrase);
        this.worddistance = (this.worddistance == 0) ? 0 : 1 + 255 * (this.worddistance - min.worddistance) / (1 + max.worddistance - min.worddistance);
        this.lastModified = (this.lastModified == 0) ? 0 : 1 + 255 * (this.lastModified - min.lastModified) / (1 + max.lastModified - min.lastModified);
        this.quality      = (this.quality      == 0) ? 0 : 1 + 255 * (this.quality      - min.quality     ) / (1 + max.quality      - min.quality);
    }

    // Returns a normalized clone of this entry; this entry itself is not changed.
    public indexEntry generateNormalized(indexEntry min, indexEntry max) {
        indexEntry e = (indexAbstractEntry) this.clone();
        e.normalize(min, max);
        return e;
    }

    public String urlHash() { return urlHash; }
    public int quality() { return quality; }
    public int virtualAge() { return plasmaWordIndex.microDateDays(lastModified); }
    public long lastModified() { return lastModified; }
    public int hitcount() { return hitcount; }
    public int posintext() { return posintext; }
    public int posinphrase() { return posinphrase; }
    public int posofphrase() { return posofphrase; }
    public int worddistance() { return worddistance; }
    public int wordcount() { return wordcount; }
    public int phrasecount() { return phrasecount; }
    public String getLanguage() { return new String(language); }
    public char getType() { return doctype; }
    public boolean isLocal() { return localflag == indexEntryAttribute.LT_LOCAL; }

    // Returns true if this entry has a later modification date than the other
    // entry, or the same date and a higher quality. A null argument counts as older.
    // The comparison only uses the indexEntry accessors, so it works with any
    // implementation of the interface and not only with indexAbstractEntry.
    public boolean isNewer(indexEntry other) {
        if (other == null) return true;
        final long otherModified = other.lastModified();
        if (this.lastModified > otherModified) return true;
        return (this.lastModified == otherModified) && (this.quality > other.quality());
    }

    // Returns true if this entry has an earlier modification date than the other
    // entry, or the same date and a lower quality. A null argument never makes
    // this entry older.
    // The comparison only uses the indexEntry accessors, so it works with any
    // implementation of the interface and not only with indexAbstractEntry.
    public boolean isOlder(indexEntry other) {
        if (other == null) return false;
        final long otherModified = other.lastModified();
        if (this.lastModified < otherModified) return true;
        return (this.lastModified == otherModified) && (this.quality < other.quality());
    }

    // Scales the domain length estimation of the url hash by 255 / 30 (integer arithmetic).
    public int domlengthNormalized() {
        return 255 * indexURL.domLengthEstimation(this.urlHash) / 30;
    }

    public static void main(String[] args) {
        // outputs the word hash to a given word
        if (args.length != 1) System.exit(0);
        System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0]));
    }

}

@ -31,8 +31,7 @@ import de.anomic.kelondro.kelondroRow;
public interface indexEntry {
public Object clone();
public String toEncodedStringForm();
public byte[] toEncodedByteArrayForm(); // shall be replaced by toKelondroEntry()
public byte[] toEncodedByteArrayForm(boolean includeUrlHash); // shall be replaced by toKelondroEntry()
public String toPropertyForm();
public kelondroRow.Entry toKelondroEntry();

@ -103,7 +103,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
String wordHash;
indexTreeMapContainer container;
long updateTime;
indexURLEntry wordEntry;
indexEntry iEntry;
kelondroRow.Entry row = dumpArray.row().newEntry();
// write kCache, this will be melted with the wCache upon load
@ -116,12 +116,12 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
if (container != null) {
Iterator ci = container.entries();
while (ci.hasNext()) {
wordEntry = (indexURLEntry) ci.next();
iEntry = (indexEntry) ci.next();
row.setCol(0, container.getWordHash().getBytes());
row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4));
row.setCol(2, kelondroNaturalOrder.encodeLong(container.updated(), 8));
row.setCol(3, wordEntry.urlHash().getBytes());
row.setCol(4, wordEntry.toEncodedStringForm().getBytes());
row.setCol(3, iEntry.urlHash().getBytes());
row.setCol(4, iEntry.toEncodedByteArrayForm(false));
dumpArray.set((int) urlcount++, row);
}
}
@ -145,12 +145,12 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
if (container != null) {
Iterator ci = container.entries();
while (ci.hasNext()) {
wordEntry = (indexURLEntry) ci.next();
iEntry = (indexEntry) ci.next();
row.setCol(0, wordHash.getBytes());
row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4));
row.setCol(2, kelondroNaturalOrder.encodeLong(updateTime, 8));
row.setCol(3, wordEntry.urlHash().getBytes());
row.setCol(4, wordEntry.toEncodedStringForm().getBytes());
row.setCol(3, iEntry.urlHash().getBytes());
row.setCol(4, iEntry.toEncodedByteArrayForm(false));
dumpArray.set((int) urlcount++, row);
}
}
@ -184,7 +184,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
int i = dumpArray.size();
String wordHash;
//long creationTime;
indexURLEntry wordEntry;
indexEntry wordEntry;
kelondroRow.Entry row;
//Runtime rt = Runtime.getRuntime();
while (i-- > 0) {
@ -193,7 +193,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
if ((row == null) || (row.empty(0)) || (row.empty(3)) || (row.empty(4))) continue;
wordHash = row.getColString(0, "UTF-8");
//creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new indexURLEntry(row.getColString(3, "UTF-8"), row.getColString(4, "UTF-8"));
wordEntry = new indexURLEntryNew(row.getColString(3, null), row.getColString(4, null));
// store to cache
addEntry(wordHash, wordEntry, startTime, false);
urlCount++;

@ -27,6 +27,7 @@
package de.anomic.index;
import java.lang.reflect.Method;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Set;
@ -78,12 +79,34 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain
}
public int add(indexContainer c, long maxTime) {
// TODO Auto-generated method stub
return 0;
// returns the number of new elements
long startTime = System.currentTimeMillis();
if (c == null) return 0;
int x = 0;
synchronized (c) {
Iterator i = c.entries();
while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) {
try {
if (addi((indexEntry) i.next())) x++;
} catch (ConcurrentModificationException e) {}
}
}
this.lastTimeWrote = java.lang.Math.max(this.lastTimeWrote, c.updated());
return x;
}
private boolean addi(indexEntry entry) {
    // Stores the entry in this row set.
    // returns true if the new entry was added, false if it already existed
    // NOTE(review): assumes put() hands back the row it replaced, or null if
    // there was none - confirm against kelondroRowSet.put()
    de.anomic.kelondro.kelondroRow.Entry oldRow = this.put(entry.toKelondroEntry()); // FIXME: see if cloning is necessary
    if (oldRow == null) return true; // nothing was replaced, so the entry is new
    // wrap the replaced row only after the null check; a wrapper object is never null itself
    indexEntry oldEntry = new indexURLEntryNew(oldRow);
    if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container
        this.put(oldEntry.toKelondroEntry()); // put it back
    }
    return false;
}
public boolean contains(String urlHash) {
// TODO Auto-generated method stub
// TODO Auto-generated method stub
return false;
}

@ -130,7 +130,7 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen
Iterator i = c.entries();
while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) {
try {
if (addi((indexURLEntry) i.next())) x++;
if (addi((indexEntry) i.next())) x++;
} catch (ConcurrentModificationException e) {}
}
}
@ -140,7 +140,7 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen
private boolean addi(indexEntry entry) {
// returns true if the new entry was added, false if it already existed
indexURLEntry oldEntry = (indexURLEntry) container.put(entry.urlHash(), entry);
indexEntry oldEntry = (indexEntry) container.put(entry.urlHash(), entry);
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container
container.put(entry.urlHash(), oldEntry); // put it back
return false;
@ -153,15 +153,15 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen
}
public indexEntry get(String urlHash) {
return (indexURLEntry) container.get(urlHash);
return (indexEntry) container.get(urlHash);
}
public indexEntry[] getEntryArray() {
    // Returns all entries of this container as a typed array.
    // toArray() without an argument always produces an Object[], and casting
    // that to indexEntry[] fails with a ClassCastException at runtime;
    // passing a typed array makes the collection fill or allocate an indexEntry[].
    return (indexEntry[]) container.values().toArray(new indexEntry[container.size()]);
}
public indexEntry remove(String urlHash) {
return (indexURLEntry) container.remove(urlHash);
return (indexEntry) container.remove(urlHash);
}
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
@ -290,25 +290,25 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen
Iterator e2 = i2.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
indexURLEntry ie1;
indexURLEntry ie2;
ie1 = (indexURLEntry) e1.next();
ie2 = (indexURLEntry) e2.next();
indexEntry ie1;
indexEntry ie2;
ie1 = (indexEntry) e1.next();
ie2 = (indexEntry) e2.next();
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = i1.getOrdering().compare(ie1.urlHash(), ie2.urlHash());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) {
if (e1.hasNext()) ie1 = (indexURLEntry) e1.next(); else break;
if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break;
} else if (c > 0) {
if (e2.hasNext()) ie2 = (indexURLEntry) e2.next(); else break;
if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break;
} else {
// we have found the same urls in different searches!
ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1);
if (e1.hasNext()) ie1 = (indexURLEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (indexURLEntry) e2.next(); else break;
if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break;
}
}
}

@ -523,6 +523,10 @@ public class indexURL {
return 20;
}
public static int domLengthNormalized(String urlHash) {
    // scale the domain length estimation by 255 / 30 using integer arithmetic
    final int estimation = domLengthEstimation(urlHash);
    final int scaled = 255 * estimation;
    return scaled / 30;
}
public static final String oldurlHash(URL url) {
if (url == null) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(htmlFilterContentScraper.urlNormalform(url))).substring(0, urlHashLength);

@ -1,207 +0,0 @@
// indexURLEntry.java
// (C) 2004, 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 2004 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/*
This class defines the structures of an index entry for URLs
*/
package de.anomic.index;
import java.util.Properties;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexAbstractEntry;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaWordIndex;
public final class indexURLEntry extends indexAbstractEntry implements Cloneable, indexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public indexURLEntry(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int titleLength, // length of description/length (longer are better?)
int hitcount, //*how often appears this word in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
int posintext, //*position of word in all words
int posinphrase, //*position of word in its phrase
int posofphrase, //*number of the phrase where word appears
int distance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
int sizeOfPage, // # of bytes of the page
long lastmodified, //*last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, //*the entropy value
String language, //*(guessed) language of document
char doctype, //*type of document
int outlinksSame, // outlinks to same domain
int outlinksOther,// outlinks to other domain
boolean local //*flag shows that this index was generated locally; othervise its from a remote peer
) {
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: URL attributes
if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk";
this.urlHash = urlHash;
this.hitcount = hitcount;
this.wordcount = wordcount;
this.phrasecount = phrasecount;
this.posintext = posintext;
this.posinphrase = posinphrase;
this.posofphrase = posofphrase;
this.worddistance = distance;
this.lastModified = lastmodified;
this.quality = quality;
this.language = language.getBytes();
this.doctype = doctype;
this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL;
}
public indexURLEntry(String urlHash, String code) {
// the code is not parsed but used later on
this.urlHash = urlHash;
this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8));
this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6)));
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3));
this.language = code.substring(8, 10).getBytes();
this.doctype = code.charAt(10);
this.localflag = code.charAt(11);
this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0;
this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0;
this.posofphrase = (code.length() >= 17) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0;
this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0;
this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0;
this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0;
if (hitcount == 0) hitcount = 1;
if (wordcount == 0) wordcount = 1000;
if (phrasecount == 0) phrasecount = 100;
}
public indexURLEntry(String external) {
// parse external form
String[] elts = external.substring(1, external.length() - 1).split(",");
Properties pr = new Properties();
int p;
for (int i = 0; i < elts.length; i++) {
pr.put(elts[i].substring(0, (p = elts[i].indexOf("="))), elts[i].substring(p + 1));
}
// set values
this.urlHash = pr.getProperty("h", "");
this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A"));
this.wordcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("w", "__"));
this.phrasecount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("p", "__"));
this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__"));
this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__"));
this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__"));
this.worddistance = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("i", "__"));
this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A")));
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__"));
this.language = pr.getProperty("l", "uk").getBytes();
this.doctype = pr.getProperty("d", "u").charAt(0);
this.localflag = pr.getProperty("f", ""+indexEntryAttribute.LT_LOCAL).charAt(0);
}
public Object clone() {
return new indexURLEntry(this.toPropertyForm());
}
public static int encodedStringFormLength() {
// the size of the index entry attributes when encoded to string
return 24;
}
public String toEncodedStringForm() {
// attention: this integrates NOT the URL hash into the encoding
// if you need a complete dump, use toExternalForm()
StringBuffer buf = new StringBuffer(encodedStringFormLength());
buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2))
.append(new String(this.language))
.append(this.doctype)
.append(this.localflag)
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)); // 3+3+2+2+1+1+2+2+2+2+2+2= 24 bytes
return buf.toString();
}
public static int encodedByteArrayFormLength() {
// the size of the index entry attributes when encoded to string
return encodedStringFormLength();
}
public byte[] toEncodedByteArrayForm() {
return toEncodedStringForm().getBytes();
}
public Entry toKelondroEntry() {
kelondroRow.Entry entry = indexURLEntryNew.urlEntryRow.newEntry(toEncodedByteArrayForm());
return entry;
}
public String toPropertyForm() {
StringBuffer str = new StringBuffer(61);
str.append("{")
.append( "h=").append(this.urlHash)
.append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength))
.append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
.append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2))
.append(",l=").append(new String(this.language))
.append(",d=").append(this.doctype)
.append(",f=").append(this.localflag)
.append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
.append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
.append(",i=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2))
.append(",w=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2))
.append(",p=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2))
.append("}");
return str.toString();
}
public static void main(String[] args) {
// outputs the word hash to a given word
if (args.length != 1) System.exit(0);
System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0]));
}
}

@ -94,6 +94,7 @@ public class indexURLEntryNew implements Cloneable, indexEntry {
// - boolean: URL attributes
if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk";
this.entry = urlEntryRow.newEntry();
this.entry.setColString(col_urlhash, urlHash, null);
this.entry.setColLong(col_quality, quality);
this.entry.setColLong(col_lastModified, lastmodified);
@ -122,16 +123,27 @@ public class indexURLEntryNew implements Cloneable, indexEntry {
this.entry = urlEntryRow.newEntry(row);
}
public indexURLEntryNew(kelondroRow.Entry rentry) {
    // wraps the given row directly; the row object is shared with the caller, not copied
    // FIXME: see if cloning is necessary
    entry = rentry;
}
public Object clone() {
return new indexURLEntryNew(toEncodedByteArrayForm());
byte[] b = new byte[urlEntryRow.objectsize()];
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize());
return new indexURLEntryNew(b);
}
public String toEncodedStringForm() {
return new String(toEncodedByteArrayForm());
public static int encodedByteArrayFormLength(boolean includingHeader) {
    // number of bytes of the encoded entry: the full row size, or the row
    // size reduced by the length of the url hash if the header is excluded
    final int fullSize = urlEntryRow.objectsize();
    if (includingHeader) {
        return fullSize;
    }
    return fullSize - indexURL.urlHashLength;
}
public byte[] toEncodedByteArrayForm() {
return entry.bytes();
public byte[] toEncodedByteArrayForm(boolean includeHash) {
    // Returns the bytes of the underlying row; if includeHash is false,
    // a copy without the url hash is returned instead.
    if (includeHash) return entry.bytes();
    // cut off urlHashLength bytes (not the language length), which is the same
    // amount that encodedByteArrayFormLength(false) subtracts from the row size
    // NOTE(review): assumes the url hash column is stored at the start of the row - confirm against urlEntryRow
    final int hashLength = indexURL.urlHashLength;
    byte[] b = new byte[urlEntryRow.objectsize() - hashLength];
    System.arraycopy(entry.bytes(), hashLength, b, 0, b.length);
    return b;
}
public String toPropertyForm() {
@ -278,9 +290,9 @@ public class indexURLEntryNew implements Cloneable, indexEntry {
public boolean isOlder(indexEntry other) {
if (other == null) return false;
if (this.lastModified() < ((indexAbstractEntry) other).lastModified()) return true;
if (this.lastModified() == ((indexAbstractEntry) other).lastModified()) {
if (this.quality() < ((indexAbstractEntry) other).quality) return true;
if (this.lastModified() < other.lastModified()) return true;
if (this.lastModified() == other.lastModified()) {
if (this.quality() < other.quality()) return true;
}
return false;
}

@ -182,9 +182,15 @@ public class kelondroRow {
public Entry(byte[][] cols) {
rowinstance = new byte[objectsize];
for (int i = 0; i < objectsize; i++) this.rowinstance[i] = 0;
for (int i = 0; i < cols.length; i++) {
if (cols[i] != null) System.arraycopy(cols[i], 0, rowinstance, colstart[i], Math.min(cols[i].length, row[i].cellwidth()));
int ll;
for (int i = 0; i < row.length; i++) {
if ((i >= cols.length) || (cols[i] == null)) {
for (int j = 0; j < row[i].cellwidth(); j++) this.rowinstance[colstart[i] + j] = 0;
} else {
ll = Math.min(cols[i].length, row[i].cellwidth());
System.arraycopy(cols[i], 0, rowinstance, colstart[i], ll);
for (int j = ll; j < row[i].cellwidth(); j++) this.rowinstance[colstart[i] + j] = 0;
}
}
}

@ -7,11 +7,11 @@ import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.index.indexURLEntry;
import de.anomic.server.serverDate;
public class plasmaDbImporter extends AbstractImporter implements dbImporter {
@ -128,13 +128,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// loop through the entities of the container and get the
// urlhash
Iterator importWordIdxEntries = newContainer.entries();
indexURLEntry importWordIdxEntry;
indexEntry importWordIdxEntry;
while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted
if (isAborted()) break;
// getting next word index entry
importWordIdxEntry = (indexURLEntry) importWordIdxEntries.next();
importWordIdxEntry = (indexEntry) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.urlHash();
entityUrls.add(urlHash);
}

@ -66,8 +66,9 @@ import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroRow;
@ -161,7 +162,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public Entry getEntry(String hash, indexURLEntry searchedWord) throws IOException {
/**
 * Creates an Entry for the given url hash.
 *
 * @param hash         the url hash handed to the Entry constructor
 * @param searchedWord the index entry handed to the Entry constructor; callers
 *                     in this code base also pass null
 * @return a newly constructed Entry
 * @throws IOException if the Entry constructor fails
 */
public Entry getEntry(String hash, indexEntry searchedWord) throws IOException {
    final Entry loaded = new Entry(hash, searchedWord);
    return loaded;
}
@ -416,7 +417,7 @@ public final class plasmaCrawlLURL extends indexURL {
private int size;
private int wordCount;
private String snippet;
private indexURLEntry word; // this is only used if the url is transported via remote search requests
private indexEntry word; // this is only used if the url is transported via remote search requests
private boolean stored;
// more needed attributes:
@ -451,7 +452,7 @@ public final class plasmaCrawlLURL extends indexURL {
this.stored = false;
}
public Entry(String urlHash, indexURLEntry searchedWord) throws IOException {
public Entry(String urlHash, indexEntry searchedWord) throws IOException {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -466,13 +467,13 @@ public final class plasmaCrawlLURL extends indexURL {
this.stored = true;
}
public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
/**
 * Builds an Entry from an already available database row and marks it as
 * not stored.
 *
 * @param entry        the row to read the attributes from; must not be null
 * @param searchedWord the index entry to associate with this url, passed on
 *                     to insertEntry
 * @throws IOException if insertEntry fails
 */
public Entry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
    assert (entry != null);
    // Hand over the caller's argument; the field 'word' has not been assigned
    // at this point, so passing it would silently drop searchedWord.
    // NOTE(review): insertEntry's use of its second argument is not visible here - confirm.
    insertEntry(entry, searchedWord);
    this.stored = false;
}
private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
private void insertEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
try {
this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8").trim());
@ -522,7 +523,7 @@ public final class plasmaCrawlLURL extends indexURL {
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
this.word = (prop.containsKey("word")) ? new indexURLEntryNew(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
this.stored = false;
//}
} catch (Exception e) {
@ -659,7 +660,7 @@ public final class plasmaCrawlLURL extends indexURL {
return snippet;
}
public indexURLEntry word() {
/**
 * Returns the index entry attached to this url entry.
 *
 * @return the value of the word field; may be null
 */
public indexEntry word() {
    return this.word;
}

@ -48,8 +48,8 @@ import java.util.HashSet;
import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.serverCodings;
@ -191,7 +191,7 @@ public class plasmaDHTChunk {
Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, resourceLevel, true, maxcount).iterator();
indexContainer container;
Iterator urlIter;
indexURLEntry indexEntry;
indexEntry iEntry;
plasmaCrawlLURL.Entry lurl;
int refcount = 0;
int wholesize;
@ -208,29 +208,29 @@ public class plasmaDHTChunk {
urlIter = container.entries();
// iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (maxcount > refcount)) {
indexEntry = (indexURLEntry) urlIter.next();
iEntry = (indexEntry) urlIter.next();
try {
lurl = lurls.getEntry(indexEntry.urlHash(), indexEntry);
lurl = lurls.getEntry(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.url() == null)) {
yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + indexEntry.urlHash() + "' for word hash " + container.getWordHash());
yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntry(container.getWordHash(), indexEntry.urlHash(), true);
wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true);
} else {
urlCache.put(indexEntry.urlHash(), lurl);
yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + indexEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash());
urlCache.put(iEntry.urlHash(), lurl);
yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash());
refcount++;
}
} catch (IOException e) {
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntry(container.getWordHash(), indexEntry.urlHash(), true);
wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true);
}
}
// remove all remaining; we have enough
while (urlIter.hasNext()) {
indexEntry = (indexURLEntry) urlIter.next();
iEntry = (indexEntry) urlIter.next();
urlIter.remove();
}
@ -272,7 +272,7 @@ public class plasmaDHTChunk {
public int deleteTransferIndexes() {
Iterator urlIter;
indexURLEntry indexEntry;
indexEntry iEntry;
HashSet urlHashes;
int count = 0;
@ -282,8 +282,8 @@ public class plasmaDHTChunk {
urlHashes = new HashSet(this.indexContainers[i].size());
urlIter = this.indexContainers[i].entries();
while (urlIter.hasNext()) {
indexEntry = (indexURLEntry) urlIter.next();
urlHashes.add(indexEntry.urlHash());
iEntry = (indexEntry) urlIter.next();
urlHashes.add(iEntry.urlHash());
}
count += wordIndex.removeEntries(this.indexContainers[i].getWordHash(), urlHashes, true);
log.logFine("Deleted partial index (" + c + " URLs) for word " + this.indexContainers[i].getWordHash() + "; " + this.wordIndex.indexSize(indexContainers[i].getWordHash()) + " entries left");

@ -52,8 +52,8 @@ import de.anomic.server.logging.serverLog;
import de.anomic.server.serverInstantThread;
import de.anomic.yacy.yacySearch;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
public final class plasmaSearchEvent extends Thread implements Runnable {
@ -242,7 +242,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
//if (searchResult.size() == 0) return acc; // case that we have nothing to do
indexURLEntry entry;
indexEntry entry;
plasmaCrawlLURL.Entry page;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try {

@ -49,8 +49,8 @@ import java.util.Iterator;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.index.indexEntry;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBinSearch;
public final class plasmaSearchPreOrder {
@ -58,7 +58,7 @@ public final class plasmaSearchPreOrder {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true;
private indexURLEntry entryMin, entryMax;
private indexEntry entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
@ -118,36 +118,36 @@ public final class plasmaSearchPreOrder {
return pageAcc.size() > 0;
}
public indexURLEntry next() {
public indexEntry next() {
Object top = pageAcc.lastKey();
return (indexURLEntry) pageAcc.remove(top);
return (indexEntry) pageAcc.remove(top);
}
public void addContainer(indexTreeMapContainer container, long maxTime) {
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
indexURLEntry indexEntry;
indexEntry iEntry;
// first pass: find min/max to obtain limits for normalization
Iterator i = container.entries();
int count = 0;
while (i.hasNext()) {
if (System.currentTimeMillis() > limitTime) break;
indexEntry = (indexURLEntry) i.next();
if (entryMin == null) entryMin = (indexURLEntry) indexEntry.clone(); else entryMin.min(indexEntry);
if (entryMax == null) entryMax = (indexURLEntry) indexEntry.clone(); else entryMax.max(indexEntry);
iEntry = (indexEntry) i.next();
if (entryMin == null) entryMin = (indexEntry) iEntry.clone(); else entryMin.min(iEntry);
if (entryMax == null) entryMax = (indexEntry) iEntry.clone(); else entryMax.max(iEntry);
count++;
}
// second pass: normalize entries and get ranking
i = container.entries();
for (int j = 0; j < count; j++) {
indexEntry = (indexURLEntry) i.next();
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.urlHash(), indexEntry);
iEntry = (indexEntry) i.next();
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(entryMin, entryMax)), 16) + iEntry.urlHash(), iEntry);
}
}
public indexURLEntry[] getNormalizer() {
return new indexURLEntry[] {entryMin, entryMax};
/**
 * Returns the entries currently used as normalization bounds.
 *
 * @return a new two-element array holding entryMin at index 0 and entryMax
 *         at index 1; either element is null as long as it has not been set
 */
public indexEntry[] getNormalizer() {
    final indexEntry[] bounds = new indexEntry[2];
    bounds[0] = entryMin;
    bounds[1] = entryMax;
    return bounds;
}
public static int ybr_p(String urlHash) {

@ -47,7 +47,7 @@ import java.util.Map;
import java.util.Set;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURL;
public class plasmaSearchRankingProfile {
@ -164,18 +164,16 @@ public class plasmaSearchRankingProfile {
return new String(ext);
}
public long preRanking(indexEntry entry) {
public long preRanking(indexEntry normalizedEntry) {
// the normalizedEntry must be a normalized indexEntry
long ranking = 0;
if (entry instanceof indexURLEntry) {
indexURLEntry normalizedEntry = (indexURLEntry) entry;
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue();
ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue();
}
ranking += (255 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
return ranking;
}

@ -55,13 +55,13 @@ import java.net.MalformedURLException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
public final class plasmaSearchResult {
private indexURLEntry entryMin, entryMax;
private indexEntry entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@ -108,11 +108,11 @@ public final class plasmaSearchResult {
return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
}
protected void addResult(indexURLEntry indexEntry, plasmaCrawlLURL.Entry page) {
protected void addResult(indexEntry iEntry, plasmaCrawlLURL.Entry page) {
// make min/max for normalization
if (entryMin == null) entryMin = (indexURLEntry) indexEntry.clone(); else entryMin.min(indexEntry);
if (entryMax == null) entryMax = (indexURLEntry) indexEntry.clone(); else entryMax.max(indexEntry);
if (entryMin == null) entryMin = (indexEntry) iEntry.clone(); else entryMin.min(iEntry);
if (entryMax == null) entryMax = (indexEntry) iEntry.clone(); else entryMax.max(iEntry);
// take out relevant information for reference computation
URL url = page.url();
@ -122,7 +122,7 @@ public final class plasmaSearchResult {
String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// store everything
Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps};
Object[] resultVector = new Object[] {iEntry, page, urlcomps, descrcomps};
results.add(resultVector);
// add references
@ -140,18 +140,18 @@ public final class plasmaSearchResult {
for (int i = 0; i < references.length; i++) commonSense.add(references[i]);
Object[] resultVector;
indexURLEntry indexEntry;
indexEntry iEntry;
plasmaCrawlLURL.Entry page;
long ranking;
for (int i = 0; i < results.size(); i++) {
// take out values from result array
resultVector = (Object[]) results.get(i);
indexEntry = (indexURLEntry) resultVector[0];
iEntry = (indexEntry) resultVector[0];
page = (plasmaCrawlLURL.Entry) resultVector[1];
// calculate ranking
ranking = this.ranking.postRanking(
indexEntry,
iEntry,
query,
commonSense,
(String[]) resultVector[2],
@ -161,7 +161,7 @@ public final class plasmaSearchResult {
// insert value
//System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.urlHash(), page);
pageAcc.put(serverCodings.encodeHex(ranking, 16) + iEntry.urlHash(), page);
}
// flush memory

@ -130,10 +130,11 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools;
@ -1487,7 +1488,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = indexEntryAttribute.word2hash(word);
indexTreeMapContainer wordIdxContainer = new indexTreeMapContainer(wordHash);
indexURLEntry wordIdxEntry = new indexURLEntry(urlHash,
indexEntry wordIdxEntry = new indexURLEntryNew(urlHash,
urlLength, urlComps,
wordStat.count,
document.longTitle.length(),

@ -67,7 +67,7 @@ import de.anomic.index.indexRI;
import de.anomic.index.indexAbstractRI;
import de.anomic.index.indexRowSetContainer;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMergeIterator;
@ -251,7 +251,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
Iterator i = condenser.words();
Map.Entry wentry;
String word;
indexURLEntry ientry;
indexEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
int urlLength = url.toString().length();
@ -263,7 +263,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = indexEntryAttribute.word2hash(word);
ientry = new indexURLEntry(urlHash,
ientry = new indexURLEntryNew(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(),
wprop.count,
condenser.RESULT_SIMI_WORDS,
@ -529,11 +529,11 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
// the combined container will fit, read the container
try {
Iterator entries = entity.elements(true);
indexURLEntry entry;
indexEntry entry;
while (entries.hasNext()) {
entry = (indexURLEntry) entries.next();
entry = (indexEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexURLEntry[]{entry}, System.currentTimeMillis());
container.add(new indexEntry[]{entry}, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
@ -580,7 +580,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
public void run() {
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
indexContainer container = null;
indexURLEntry entry = null;
indexEntry entry = null;
URL url = null;
HashSet urlHashs = new HashSet();
try {
@ -592,7 +592,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
wordHashNow = container.getWordHash();
while (containerIterator.hasNext() && run) {
waiter();
entry = (indexURLEntry) containerIterator.next();
entry = (indexEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
try {

@ -57,9 +57,10 @@ import java.io.IOException;
import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroRow;
@ -74,7 +75,7 @@ public final class plasmaWordIndexAssortment {
4, // occurrence counter
8, // timestamp of last access
indexEntryAttribute.urlHashLength, // corresponding URL hash
indexURLEntry.encodedStringFormLength() // URL attributes
indexURLEntryNew.encodedByteArrayFormLength(false) // URL attributes
};
// class variables
@ -151,11 +152,11 @@ public final class plasmaWordIndexAssortment {
row.setColLongB256(1, 1);
row.setColLongB256(2, newContainer.updated());
Iterator entries = newContainer.entries();
indexURLEntry entry;
indexEntry entry;
for (int i = 0; i < assortmentLength; i++) {
entry = (indexURLEntry) entries.next();
entry = (indexEntry) entries.next();
row.setCol(3 + 2 * i, entry.urlHash().getBytes());
row.setCol(4 + 2 * i, entry.toEncodedStringForm().getBytes());
row.setCol(4 + 2 * i, entry.toEncodedByteArrayForm(false));
}
kelondroRow.Entry oldrow = null;
try {
@ -249,7 +250,7 @@ public final class plasmaWordIndexAssortment {
int al = assortmentCapacity(row.objectsize());
for (int i = 0; i < al; i++) {
container.add(
new indexURLEntry[] { new indexURLEntry(
new indexEntry[] { new indexURLEntryNew(
new String(row.getColBytes(3 + 2 * i)), new String(row.getColBytes(4 + 2 * i))) }, updateTime);
}
return container;

@ -59,7 +59,6 @@ import de.anomic.index.indexRI;
import de.anomic.index.indexAbstractRI;
import de.anomic.index.indexRowSetContainer;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroObjectCache;
import de.anomic.kelondro.kelondroRecords;
@ -167,7 +166,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl
c = new indexTreeMapContainer(newContainer.getWordHash());
for (int k = 0; k < j; k++) {
if (i.hasNext()) {
c.add((indexURLEntry) i.next(), newContainer.updated());
c.add((indexEntry) i.next(), newContainer.updated());
} else {
storeForced(c);
return;
@ -210,7 +209,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl
c = new indexTreeMapContainer(newContainer.getWordHash());
for (int k = 0; k <= j; k++) {
assert (i.hasNext());
c.add((indexURLEntry) i.next(), newContainer.updated());
c.add((indexEntry) i.next(), newContainer.updated());
}
storeForced(c);
}

@ -49,8 +49,9 @@ import java.io.IOException;
import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
@ -95,10 +96,10 @@ public final class plasmaWordIndexFile {
kt = new kelondroTree(theLocation, cacheSize, 0, kelondroTree.defaultObjectCachePercent);
} catch (IOException e) {
theLocation.delete();
kt = new kelondroTree(theLocation, cacheSize, 0, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, indexURLEntry.encodedStringFormLength(), false);
kt = new kelondroTree(theLocation, cacheSize, 0, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, indexURLEntryNew.encodedByteArrayFormLength(false), false);
} else {
// create new index file
kt = new kelondroTree(theLocation, cacheSize, 0, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, indexURLEntry.encodedStringFormLength(), false);
kt = new kelondroTree(theLocation, cacheSize, 0, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, indexURLEntryNew.encodedByteArrayFormLength(false), false);
}
return kt; // everyone who get this should close it when finished!
}
@ -137,27 +138,27 @@ public final class plasmaWordIndexFile {
} catch (IOException e) {}
}
public indexURLEntry getEntry(String urlhash) throws IOException {
public indexEntry getEntry(String urlhash) throws IOException {
kelondroRow.Entry n = theIndex.get(urlhash.getBytes());
if (n == null) return null;
return new indexURLEntry(n.getColString(0, null), n.getColString(1, null));
return new indexURLEntryNew(n.getColString(0, null), n.getColString(1, null));
}
/**
 * Checks whether the index holds a row for the given url hash.
 *
 * @param urlhash the url hash to look up
 * @return true if theIndex returns a non-null row for the hash
 * @throws IOException if the index lookup fails
 */
public boolean contains(String urlhash) throws IOException {
    final byte[] key = urlhash.getBytes();
    final boolean present = theIndex.get(key) != null;
    return present;
}
public boolean contains(indexURLEntry entry) throws IOException {
/**
 * Checks whether the index holds a row for the url hash of the given entry.
 *
 * @param entry the index entry whose url hash is looked up
 * @return true if theIndex returns a non-null row for that hash
 * @throws IOException if the index lookup fails
 */
public boolean contains(indexEntry entry) throws IOException {
    final byte[] key = entry.urlHash().getBytes();
    final boolean present = theIndex.get(key) != null;
    return present;
}
public boolean addEntry(indexURLEntry entry) throws IOException {
public boolean addEntry(indexEntry entry) throws IOException {
if (entry == null) return false;
indexURLEntry oldEntry = getEntry(entry.urlHash());
indexEntry oldEntry = getEntry(entry.urlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
return false;
}
return (theIndex.put(entry.urlHash().getBytes(), entry.toEncodedStringForm().getBytes()) == null);
return (theIndex.put(entry.urlHash().getBytes(), entry.toEncodedByteArrayForm(false)) == null);
}
public int addEntries(indexContainer container) throws IOException {
@ -172,7 +173,7 @@ public final class plasmaWordIndexFile {
if (container != null) {
Iterator i = container.entries();
while (i.hasNext()) {
if (addEntry((indexURLEntry) i.next())) count++;
if (addEntry((indexEntry) i.next())) count++;
}
}
@ -237,7 +238,7 @@ public final class plasmaWordIndexFile {
public Object next() {
if (i == null) return null;
kelondroRow.Entry n = (kelondroRow.Entry) i.next();
return new indexURLEntry(n.getColString(0, null), n.getColString(1, null));
return new indexURLEntryNew(n.getColString(0, null), n.getColString(1, null));
}
public void remove() {
throw new UnsupportedOperationException();
@ -257,7 +258,7 @@ public final class plasmaWordIndexFile {
long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time;
try {
while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) {
addEntry((indexURLEntry) i.next());
addEntry((indexEntry) i.next());
}
} catch (kelondroException e) {
serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage());

@ -51,10 +51,10 @@ import java.util.Set;
import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRI;
import de.anomic.index.indexAbstractRI;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
@ -231,10 +231,10 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index
if (plasmaWordIndexFile.wordHash2path(databaseRoot, wordHash).exists()) {
plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
indexTreeMapContainer container = new indexTreeMapContainer(wordHash);
indexURLEntry entry;
indexEntry entry;
Iterator i = entity.elements(true);
while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
entry = (indexURLEntry) i.next();
entry = (indexEntry) i.next();
container.add(entry);
}
return container;

@ -54,9 +54,10 @@ import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -484,10 +485,10 @@ public final class yacyClient {
urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final indexURLEntry entry;
final indexEntry entry;
if (urlEntry.word() == null) {
// the old way to define words
entry = new indexURLEntry(
entry = new indexURLEntryNew(
urlEntry.hash(),
urlLength, urlComps,
urlEntry.descr().length(),
@ -514,7 +515,7 @@ public final class yacyClient {
}
// add the url entry to the word indexes
for (int m = 0; m < words; m++) {
container[m].add(new indexURLEntry[]{entry}, System.currentTimeMillis());
container[m].add(new indexEntry[]{entry}, System.currentTimeMillis());
}
}
@ -888,11 +889,11 @@ public final class yacyClient {
// check if we got all necessary urls in the urlCache (only for debugging)
Iterator eenum;
indexURLEntry entry;
indexEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (indexURLEntry) eenum.next();
entry = (indexEntry) eenum.next();
if (urlCache.get(entry.urlHash()) == null) {
yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache");
}
@ -996,11 +997,11 @@ public final class yacyClient {
int indexcount = 0;
final StringBuffer entrypost = new StringBuffer(indexes.length*73);
Iterator eenum;
indexURLEntry entry;
indexEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (indexURLEntry) eenum.next();
entry = (indexEntry) eenum.next();
entrypost.append(indexes[i].getWordHash())
.append(entry.toPropertyForm())
.append(serverCore.crlfString);

@ -73,6 +73,7 @@ import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn;
@ -86,7 +87,6 @@ import de.anomic.plasma.plasmaURLPool;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexAssortmentCluster;
import de.anomic.plasma.plasmaWordIndexFile;
import de.anomic.index.indexURLEntry;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
@ -717,10 +717,10 @@ public final class yacy {
// the combined container will fit, read the container
Iterator wordIdxEntries = wordIdxContainer.entries();
indexURLEntry wordIdxEntry;
indexEntry iEntry;
while (wordIdxEntries.hasNext()) {
wordIdxEntry = (indexURLEntry) wordIdxEntries.next();
String urlHash = wordIdxEntry.urlHash();
iEntry = (indexEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);
urlCounter++;

Loading…
Cancel
Save