refactoring to prepare the new RWI entry object

- moved all URL and index (RWI) entry classes to the index package
- renamed classes to better distinguish RWI entries from URL entries (interface sketch below)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2937 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 6412c926bc
commit bb7d4b5d5e
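
Editor's note, for orientation only: the rename in this commit separates two kinds of entries that the hunks below keep touching. The sketch uses illustrative stand-in names; the actual interfaces in the diff are indexRWIEntry (a word-to-URL reference carrying ranking attributes) and indexURLEntry (metadata of a loaded document), and both carry more methods than shown here.

import java.util.Date;

// Hypothetical condensed view of the naming split introduced by this commit.
// Reverse-word-index (RWI) entry: one word -> URL reference with ranking attributes.
interface RWIEntrySketch {
    String urlHash();      // 12-character URL hash
    int posintext();       // position of the word in the text
    int worddistance();    // combined distance when results are merged
}

// Loaded-URL entry: metadata of a fetched document, addressed by URL hash.
interface URLEntrySketch {
    String hash();         // same 12-character URL hash
    Date moddate();        // last-modified date
    int wordCount();       // number of words, from the condenser
}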

@ -55,8 +55,8 @@ import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -147,10 +147,10 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
indexURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaParserDocument document = null;
if (urlentry != null) {
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
indexURLEntry.Components comp = urlentry.comp();
document = switchboard.snippetCache.retrieveDocument(comp.url(), true);
prop.put("mode_edit", 0); // create mode
prop.put("mode_url", comp.url().toNormalform());

@ -57,11 +57,11 @@ import java.util.TreeMap;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -161,7 +161,7 @@ public class IndexControl_p {
int i = 0;
urlx = new String[index.size()];
while (en.hasNext()) {
urlx[i++] = ((indexEntry) en.next()).urlHash();
urlx[i++] = ((indexRWIEntry) en.next()).urlHash();
}
index = null;
}
@ -218,7 +218,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
@ -263,10 +263,10 @@ public class IndexControl_p {
Iterator urlIter = index.entries();
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
indexEntry iEntry;
plasmaCrawlLURLEntry lurl;
indexRWIEntry iEntry;
indexURLEntry lurl;
while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next();
iEntry = (indexRWIEntry) urlIter.next();
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
if (lurl == null) {
unknownURLEntries.add(iEntry.urlHash());
@ -320,7 +320,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
@ -334,7 +334,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashsearch")) {
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
@ -348,12 +348,12 @@ public class IndexControl_p {
try {
final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>");
plasmaCrawlLURLEntry entry;
indexURLEntry entry;
int i = 0;
int rows = 0, cols = 0;
prop.put("urlhashsimilar", 1);
while (entryIt.hasNext() && i < 256) {
entry = (plasmaCrawlLURLEntry) entryIt.next();
entry = (indexURLEntry) entryIt.next();
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash());
cols++;
if (cols==8) {
@ -400,16 +400,16 @@ public class IndexControl_p {
return prop;
}
public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURLEntry entry, String urlhash) {
public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, indexURLEntry entry, String urlhash) {
serverObjects prop = new serverObjects();
if (entry == null) {
prop.put("genUrlProfile", 1);
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
plasmaCrawlLURLEntry.Components comp = entry.comp();
indexURLEntry.Components comp = entry.comp();
String referrer = null;
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
indexURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "<unknown>";
} else {
@ -453,11 +453,11 @@ public class IndexControl_p {
int i = 0;
final TreeMap tm = new TreeMap();
indexEntry xi;
indexRWIEntry xi;
while (en.hasNext()) {
xi = (indexEntry) en.next();
xi = (indexRWIEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
indexURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
if (le == null) {
tm.put(uh[0], uh);
} else {
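
Editor's sketch: one recurring pattern in the IndexControl_p hunks above is the consistency check that resolves every RWI entry's URL hash against the loaded-URL store and collects hashes without a stored entry as unknown. A standalone rendering of that partitioning, with a plain Map standing in for switchboard.urlPool.loadedURL.

import java.util.*;

// Collect URL hashes of RWI entries that have no corresponding loaded-URL entry.
public class RwiUrlCheckSketch {
    public static Set<String> unknownHashes(List<String> rwiUrlHashes,
                                            Map<String, Object> loadedUrlStore) {
        Set<String> unknown = new HashSet<>();
        for (String urlHash : rwiUrlHashes) {
            if (!loadedUrlStore.containsKey(urlHash)) unknown.add(urlHash);
        }
        return unknown;
    }

    public static void main(String[] args) {
        Map<String, Object> store = new HashMap<>();
        store.put("AAAAAAAAAAAA", new Object());
        System.out.println(unknownHashes(
                Arrays.asList("AAAAAAAAAAAA", "BBBBBBBBBBBB"), store)); // [BBBBBBBBBBBB]
    }
}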

@ -60,6 +60,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile;
@ -204,7 +205,7 @@ public class IndexCreate_p {
prop.put("error_reasonString", reasonString);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength));
crawlingStartURL.getHost(), reasonString, new bitfield(indexRWIEntryOld.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}
@ -282,7 +283,7 @@ public class IndexCreate_p {
c++;
} else {
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength));
(String) e.getValue(), rejectReason, new bitfield(indexRWIEntryOld.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}

@ -43,22 +43,33 @@
// javac -classpath .:../Classes Settings_p.java
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public class IndexMonitor {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
plasmaSwitchboard sb = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
int showIndexedCount = 40;
boolean si = false;
boolean se = false;
int lines = 40;
boolean showInit = false;
boolean showExec = false;
if (post == null) {
@ -67,20 +78,20 @@ public class IndexMonitor {
}
// find process number
int process;
int tabletype;
try {
process = Integer.parseInt(post.get("process", "0"));
tabletype = Integer.parseInt(post.get("process", "0"));
} catch (NumberFormatException e) {
process = 0;
tabletype = 0;
}
// check if authorization is needed and/or given
if (((process > 0) && (process < 6)) ||
if (((tabletype > 0) && (tabletype < 6)) ||
(post.containsKey("clearlist")) ||
(post.containsKey("deleteentry"))) {
String authorization = ((String) header.get("Authorization", "xxxxxx"));
if (authorization.length() != 0) {
if (! switchboard.verifyAuthentication(header, true)){
if (! sb.verifyAuthentication(header, true)){
// force log-in (again, because wrong password was given)
prop.put("AUTHENTICATE", "admin log-in");
return prop;
@ -94,33 +105,102 @@ public class IndexMonitor {
// custom number of lines
if (post.containsKey("count")) {
showIndexedCount = Integer.parseInt(post.get("count", "40"));
lines = Integer.parseInt(post.get("count", "40"));
}
// do the commands
if (post.containsKey("clearlist")) switchboard.urlPool.loadedURL.clearStack(process);
if (post.containsKey("clearlist")) sb.urlPool.loadedURL.clearStack(tabletype);
if (post.containsKey("deleteentry")) {
String hash = post.get("hash", null);
if (hash != null) {
// delete from database
switchboard.urlPool.loadedURL.remove(hash);
sb.urlPool.loadedURL.remove(hash);
}
}
if (post.containsKey("moreIndexed")) {
showIndexedCount = Integer.parseInt(post.get("showIndexed", "40"));
lines = Integer.parseInt(post.get("showIndexed", "40"));
}
if (post.get("si") != null) si = true;
if (post.get("se") != null) se = true;
if (post.get("si") != null) showInit = true;
if (post.get("se") != null) showExec = true;
// create table
if (process == 0) {
if (tabletype == 0) {
prop.put("table", 2);
} else if (sb.urlPool.loadedURL.getStackSize(tabletype) == 0) {
prop.put("table", 0);
} else {
prop.putAll(switchboard.urlPool.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true));
prop.put("table", 1);
if (lines > sb.urlPool.loadedURL.getStackSize(tabletype)) lines = sb.urlPool.loadedURL.getStackSize(tabletype);
if (lines == sb.urlPool.loadedURL.getStackSize(tabletype)) {
prop.put("table_size", 0);
} else {
prop.put("table_size", 1);
prop.put("table_size_count", lines);
}
prop.put("table_size_all", sb.urlPool.loadedURL.getStackSize(tabletype));
prop.put("table_feedbackpage", "IndexMonitor.html");
prop.put("table_tabletype", tabletype);
prop.put("table_showInit", (showInit) ? 1 : 0);
prop.put("table_showExec", (showExec) ? 1 : 0);
boolean dark = true;
String urlHash, initiatorHash, executorHash;
String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
indexURLEntry urle;
// needed for getCachePath(url)
final plasmaHTCache cacheManager = sb.getCacheManager();
int i, cnt = 0;
for (i = sb.urlPool.loadedURL.getStackSize(tabletype) - 1; i >= (sb.urlPool.loadedURL.getStackSize(tabletype) - lines); i--) {
initiatorHash = sb.urlPool.loadedURL.getInitiatorHash(tabletype, i);
executorHash = sb.urlPool.loadedURL.getExecutorHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
urlHash = sb.urlPool.loadedURL.getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = sb.urlPool.loadedURL.load(urlHash, null);
indexURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
urlstr = comp.url().toNormalform();
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_feedbackpage", "IndexMonitor.html");
prop.put("table_indexed_" + cnt + "_tabletype", tabletype);
prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
prop.put("table_indexed_" + cnt + "_showInit", (showInit) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_showInit_initiatorSeed", (initiatorSeed == null) ? "unknown" : initiatorSeed.getName());
prop.put("table_indexed_" + cnt + "_showExec", (showExec) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? "unknown" : executorSeed.getName());
prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate()));
prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount());
prop.put("table_indexed_" + cnt + "_urldescr", comp.descr());
prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : "<a href=\"CacheAdmin_p.html?action=info&path=" + cachepath + "\" class=\"small\" title=\"" + urlstr + "\">" + urltxt + "</a>");
dark = !dark;
cnt++;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "genTableProps", e);
}
}
prop.put("table_indexed", cnt);
}
prop.put("process", process);
// return rewrite properties
return prop;
prop.put("process", tabletype);
// return rewrite properties
return prop;
}
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(Date date) {
if (date == null) {
return "";
} else {
return dayFormatter.format(date);
}
}
}

@ -54,7 +54,7 @@ import java.util.Enumeration;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
@ -106,7 +106,7 @@ public class ViewFile {
String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash
plasmaCrawlLURLEntry urlEntry = null;
indexURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
@ -115,7 +115,7 @@ public class ViewFile {
}
// getting the url that belongs to the entry
plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
indexURLEntry.Components comp = urlEntry.comp();
if ((comp == null) || (comp.url() == null)) {
prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT);

@ -61,10 +61,10 @@ import de.anomic.data.userDB;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
@ -362,7 +362,7 @@ public class dir {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url,
"YaCyShare: " + descr,
yacyCore.seedDB.mySeed.getName(),

@ -50,8 +50,8 @@ import java.util.Date;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -249,7 +249,7 @@ public final class crawlOrder {
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
indexURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
if (entry == null) {
response = "rejected";
lurl = "";

@ -50,8 +50,9 @@ import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -124,12 +125,12 @@ public final class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// generating a new loaded URL entry
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr);
indexURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr);
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
} else {
plasmaCrawlLURLEntry.Components comp = entry.comp();
indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
@ -156,7 +157,7 @@ public final class crawlReceipt {
} else {
try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength));
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexRWIEntryOld.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.urlPool.noticeURL.remove(receivedUrlhash);

@ -54,7 +54,7 @@ import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -249,10 +249,10 @@ public final class search {
StringBuffer links = new StringBuffer();
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURLEntry urlentry;
indexURLEntry urlentry;
plasmaSnippetCache.Snippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = (plasmaCrawlLURLEntry) acc.nextElement();
urlentry = (indexURLEntry) acc.nextElement();
if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000);
} else {

@ -51,8 +51,8 @@ import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
@ -146,7 +146,7 @@ public final class transferRWI {
int p;
String wordHash;
String urlHash;
indexEntry iEntry;
indexRWIEntry iEntry;
int wordhashesSize = v.size();
final HashSet unknownURL = new HashSet();
final HashSet knownURL = new HashSet();
@ -162,7 +162,7 @@ public final class transferRWI {
if (p > 0) {
wordHash = estring.substring(0, p);
wordhashes[received] = wordHash;
iEntry = new indexURLEntry(estring.substring(p));
iEntry = new indexRWIEntryOld(estring.substring(p));
urlHash = iEntry.urlHash();
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) {
int deleted = sb.wordIndex.tryRemoveURLs(urlHash);

@ -48,7 +48,7 @@
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
@ -90,7 +90,7 @@ public final class transferURL {
final int sizeBefore = sb.urlPool.loadedURL.size();
// read the urls from the other properties and store
String urls;
plasmaCrawlLURLEntry lEntry;
indexURLEntry lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();
urls = (String) post.get("url" + i);
@ -102,7 +102,7 @@ public final class transferURL {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
} else {
plasmaCrawlLURLEntry.Components comp = lEntry.comp();
indexURLEntry.Components comp = lEntry.comp();
if (comp.url() == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???

@ -54,10 +54,10 @@ import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages;
import de.anomic.plasma.plasmaSearchPreOrder;
@ -189,9 +189,9 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
indexURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) {
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;
document = sb.snippetCache.retrieveDocument(comp.url(), true);
if (document != null) {

@ -91,7 +91,7 @@ public class indexCachedRI implements indexRI {
return new indexContainer(wordHash, payloadrow);
}
public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean intern) {
public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) {
// add the entry
if (intern) {
riIntern.addEntry(wordHash, entry, updateTime, true);

@ -152,7 +152,7 @@ public class indexCollectionRI implements indexRI {
}
}
public synchronized indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) {
public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow());
container.add(newEntry);
return addEntries(container, updateTime, dhtCase);

@ -81,18 +81,18 @@ public class indexContainer extends kelondroRowSet {
return wordHash;
}
public int add(indexEntry entry) {
public int add(indexRWIEntry entry) {
this.addUnique(entry.toKelondroEntry());
return 1;
}
public int add(indexEntry entry, long updateTime) {
public int add(indexRWIEntry entry, long updateTime) {
this.add(entry);
this.lastTimeWrote = updateTime;
return 1;
}
public int add(indexEntry[] entries, long updateTime) {
public int add(indexRWIEntry[] entries, long updateTime) {
for (int i = 0; i < entries.length; i++) this.add(entries[i], updateTime);
return entries.length;
}
@ -106,7 +106,7 @@ public class indexContainer extends kelondroRowSet {
Iterator i = c.entries();
while (i.hasNext()) {
try {
if (addi((indexEntry) i.next())) x++;
if (addi((indexRWIEntry) i.next())) x++;
} catch (ConcurrentModificationException e) {
e.printStackTrace();
}
@ -117,13 +117,13 @@ public class indexContainer extends kelondroRowSet {
return x;
}
private boolean addi(indexEntry entry) {
private boolean addi(indexRWIEntry entry) {
// returns true if the new entry was added, false if it already existed
kelondroRow.Entry oldEntryRow = this.put(entry.toKelondroEntry());
if (oldEntryRow == null) {
return true;
} else {
indexEntry oldEntry = new indexURLEntry(oldEntryRow); // FIXME: see if cloning is necessary
indexRWIEntry oldEntry = new indexRWIEntryOld(oldEntryRow); // FIXME: see if cloning is necessary
if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container
this.put(oldEntry.toKelondroEntry()); // put it back
return false;
@ -133,16 +133,16 @@ public class indexContainer extends kelondroRowSet {
}
}
public indexEntry get(String urlHash) {
public indexRWIEntry get(String urlHash) {
kelondroRow.Entry entry = this.get(urlHash.getBytes());
if (entry == null) return null;
return new indexURLEntry(entry);
return new indexRWIEntryOld(entry);
}
public indexEntry remove(String urlHash) {
public indexRWIEntry remove(String urlHash) {
kelondroRow.Entry entry = this.remove(urlHash.getBytes());
if (entry == null) return null;
return new indexURLEntry(entry);
return new indexRWIEntryOld(entry);
}
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
@ -178,7 +178,7 @@ public class indexContainer extends kelondroRowSet {
public Object next() {
kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next();
if (rentry == null) return null;
return new indexURLEntry(rentry);
return new indexRWIEntryOld(rentry);
}
public void remove() {
@ -288,10 +288,10 @@ public class indexContainer extends kelondroRowSet {
assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString();
indexContainer conj = new indexContainer(null, small.rowdef); // start with empty search result
Iterator se = small.entries();
indexEntry ie0, ie1;
indexRWIEntry ie0, ie1;
long stamp = System.currentTimeMillis();
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie0 = (indexEntry) se.next();
ie0 = (indexRWIEntry) se.next();
ie1 = large.get(ie0.urlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
@ -312,25 +312,25 @@ public class indexContainer extends kelondroRowSet {
Iterator e2 = i2.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
indexEntry ie1;
indexEntry ie2;
ie1 = (indexEntry) e1.next();
ie2 = (indexEntry) e2.next();
indexRWIEntry ie1;
indexRWIEntry ie2;
ie1 = (indexRWIEntry) e1.next();
ie2 = (indexRWIEntry) e2.next();
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = i1.order().compare(ie1.urlHash(), ie2.urlHash());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) {
if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break;
if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break;
} else if (c > 0) {
if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break;
if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break;
} else {
// we have found the same urls in different searches!
ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1);
if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break;
if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break;
}
}
}
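
Editor's sketch: the indexContainer hunks above retype what appears to be a sorted-merge intersection of two entry streams ordered by URL hash: advance whichever side holds the smaller key, and on a match combine the entries (combineDistance in the real code) and keep the result. A standalone illustration of that merge, with plain strings standing in for indexRWIEntry.

import java.util.*;

// Intersect two lists that are sorted by the same key (here: the string itself).
public class SortedIntersectSketch {
    public static List<String> intersect(List<String> a, List<String> b) {
        List<String> result = new ArrayList<>();
        Iterator<String> e1 = a.iterator(), e2 = b.iterator();
        if (!e1.hasNext() || !e2.hasNext()) return result;
        String x = e1.next(), y = e2.next();
        while (true) {
            int c = x.compareTo(y);
            if (c < 0)      { if (e1.hasNext()) x = e1.next(); else break; }
            else if (c > 0) { if (e2.hasNext()) y = e2.next(); else break; }
            else {
                result.add(x); // real code: ie1.combineDistance(ie2); conj.add(ie1)
                if (e1.hasNext()) x = e1.next(); else break;
                if (e2.hasNext()) y = e2.next(); else break;
            }
        }
        return result;
    }

    public static void main(String[] args) {
        System.out.println(intersect(Arrays.asList("a", "c", "e"),
                                     Arrays.asList("b", "c", "e"))); // [c, e]
    }
}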

@ -35,10 +35,6 @@ import de.anomic.yacy.yacySeedDB;
public class indexEntryAttribute {
// the size of a word hash
public static final int wordHashLength = yacySeedDB.commonHashLength; // 12
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
// doctypes:
public static final char DT_PDFPS = 'p';
public static final char DT_TEXT = 't';
@ -86,7 +82,7 @@ public class indexEntryAttribute {
// create a word hash
public static String word2hash(String word) {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, indexEntryAttribute.wordHashLength);
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength);
}
// doctype calculation
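
Editor's sketch: the hunk above replaces indexEntryAttribute.wordHashLength with yacySeedDB.commonHashLength (both 12) in word2hash, which hashes the lower-cased word with MD5, base64-encodes it, and truncates it to 12 characters. The sketch below uses the JDK's MessageDigest and a standard Base64 encoder purely for illustration; YaCy's kelondroBase64Order.enhancedCoder uses its own alphabet, so real YaCy word hashes will differ.

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.Base64;
import java.util.Locale;

// MD5 of the lower-cased word, base64-encoded, truncated to the common hash length.
public class Word2HashSketch {
    private static final int COMMON_HASH_LENGTH = 12; // yacySeedDB.commonHashLength

    public static String word2hash(String word) throws Exception {
        byte[] md5 = MessageDigest.getInstance("MD5")
                .digest(word.toLowerCase(Locale.ENGLISH).getBytes(StandardCharsets.UTF_8));
        String b64 = Base64.getUrlEncoder().withoutPadding().encodeToString(md5);
        return b64.substring(0, COMMON_HASH_LENGTH);
    }

    public static void main(String[] args) throws Exception {
        System.out.println(word2hash("YaCy")); // 12-character hash (illustrative encoding only)
    }
}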

@ -81,7 +81,7 @@ public final class indexRAMRI implements indexRI {
this.indexArrayFileName = dumpname;
this.payloadrow = payloadrow;
this.bufferStructureBasis = new kelondroRow(
"byte[] wordhash-" + indexEntryAttribute.wordHashLength + ", " +
"byte[] wordhash-" + yacySeedDB.commonHashLength + ", " +
"Cardinal occ-4 {b256}, " +
"Cardinal time-8 {b256}, " +
"byte[] urlprops-" + payloadrow.objectsize());
@ -114,7 +114,7 @@ public final class indexRAMRI implements indexRI {
String wordHash;
indexContainer container;
long updateTime;
indexEntry iEntry;
indexRWIEntry iEntry;
kelondroRow.Entry row = dumpArray.row().newEntry();
// write wCache
@ -131,7 +131,7 @@ public final class indexRAMRI implements indexRI {
if (container != null) {
Iterator ci = container.entries();
while (ci.hasNext()) {
iEntry = (indexEntry) ci.next();
iEntry = (indexRWIEntry) ci.next();
row.setCol(0, wordHash.getBytes());
row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4));
row.setCol(2, kelondroNaturalOrder.encodeLong(updateTime, 8));
@ -169,7 +169,7 @@ public final class indexRAMRI implements indexRI {
Iterator i = dumpArray.contentRows(-1);
String wordHash;
//long creationTime;
indexEntry wordEntry;
indexRWIEntry wordEntry;
kelondroRow.Entry row;
//Runtime rt = Runtime.getRuntime();
while (i.hasNext()) {
@ -178,7 +178,7 @@ public final class indexRAMRI implements indexRI {
if ((row == null) || (row.empty(0)) || (row.empty(3))) continue;
wordHash = row.getColString(0, "UTF-8");
//creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new indexURLEntry(row.getColBytes(3));
wordEntry = new indexRWIEntryOld(row.getColBytes(3));
// store to cache
addEntry(wordHash, wordEntry, startTime, false);
urlCount++;
@ -437,10 +437,10 @@ public final class indexRAMRI implements indexRI {
return null;
}
public synchronized indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) {
public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = (indexContainer) cache.get(wordHash);
if (container == null) container = new indexContainer(wordHash, this.payloadrow);
indexEntry[] entries = new indexEntry[] { newEntry };
indexRWIEntry[] entries = new indexRWIEntry[] { newEntry };
if (container.add(entries, updateTime) > 0) {
cache.put(wordHash, container);
hashScore.incScore(wordHash);

@ -44,7 +44,7 @@ public interface indexRI {
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete);
public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete);
public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtCase);
public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase);
public indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase);
public void close(int waitingSeconds);

@ -1,4 +1,4 @@
// indexEntry.java
// indexRWIEntry.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 20.05.2006 on http://www.anomic.de
//
@ -28,7 +28,7 @@ package de.anomic.index;
import de.anomic.kelondro.kelondroRow;
public interface indexEntry {
public interface indexRWIEntry {
public Object clone();
public String toPropertyForm(boolean displayFormat);
@ -48,13 +48,13 @@ public interface indexEntry {
public char getType();
public boolean isLocal();
public void combineDistance(indexEntry oe);
public void combineDistance(indexRWIEntry oe);
public int worddistance();
public void min(indexEntry other);
public void max(indexEntry other);
public void normalize(indexEntry min, indexEntry max);
public indexEntry generateNormalized(indexEntry min, indexEntry max);
public boolean isNewer(indexEntry other);
public boolean isOlder(indexEntry other);
public void min(indexRWIEntry other);
public void max(indexRWIEntry other);
public void normalize(indexRWIEntry min, indexRWIEntry max);
public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max);
public boolean isNewer(indexRWIEntry other);
public boolean isOlder(indexRWIEntry other);
}
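
Editor's sketch: the renamed interface above declares isNewer/isOlder; the old implementation further below orders entries by last-modified time first and by quality as a tie-breaker. A standalone comparator expressing that ordering, with a hypothetical stand-in type.

import java.util.Comparator;

// "Newer" means a later lastModified, with quality deciding ties.
public class RwiOrderSketch {
    static final class Ref {
        final long lastModified;
        final int quality;
        Ref(long lastModified, int quality) { this.lastModified = lastModified; this.quality = quality; }
    }

    static final Comparator<Ref> NEWER_FIRST =
            Comparator.comparingLong((Ref r) -> r.lastModified)
                      .thenComparingInt(r -> r.quality)
                      .reversed();

    public static void main(String[] args) {
        Ref a = new Ref(100, 5), b = new Ref(100, 7);
        System.out.println(NEWER_FIRST.compare(b, a) < 0); // true: same date, higher quality wins
    }
}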

@ -0,0 +1,323 @@
// indexURLEntryNew.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 21.07.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.index;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;
public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
// this object stores attributes to URL references inside RWI collections
// statics for value lengths
public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urldescrtagsLength = 320;// the url, the description and tags in one string
public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
public static final int urlFlagLength = 2; // any stuff
public static final int urlLanguageLength = 2; // taken from TLD suffix as quick-hack
public static final int urlDoctypeLength = 1; // taken from extension
public static final int urlSizeLength = 6; // the source size, from cache
public static final int urlWordCountLength = 3; // the number of words, from condenser
public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile
public static final int urlCrawlDepthLength = 2; // prefetch depth, first is '0'
public static final int urlParentBranchesLength = 3; // number of anchors of the parent
public static final int urlForkFactorLength = 4; // sum of anchors of all ancestors
public static final int urlRetryLength = 2; // number of load retries
public static final int urlHostLength = 8; // the host as truncated name
public static final int urlHandleLength = 4; // a handle
public static final int urlQualityLength = 3; // taken from heuristic
public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, urlQualityLength, "quality"),
new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"),
new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"),
new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, urlLanguageLength, "language"),
new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"),
new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "localflag"),
new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"),
new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posinphrase"),
new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posofphrase"),
new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "worddistance"),
new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "wordcount"),
new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "phrasecount")
});
private static final int col_urlhash = 0;
private static final int col_quality = 1;
private static final int col_lastModified = 2;
private static final int col_hitcount = 3;
private static final int col_language = 4;
private static final int col_doctype = 5;
private static final int col_localflag = 6;
private static final int col_posintext = 7;
private static final int col_posinphrase = 8;
private static final int col_posofphrase = 9;
private static final int col_worddistance = 10;
private static final int col_wordcount = 11;
private static final int col_phrasecount = 12;
private kelondroRow.Entry entry;
public indexRWIEntryOld(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int titleLength, // length of description/length (longer are better?)
int hitcount, //*how often appears this word in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
int posintext, //*position of word in all words
int posinphrase, //*position of word in its phrase
int posofphrase, //*number of the phrase where word appears
int worddistance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search
int sizeOfPage, // # of bytes of the page
long lastmodified, //*last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, //*the entropy value
String language, //*(guessed) language of document
char doctype, //*type of document
int outlinksSame, // outlinks to same domain
int outlinksOther,// outlinks to other domain
boolean local //*flag shows that this index was generated locally; otherwise it is from a remote peer
) {
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, highlighting, meta-tags, word in link etc
// - boolean: URL attributes
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
if ((language == null) || (language.length() != urlLanguageLength)) language = "uk";
this.entry = urlEntryRow.newEntry();
this.entry.setCol(col_urlhash, urlHash, null);
this.entry.setCol(col_quality, quality);
this.entry.setCol(col_lastModified, lastmodified);
this.entry.setCol(col_hitcount, hitcount);
this.entry.setCol(col_language, language, null);
this.entry.setCol(col_doctype, (byte) doctype);
this.entry.setCol(col_localflag, (byte) ((local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL));
this.entry.setCol(col_posintext, posintext);
this.entry.setCol(col_posinphrase, posinphrase);
this.entry.setCol(col_posofphrase, posofphrase);
this.entry.setCol(col_worddistance, worddistance);
this.entry.setCol(col_wordcount, wordcount);
this.entry.setCol(col_phrasecount, phrasecount);
//System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
}
public indexRWIEntryOld(String urlHash, String code) {
// the code is the external form of the row minus the leading urlHash entry
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
}
public indexRWIEntryOld(String external) {
this.entry = urlEntryRow.newEntry(external);
}
public indexRWIEntryOld(byte[] row) {
this.entry = urlEntryRow.newEntry(row);
}
public indexRWIEntryOld(kelondroRow.Entry rentry) {
// FIXME: see if cloning is necessary
this.entry = rentry;
}
public Object clone() {
byte[] b = new byte[urlEntryRow.objectsize()];
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize());
return new indexRWIEntryOld(b);
}
public String toPropertyForm(boolean displayFormat) {
return entry.toPropertyForm(true, displayFormat, displayFormat);
}
public Entry toKelondroEntry() {
return this.entry;
}
public String urlHash() {
return this.entry.getColString(col_urlhash, null);
}
public int quality() {
return (int) this.entry.getColLong(col_quality);
}
public int virtualAge() {
return plasmaWordIndex.microDateDays(lastModified());
}
public long lastModified() {
return (int) this.entry.getColLong(col_lastModified);
}
public int hitcount() {
return (int) this.entry.getColLong(col_hitcount);
}
public int posintext() {
return (int) this.entry.getColLong(col_posintext);
}
public int posinphrase() {
return (int) this.entry.getColLong(col_posinphrase);
}
public int posofphrase() {
return (int) this.entry.getColLong(col_posofphrase);
}
public int wordcount() {
return (int) this.entry.getColLong(col_wordcount);
}
public int phrasecount() {
return (int) this.entry.getColLong(col_phrasecount);
}
public String getLanguage() {
return this.entry.getColString(col_language, null);
}
public char getType() {
return (char) this.entry.getColByte(col_doctype);
}
public boolean isLocal() {
return this.entry.getColByte(col_localflag) == indexEntryAttribute.LT_LOCAL;
}
public static indexRWIEntryOld combineDistance(indexRWIEntryOld ie1, indexRWIEntry ie2) {
// returns a modified entry of the first argument
ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext()));
ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/);
ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
ie1.entry.setCol(col_wordcount, (ie1.wordcount() + ie2.wordcount()) / 2);
return ie1;
}
public void combineDistance(indexRWIEntry oe) {
combineDistance(this, oe);
}
public int worddistance() {
return (int) this.entry.getColLong(col_worddistance);
}
public static final void min(indexRWIEntryOld t, indexRWIEntry other) {
if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordcount() > other.wordcount()) t.entry.setCol(col_wordcount, other.wordcount());
if (t.phrasecount() > other.phrasecount()) t.entry.setCol(col_phrasecount, other.phrasecount());
if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
if (t.quality() > other.quality()) t.entry.setCol(col_quality, other.quality());
}
public static final void max(indexRWIEntryOld t, indexRWIEntry other) {
if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordcount() < other.wordcount()) t.entry.setCol(col_wordcount, other.wordcount());
if (t.phrasecount() < other.phrasecount()) t.entry.setCol(col_phrasecount, other.phrasecount());
if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
if (t.quality() < other.quality()) t.entry.setCol(col_quality, other.quality());
}
public void min(indexRWIEntry other) {
min(this, other);
}
public void max(indexRWIEntry other) {
max(this, other);
}
static void normalize(indexRWIEntryOld t, indexRWIEntry min, indexRWIEntry max) {
assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash();
assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash();
assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash();
if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm(true) + "\nmax=" + max.toPropertyForm(true));
//System.out.println("Normalize:\nentry = " + t.toPropertyForm(true));
//System.out.println("min = " + min.toPropertyForm(true));
//System.out.println("max = " + max.toPropertyForm(true));
t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount()));
t.entry.setCol(col_wordcount , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount()));
t.entry.setCol(col_phrasecount , (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount()));
t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext()));
t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase()));
t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase()));
t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: a division by zero can occur here, which can only happen if the normalization did not work.
t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified()));
t.entry.setCol(col_quality , (t.quality() == 0) ? 0 : 1 + 255 * (t.quality() - min.quality() ) / (1 + max.quality() - min.quality()));
//System.out.println("out = " + t.toPropertyForm(true));
}
public void normalize(indexRWIEntry min, indexRWIEntry max) {
normalize(this, min, max);
}
public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) {
assert (this.urlHash().length() == 12) : "this.urlhash = " + this.urlHash();
indexRWIEntryOld e = (indexRWIEntryOld) this.clone();
e.normalize(min, max);
return e;
}
public boolean isNewer(indexRWIEntry other) {
if (other == null) return true;
if (this.lastModified() > other.lastModified()) return true;
if (this.lastModified() == other.lastModified()) {
if (this.quality() > other.quality()) return true;
}
return false;
}
public boolean isOlder(indexRWIEntry other) {
if (other == null) return false;
if (this.lastModified() < other.lastModified()) return true;
if (this.lastModified() == other.lastModified()) {
if (this.quality() < other.quality()) return true;
}
return false;
}
}
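
Editor's sketch: normalize() above maps each attribute with the expression (x == 0) ? 0 : 1 + 255 * (x - min) / (1 + max - min), so zero stays zero and everything else lands in roughly 1..255 relative to the minimum and maximum seen for that attribute. The same scaling in isolation, keeping the integer arithmetic of the original.

// Per-attribute scaling used by normalize().
public class NormalizeSketch {
    static int scale(int x, int min, int max) {
        return (x == 0) ? 0 : 1 + 255 * (x - min) / (1 + max - min);
    }

    public static void main(String[] args) {
        // hitcount 3 in a container where the minimum is 1 and the maximum is 9
        System.out.println(scale(3, 1, 9)); // 1 + 255*2/9 = 57
        System.out.println(scale(0, 1, 9)); // 0
    }
}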

@ -50,29 +50,6 @@ public class indexURL {
// day formatter for entry export
public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
// statics for value lengths
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urldescrtagsLength = 320;// the url, the description and tags in one string
public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
public static final int urlFlagLength = 2; // any stuff
public static final int urlQualityLength = 3; // taken from heuristic
public static final int urlLanguageLength = 2; // taken from TLD suffix as quick-hack
public static final int urlDoctypeLength = 1; // taken from extension
public static final int urlSizeLength = 6; // the source size, from cache
public static final int urlWordCountLength = 3; // the number of words, from condenser
public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile
public static final int urlCrawlDepthLength = 2; // prefetch depth, first is '0'
public static final int urlParentBranchesLength = 3; // number of anchors of the parent
public static final int urlForkFactorLength = 4; // sum of anchors of all ancestors
public static final int urlRetryLength = 2; // number of load retries
public static final int urlHostLength = 8; // the host as truncated name
public static final int urlHandleLength = 4; // a handle
private static final String[] TLD_NorthAmericaOceania={
// primary english-speaking countries
// english-speaking countries from central america are also included
@ -397,7 +374,7 @@ public class indexURL {
static {
// create a dummy hash
dummyHash = "";
for (int i = 0; i < urlHashLength; i++) dummyHash += "-";
for (int i = 0; i < yacySeedDB.commonHashLength; i++) dummyHash += "-";
// assign TLD-ids and names
insertTLDProps(TLD_EuropaRussia, 0);
@ -602,13 +579,13 @@ public class indexURL {
public static final String oldurlHash(URL url) {
if (url == null) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, urlHashLength);
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, yacySeedDB.commonHashLength);
return hash;
}
public static final String oldurlHash(String url) throws MalformedURLException {
if ((url == null) || (url.length() < 10)) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, urlHashLength);
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, yacySeedDB.commonHashLength);
return hash;
}
@ -618,10 +595,10 @@ public class indexURL {
TreeMap doms = new TreeMap();
synchronized(inputContainer) {
Iterator i = inputContainer.entries();
indexEntry iEntry;
indexRWIEntry iEntry;
String dom, paths;
while (i.hasNext()) {
iEntry = (indexEntry) i.next();
iEntry = (indexRWIEntry) i.next();
if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
dom = iEntry.urlHash().substring(6);
if ((paths = (String) doms.get(dom)) == null) {
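
Editor's sketch: the loop above groups RWI entries by the domain part of the 12-character URL hash, i.e. urlHash().substring(6). A standalone rendering of that bucketing, with plain strings standing in for indexRWIEntry.

import java.util.*;

// Bucket 12-character URL hashes by their 6-character domain suffix.
public class DomainGroupSketch {
    public static Map<String, List<String>> groupByDomain(List<String> urlHashes) {
        Map<String, List<String>> doms = new TreeMap<>();
        for (String hash : urlHashes) {
            String dom = hash.substring(6); // domain part of the hash
            doms.computeIfAbsent(dom, k -> new ArrayList<>()).add(hash);
        }
        return doms;
    }

    public static void main(String[] args) {
        System.out.println(groupByDomain(Arrays.asList(
                "aaaaaaHOST01", "bbbbbbHOST01", "ccccccHOST02")));
        // {HOST01=[aaaaaaHOST01, bbbbbbHOST01], HOST02=[ccccccHOST02]}
    }
}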

@ -1,6 +1,6 @@
// plasmaCrawlLURLEntry.java
// indexURLEntry.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 13.10.2006 on http://www.anomic.de
// first published 2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
@ -24,7 +24,8 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
package de.anomic.index;
import java.io.IOException;
import java.net.MalformedURLException;
@ -32,9 +33,9 @@ import java.util.Date;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
public interface plasmaCrawlLURLEntry {
public interface indexURLEntry {
public kelondroRow.Entry toRowEntry() throws IOException;
public String hash();
@ -48,8 +49,8 @@ public interface plasmaCrawlLURLEntry {
public int size();
public int wordCount();
public String snippet();
public indexEntry word();
public boolean isOlder(plasmaCrawlLURLEntry other);
public indexRWIEntry word();
public boolean isOlder(indexURLEntry other);
public String toString(String snippet);
public String toString();
@ -82,4 +83,4 @@ public interface plasmaCrawlLURLEntry {
public String ETag() { return this.ETag; }
}
}
}

@ -1,4 +1,4 @@
package de.anomic.plasma;
package de.anomic.index;
import java.io.IOException;
import java.net.MalformedURLException;
@ -7,9 +7,6 @@ import java.util.Date;
import java.util.Properties;
import java.util.ArrayList;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
@ -20,8 +17,10 @@ import de.anomic.tools.crypt;
import de.anomic.tools.bitfield;
import de.anomic.tools.nxTools;
public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
public class indexURLEntryNew implements indexURLEntry {
// this object stores attributes for URL entries
public static final kelondroRow rowdef = new kelondroRow(
"String hash-12, " + // the url's hash
"String comp-360, " + // components: the url, description, author and tags. As 5th element, an ETag is possible
@ -37,16 +36,16 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
"String lang-2, " + // language
"Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width
"Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height
"Cardinal limage-2 {b256}, " + // # of embedded image links
"Cardinal limage-2 {b256}, " + // # of embedded image links
"Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks
"Cardinal lvideo-2 {b256}, " + // # of embedded video links
"Cardinal lapp-2 {b256}"); // # of embedded links to applications
"Cardinal lapp-2 {b256}"); // # of embedded links to applications
private kelondroRow.Entry entry;
private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests
private indexRWIEntry word; // this is only used if the url is transported via remote search requests
public plasmaCrawlLURLNewEntry(
public indexURLEntryNew(
URL url,
String descr,
String author,
@ -106,13 +105,13 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
return s.toString().getBytes();
}
public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) {
public indexURLEntryNew(kelondroRow.Entry entry, indexRWIEntry searchedWord) {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
}
public plasmaCrawlLURLNewEntry(Properties prop){
public indexURLEntryNew(Properties prop){
// generates a plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -159,12 +158,12 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
this.entry.setCol("lvideo", Integer.parseInt(prop.getProperty("lvideo", "0")));
this.entry.setCol("lapp", Integer.parseInt(prop.getProperty("lapp", "0")));
this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
this.word = (prop.containsKey("word")) ? new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
}
private StringBuffer corePropList() {
// generate a parseable string; this is a simple property-list
plasmaCrawlLURLEntry.Components comp = this.comp();
indexURLEntry.Components comp = this.comp();
final StringBuffer s = new StringBuffer(300);
try {
s.append("hash=").append(hash());
@ -217,9 +216,9 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
return this.entry.getColString("hash", "", null);
}
public plasmaCrawlLURLEntry.Components comp() {
public indexURLEntry.Components comp() {
ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
return new de.anomic.plasma.plasmaCrawlLURLEntry.Components(
return new indexURLEntry.Components(
(cl.size() > 0) ? (String) cl.get(0) : "",
(cl.size() > 1) ? (String) cl.get(1) : "",
(cl.size() > 2) ? (String) cl.get(2) : "",
@ -299,11 +298,11 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
return snippet;
}
public indexEntry word() {
public indexRWIEntry word() {
return word;
}
public boolean isOlder(plasmaCrawlLURLEntry other) {
public boolean isOlder(indexURLEntry other) {
if (other == null) return false;
Date tmoddate = moddate();
Date omoddate = other.moddate();

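The rowdef strings changed in the hunks above describe fixed-width columns as "<type> <name>-<width>" pairs, optionally followed by an encoding hint such as {b256} or {b64e}. Below is a minimal, self-contained sketch of how such a specification can be read into name/width pairs; it is illustrative only, does not reproduce the real kelondroRow parser, and assumes numeric widths (the sources use named length constants resolved at compile time).

import java.util.LinkedHashMap;
import java.util.Map;

// Minimal sketch: read a column spec like the rowdef above into name -> byte width.
// Illustrative only; the real parser lives in kelondroRow and also keeps the type
// token and the {b256}/{b64e} encoding hints, which are dropped here.
public class RowSpecSketch {
    static Map<String, Integer> parse(String spec) {
        Map<String, Integer> cols = new LinkedHashMap<>();
        for (String col : spec.split(",")) {
            col = col.trim();
            if (col.isEmpty()) continue;
            String body = col.split("\\s+")[1]; // drop the type token ("String", "Cardinal", "byte[]")
            int dash = body.lastIndexOf('-');
            cols.put(body.substring(0, dash), Integer.parseInt(body.substring(dash + 1)));
        }
        return cols;
    }

    public static void main(String[] args) {
        System.out.println(parse("String hash-12, Cardinal llocal-2 {b256}, Cardinal lother-2 {b256}"));
        // -> {hash=12, llocal=2, lother=2}
    }
}
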
@ -24,39 +24,37 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
package de.anomic.index;
import java.io.IOException;
import java.util.Date;
import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
public class indexURLEntryOld implements indexURLEntry {
public static final kelondroRow rowdef = new kelondroRow(
"String urlhash-" + indexURL.urlHashLength + ", " + // the url's hash
"String urlstring-" + indexURL.urlStringLength + ", " + // the url as string
"String urldescr-" + indexURL.urlDescrLength + ", " + // the description of the url
"Cardinal moddate-" + indexURL.urlDateLength + " {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + indexURL.urlDateLength + " {b64e}, " + // time when the url was loaded
"String refhash-" + indexURL.urlHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + indexURL.urlCopyCountLength + " {b64e}, " + //
"byte[] flags-" + indexURL.urlFlagLength + ", " + // flags
"Cardinal quality-" + indexURL.urlQualityLength + " {b64e}, " + //
"String language-" + indexURL.urlLanguageLength + ", " + //
"byte[] doctype-" + indexURL.urlDoctypeLength + ", " + //
"Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String urldescr-" + indexRWIEntryOld.urlDescrLength + ", " + // the description of the url
"Cardinal moddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // time when the url was loaded
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + indexRWIEntryOld.urlCopyCountLength + " {b64e}, " + //
"byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags
"Cardinal quality-" + indexRWIEntryOld.urlQualityLength + " {b64e}, " + //
"String language-" + indexRWIEntryOld.urlLanguageLength + ", " + //
"byte[] doctype-" + indexRWIEntryOld.urlDoctypeLength + ", " + //
"Cardinal size-" + indexRWIEntryOld.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexRWIEntryOld.urlWordCountLength + " {b64e}"); // word count
private URL url;
private String descr;
@ -72,9 +70,9 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
private int size;
private int wordCount;
private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests
private indexRWIEntry word; // this is only used if the url is transported via remote search requests
public plasmaCrawlLURLOldEntry(
public indexURLEntryOld(
URL url,
String descr,
String author,
@ -114,7 +112,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
this.word = null;
}
public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
public indexURLEntryOld(kelondroRow.Entry entry, indexRWIEntry searchedWord) throws IOException {
try {
this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8"));
@ -138,7 +136,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
}
}
public plasmaCrawlLURLOldEntry(Properties prop) {
public indexURLEntryOld(Properties prop) {
// generates a plasmaLURLEntry using the properties from the argument
// the property names must correspond to the ones from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -161,7 +159,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null;
else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
this.word = (prop.containsKey("word")) ? new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
} catch (Exception e) {
serverLog.logSevere("PLASMA",
"INTERNAL ERROR in plasmaLURL.entry/2:"
@ -178,8 +176,8 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
}
public kelondroRow.Entry toRowEntry() throws IOException {
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexURL.urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength);
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
final byte[][] entry = new byte[][] {
urlHash.getBytes(),
@ -188,13 +186,13 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
moddatestr.getBytes(),
loaddatestr.getBytes(),
referrerHash.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexURL.urlCopyCountLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexRWIEntryOld.urlCopyCountLength).getBytes(),
flags.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, indexURL.urlQualityLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, indexRWIEntryOld.urlQualityLength).getBytes(),
language.getBytes(),
new byte[] { (byte) doctype },
kelondroBase64Order.enhancedCoder.encodeLong(size, indexURL.urlSizeLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexURL.urlWordCountLength).getBytes()};
kelondroBase64Order.enhancedCoder.encodeLong(size, indexRWIEntryOld.urlSizeLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexRWIEntryOld.urlWordCountLength).getBytes()};
return rowdef.newEntry(entry);
}
@ -264,11 +262,11 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
return snippet;
}
public indexEntry word() {
public indexRWIEntry word() {
return word;
}
public boolean isOlder(plasmaCrawlLURLEntry other) {
public boolean isOlder(indexURLEntry other) {
if (other == null) return false;
if (moddate.before(other.moddate())) return true;
if (moddate.equals(other.moddate())) {
@ -292,7 +290,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
",local=").append(((local()) ? "true" : "false"))
.append(",q=").append(
kelondroBase64Order.enhancedCoder.encodeLong(
quality, indexURL.urlQualityLength))
quality, indexRWIEntryOld.urlQualityLength))
.append(",dt=").append(doctype).append(",lang=").append(
language).append(",url=").append(
crypt.simpleEncode(url.toString())).append(

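The moddatestr/loaddatestr lines above store dates day-granular: the millisecond timestamp is divided by 86400000 and the resulting day count is written as a fixed-width number over a 64-character alphabet. A rough round-trip sketch follows; the alphabet and the width of 4 are assumptions for illustration, the real coder is kelondroBase64Order.enhancedCoder.

import java.util.Date;

// Sketch of the day-granular date encoding used for moddate/loaddate above.
// The alphabet below is an assumption for illustration; the real coder is
// kelondroBase64Order and uses its own ordering.
public class DayDateSketch {
    static final String ALPHA = "-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    static String encodeLong(long value, int width) {
        char[] out = new char[width];
        for (int i = width - 1; i >= 0; i--) {
            out[i] = ALPHA.charAt((int) (value & 0x3F));
            value >>>= 6;
        }
        return new String(out);
    }

    static long decodeLong(String s) {
        long value = 0;
        for (int i = 0; i < s.length(); i++) value = (value << 6) | ALPHA.indexOf(s.charAt(i));
        return value;
    }

    public static void main(String[] args) {
        Date now = new Date();
        String encoded = encodeLong(now.getTime() / 86400000L, 4); // urlDateLength assumed to be 4
        Date roundTrip = new Date(decodeLong(encoded) * 86400000L);
        System.out.println(encoded + " -> " + roundTrip); // the time of day is lost by design
    }
}
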
@ -51,6 +51,7 @@ import java.io.File;
import java.io.IOException;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLoaderMessage;
@ -297,7 +298,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
yacyCore.seedDB.mySeed.hash,
this.name,
(failreason==null)?"Unknown reason":failreason,
new bitfield(indexURL.urlFlagLength)
new bitfield(indexRWIEntryOld.urlFlagLength)
);
// store the entry

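The error entries stacked above carry a fixed-length flag field (new bitfield(indexRWIEntryOld.urlFlagLength)). Below is a loose, illustrative stand-in for such a flag field backed by a plain byte array; the real de.anomic.tools.bitfield encodes its bits differently, and the length of 4 is assumed.

// Sketch of a fixed-length flag field: a small bit set backed by a byte array of the
// given length. Illustrative only; not the real de.anomic.tools.bitfield encoding.
public class FlagFieldSketch {
    private final byte[] bits;

    FlagFieldSketch(int length) { this.bits = new byte[length]; }

    void set(int pos, boolean value) {
        int idx = pos / 8;
        if (value) bits[idx] |= (1 << (pos % 8)); else bits[idx] &= ~(1 << (pos % 8));
    }

    boolean get(int pos) { return (bits[pos / 8] & (1 << (pos % 8))) != 0; }

    byte[] getBytes() { return bits; }

    public static void main(String[] args) {
        FlagFieldSketch flags = new FlagFieldSketch(4); // urlFlagLength assumed to be 4
        flags.set(3, true);
        System.out.println(flags.get(3) + " " + flags.get(2)); // true false
    }
}
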
@ -5,7 +5,7 @@ import java.io.IOException;
import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexAssortment;
@ -63,7 +63,7 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
// initializing the import assortment db
this.log.logInfo("Initializing source assortment file");
try {
this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexURLEntry.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log);
this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexRWIEntryOld.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);

@ -7,10 +7,10 @@ import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate;
@ -134,13 +134,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// loop through the entities of the container and get the
// urlhash
Iterator importWordIdxEntries = newContainer.entries();
indexEntry importWordIdxEntry;
indexRWIEntry importWordIdxEntry;
while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted
if (isAborted()) break;
// getting next word index entry
importWordIdxEntry = (indexEntry) importWordIdxEntries.next();
importWordIdxEntry = (indexRWIEntry) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.urlHash();
entityUrls.add(urlHash);
}
@ -162,7 +162,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url
// getting the url entry
plasmaCrawlLURLEntry urlEntry = this.importUrlDB.load(urlHash, null);
indexURLEntry urlEntry = this.importUrlDB.load(urlHash, null);
if (urlEntry != null) {
/* write it into the home url db */

@ -48,10 +48,10 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlBalancer {
@ -59,7 +59,7 @@ public class plasmaCrawlBalancer {
private HashMap domainStacks;
public plasmaCrawlBalancer(File stackFile) {
stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength));
stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength));
domainStacks = new HashMap();
}

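plasmaCrawlBalancer above keeps a persistent stack of 12-byte url hashes plus an in-memory map of per-domain stacks. The sketch below shows one plausible way such per-domain stacks can be served round-robin so a single host does not dominate the crawl; it is an in-memory illustration, not the actual balancing logic, and the host/hash names are made up.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;

// Illustrative only: a domain-aware stack that pops hashes round-robin per host,
// roughly what a balancer with per-domain sub-stacks is meant to achieve.
// The real plasmaCrawlBalancer persists its stack with kelondroStack.
public class DomainBalancerSketch {
    private final Map<String, Deque<String>> domainStacks = new LinkedHashMap<>();

    public void push(String host, String urlHash) {
        domainStacks.computeIfAbsent(host, h -> new ArrayDeque<>()).push(urlHash);
    }

    public String pop() {
        Iterator<Map.Entry<String, Deque<String>>> it = domainStacks.entrySet().iterator();
        if (!it.hasNext()) return null;
        Map.Entry<String, Deque<String>> next = it.next();
        String hash = next.getValue().pop();
        it.remove();                                                       // move this host to the back...
        if (!next.getValue().isEmpty()) domainStacks.put(next.getKey(), next.getValue());
        return hash;                                                       // ...so other hosts get served first
    }

    public static void main(String[] args) {
        DomainBalancerSketch b = new DomainBalancerSketch();
        b.push("host-a", "hashA1"); b.push("host-a", "hashA2"); b.push("host-b", "hashB1");
        System.out.println(b.pop() + " " + b.pop() + " " + b.pop()); // hashA2 hashB1 hashA1
    }
}
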
@ -54,12 +54,14 @@ import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlEURL extends indexURL {
@ -134,17 +136,17 @@ public class plasmaCrawlEURL extends indexURL {
public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
super();
kelondroRow rowdef = new kelondroRow(
"String urlhash-" + urlHashLength + ", " + // the url's hash
"String refhash-" + urlHashLength + ", " + // the url's referrer hash
"String initiator-" + urlHashLength + ", " + // the crawling initiator
"String executor-" + urlHashLength + ", " + // the crawling executor
"String urlstring-" + urlStringLength + ", " + // the url as string
"String urlname-" + urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-" + urlDateLength + " {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-" + urlRetryLength + " {b64e}, " + // number of load retries
"String failcause-" + urlErrorLength + ", " + // string describing load failure
"byte[] flags-" + urlFlagLength); // extra space
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-" + indexRWIEntryOld.urlRetryLength + " {b64e}, " + // number of load retries
"String failcause-" + indexRWIEntryOld.urlErrorLength + ", " + // string describing load failure
"byte[] flags-" + indexRWIEntryOld.urlFlagLength); // extra space
if (newdb) {
String newCacheName = "urlErr3.table";
@ -164,9 +166,9 @@ public class plasmaCrawlEURL extends indexURL {
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
String name, String failreason, bitfield flags) {
if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash;
if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash;
if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash;
if ((referrer == null) || (referrer.length() < yacySeedDB.commonHashLength)) referrer = dummyHash;
if ((initiator == null) || (initiator.length() < yacySeedDB.commonHashLength)) initiator = dummyHash;
if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = dummyHash;
if (failreason == null) failreason = "unknown";
return new Entry(url, referrer, initiator, executor, name, failreason, flags);
}
@ -289,8 +291,8 @@ public class plasmaCrawlEURL extends indexURL {
// stores the values from the object variables into the database
if (this.stored) return;
if (this.hash == null) return;
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength);
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength);
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
// store the hash in the hash cache
try {
@ -304,7 +306,7 @@ public class plasmaCrawlEURL extends indexURL {
this.name.getBytes(),
initdatestr.getBytes(),
trydatestr.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, urlRetryLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, indexRWIEntryOld.urlRetryLength).getBytes(),
this.failreason.getBytes(),
this.flags.getBytes()
};

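newEntry above replaces any missing or too-short referrer/initiator/executor hash with dummyHash so every column keeps its fixed 12-byte width (yacySeedDB.commonHashLength). A tiny sketch of that guard; the dash-filled placeholder value is an assumption for illustration.

// Sketch of the hash guard used by newEntry above. The 12-character length mirrors
// yacySeedDB.commonHashLength; the all-dash placeholder is assumed for illustration.
public class HashGuardSketch {
    static final int COMMON_HASH_LENGTH = 12;
    static final String DUMMY_HASH = "------------";

    static String normalizeHash(String hash) {
        return (hash == null || hash.length() < COMMON_HASH_LENGTH) ? DUMMY_HASH : hash;
    }

    public static void main(String[] args) {
        System.out.println(normalizeHash(null));            // ------------
        System.out.println(normalizeHash("abc"));           // ------------
        System.out.println(normalizeHash("AAAAAAAAAAAA"));  // AAAAAAAAAAAA
    }
}
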
@ -55,17 +55,18 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroBase64Order;
@ -74,12 +75,9 @@ import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
public final class plasmaCrawlLURL extends indexURL {
@ -101,11 +99,11 @@ public final class plasmaCrawlLURL extends indexURL {
try {
if (newdb) {
urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, plasmaCrawlLURLNewEntry.rowdef, kelondroBase64Order.enhancedCoder);
urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder);
} else {
File oldLURLDB = new File(plasmaPath, "urlHash.db");
oldLURLDB.getParentFile().mkdirs();
urlIndexFile = new kelondroCache(new kelondroTree(oldLURLDB, bufferkb / 2 * 0x400, preloadTime, plasmaCrawlLURLOldEntry.rowdef), bufferkb / 2 * 0x400, true, false);
urlIndexFile = new kelondroCache(new kelondroTree(oldLURLDB, bufferkb / 2 * 0x400, preloadTime, indexURLEntryOld.rowdef), bufferkb / 2 * 0x400, true, false);
}
} catch (IOException e) {
e.printStackTrace();
@ -121,7 +119,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack = new LinkedList();
}
public synchronized void stack(plasmaCrawlLURLEntry e, String initiatorHash, String executorHash, int stackType) {
public synchronized void stack(indexURLEntry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; }
try {
if (initiatorHash == null) { initiatorHash = dummyHash; }
@ -159,7 +157,7 @@ public final class plasmaCrawlLURL extends indexURL {
return 0;
}
public synchronized plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) {
public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord) {
// generates a plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -171,17 +169,17 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
if (newdb)
return new plasmaCrawlLURLNewEntry(entry, searchedWord);
return new indexURLEntryNew(entry, searchedWord);
else
return new plasmaCrawlLURLOldEntry(entry, searchedWord);
return new indexURLEntryOld(entry, searchedWord);
} catch (IOException e) {
return null;
}
}
public synchronized void store(plasmaCrawlLURLEntry entry) throws IOException {
public synchronized void store(indexURLEntry entry) throws IOException {
// Check if there is a more recent Entry already in the DB
plasmaCrawlLURLEntry oldEntry;
indexURLEntry oldEntry;
try {
if (exists(entry.hash())) {
oldEntry = load(entry.hash(), null);
@ -202,18 +200,18 @@ public final class plasmaCrawlLURL extends indexURL {
urlIndexFile.put(entry.toRowEntry(), entry.loaddate());
}
public synchronized plasmaCrawlLURLEntry newEntry(String propStr) {
public synchronized indexURLEntry newEntry(String propStr) {
if (propStr.startsWith("{") && propStr.endsWith("}")) {
if (newdb)
return new plasmaCrawlLURLNewEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
return new indexURLEntryNew(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
else
return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
return new indexURLEntryOld(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
} else {
return null;
}
}
public synchronized plasmaCrawlLURLEntry newEntry(
public synchronized indexURLEntry newEntry(
URL url,
String descr,
String author,
@ -236,10 +234,10 @@ public final class plasmaCrawlLURL extends indexURL {
int lvideo,
int lapp) {
if (newdb)
return new plasmaCrawlLURLNewEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
return new indexURLEntryNew(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp);
else
return new plasmaCrawlLURLOldEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
return new indexURLEntryOld(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp);
}
@ -257,36 +255,36 @@ public final class plasmaCrawlLURL extends indexURL {
public synchronized String getUrlHash(int stack, int pos) {
switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(0, urlHashLength);
case 2: return ((String) searchResultStack.get(pos)).substring(0, urlHashLength);
case 3: return ((String) transfResultStack.get(pos)).substring(0, urlHashLength);
case 4: return ((String) proxyResultStack.get(pos)).substring(0, urlHashLength);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(0, urlHashLength);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(0, urlHashLength);
case 1: return ((String) externResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 2: return ((String) searchResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 3: return ((String) transfResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 4: return ((String) proxyResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
}
return null;
}
public synchronized String getInitiatorHash(int stack, int pos) {
switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2);
case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2);
case 3: return ((String) transfResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2);
case 4: return ((String) proxyResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2);
case 1: return ((String) externResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 2: return ((String) searchResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 3: return ((String) transfResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 4: return ((String) proxyResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
}
return null;
}
public synchronized String getExecutorHash(int stack, int pos) {
switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3);
case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3);
case 3: return ((String) transfResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3);
case 4: return ((String) proxyResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3);
case 1: return ((String) externResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 2: return ((String) searchResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 3: return ((String) transfResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 4: return ((String) proxyResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
}
return null;
}
@ -341,88 +339,10 @@ public final class plasmaCrawlLURL extends indexURL {
return false;
}
}
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(Date date) {
if (date == null) {
return "";
} else {
return dayFormatter.format(date);
}
}
public serverObjects genTableProps(int tabletype, int lines, boolean showInit, boolean showExec, String dfltInit, String dfltExec, String feedbackpage, boolean makeLink) {
/* serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps tabletype=" + tabletype + " lines=" + lines +
" showInit=" + showInit + " showExec=" + showExec +
" dfltInit=" + dfltInit + " dfltExec=" + dfltExec +
" feedbackpage=" + feedbackpage + " makeLink=" + makeLink); */
final serverObjects prop = new serverObjects();
if (getStackSize(tabletype) == 0) {
prop.put("table", 0);
return prop;
}
prop.put("table", 1);
if (lines > getStackSize(tabletype)) lines = getStackSize(tabletype);
if (lines == getStackSize(tabletype)) {
prop.put("table_size", 0);
} else {
prop.put("table_size", 1);
prop.put("table_size_count", lines);
}
prop.put("table_size_all", getStackSize(tabletype));
prop.put("table_feedbackpage", feedbackpage);
prop.put("table_tabletype", tabletype);
prop.put("table_showInit", (showInit) ? 1 : 0);
prop.put("table_showExec", (showExec) ? 1 : 0);
boolean dark = true;
String urlHash, initiatorHash, executorHash;
String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
plasmaCrawlLURLEntry urle;
// needed for getCachePath(url)
final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard();
final plasmaHTCache cacheManager = switchboard.getCacheManager();
int i, cnt = 0;
for (i = getStackSize(tabletype) - 1; i >= (getStackSize(tabletype) - lines); i--) {
initiatorHash = getInitiatorHash(tabletype, i);
executorHash = getExecutorHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
urlHash = getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = load(urlHash, null);
plasmaCrawlLURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
urlstr = comp.url().toNormalform();
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage);
prop.put("table_indexed_" + cnt + "_tabletype", tabletype);
prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
prop.put("table_indexed_" + cnt + "_showInit", (showInit) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_showInit_initiatorSeed", (initiatorSeed == null) ? dfltInit : initiatorSeed.getName());
prop.put("table_indexed_" + cnt + "_showExec", (showExec) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName());
prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate()));
prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount());
prop.put("table_indexed_" + cnt + "_urldescr", comp.descr());
prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : ((makeLink) ? ("<a href=\"CacheAdmin_p.html?action=info&path=" + cachepath + "\" class=\"small\" title=\"" + urlstr + "\">" + urltxt + "</a>") : urlstr));
dark = !dark;
cnt++;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "genTableProps", e);
}
}
prop.put("table_indexed", cnt);
return prop;
public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, rotating, firstHash);
}
public class kiter implements Iterator {
@ -445,9 +365,9 @@ public final class plasmaCrawlLURL extends indexURL {
if (e == null) return null;
try {
if (newdb)
return new plasmaCrawlLURLNewEntry(e, null);
return new indexURLEntryNew(e, null);
else
return new plasmaCrawlLURLOldEntry(e, null);
return new indexURLEntryOld(e, null);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
}
@ -459,11 +379,6 @@ public final class plasmaCrawlLURL extends indexURL {
}
public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, rotating, firstHash);
}
/**
* Uses an Iteration over urlHash.db to detect malformed URL-Entries.
* Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
@ -578,8 +493,8 @@ public final class plasmaCrawlLURL extends indexURL {
}
}
plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp();
indexURLEntry entry = (indexURLEntry) eiter.next();
indexURLEntry.Components comp = entry.comp();
totalSearchedUrls++;
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) ||
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) {
@ -650,7 +565,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString());
System.out.println(((indexURLEntry) enu.next()).toString());
}
} catch (Exception e) {
e.printStackTrace();

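The getUrlHash / getInitiatorHash / getExecutorHash accessors above imply the result-stack record layout: three fixed-width hashes concatenated into one string and sliced back out at multiples of yacySeedDB.commonHashLength. A sketch of that layout, assuming a hash length of 12:

// Sketch of the result-stack record layout read by the accessors above: url hash,
// initiator hash and executor hash concatenated without separators.
public class ResultStackRecordSketch {
    static final int HASH_LEN = 12; // yacySeedDB.commonHashLength is assumed to be 12

    static String compose(String urlHash, String initiatorHash, String executorHash) {
        return urlHash + initiatorHash + executorHash;
    }

    static String urlHash(String record)       { return record.substring(0, HASH_LEN); }
    static String initiatorHash(String record) { return record.substring(HASH_LEN, HASH_LEN * 2); }
    static String executorHash(String record)  { return record.substring(HASH_LEN * 2, HASH_LEN * 3); }

    public static void main(String[] args) {
        String record = compose("uuuuuuuuuuuu", "iiiiiiiiiiii", "eeeeeeeeeeee");
        System.out.println(urlHash(record) + " / " + initiatorHash(record) + " / " + executorHash(record));
    }
}
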
@ -51,6 +51,7 @@ import java.util.HashSet;
import java.util.Iterator;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
@ -62,6 +63,7 @@ import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlNURL extends indexURL {
@ -78,18 +80,18 @@ public class plasmaCrawlNURL extends indexURL {
* column length definition for the {@link plasmaURL#urlIndexFile} DB
*/
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + urlHashLength + ", " + // the url's hash
"String initiator-" + urlHashLength + ", " + // the crawling initiator
"String urlstring-" + urlStringLength + ", " + // the url as string
"String refhash-" + urlHashLength + ", " + // the url's referrer hash
"String urlname-" + urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"String profile-" + urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"Cardinal depth-" + urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-" + urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-" + urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-" + urlFlagLength + ", " + // flags
"String handle-" + urlHandleLength); // extra handle
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-" + indexRWIEntryOld.urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-" + indexRWIEntryOld.urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags
"String handle-" + indexRWIEntryOld.urlHandleLength); // extra handle
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
@ -128,7 +130,7 @@ public class plasmaCrawlNURL extends indexURL {
limitStack = new plasmaCrawlBalancer(limitStackFile);
overhangStack = new plasmaCrawlBalancer(overhangStackFile);
remoteStack = new plasmaCrawlBalancer(remoteStackFile);
kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength);
kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength);
imageStack = kelondroStack.open(imageStackFile, rowdef);
movieStack = kelondroStack.open(movieStackFile, rowdef);
musicStack = kelondroStack.open(musicStackFile, rowdef);
@ -257,7 +259,7 @@ public class plasmaCrawlNURL extends indexURL {
private static String normalizeHandle(int h) {
String d = Integer.toHexString(h);
while (d.length() < urlHandleLength) d = "0" + d;
while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d;
return d;
}
@ -479,7 +481,7 @@ public class plasmaCrawlNURL extends indexURL {
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new bitfield(urlFlagLength);
this.flags = new bitfield(indexRWIEntryOld.urlFlagLength);
this.handle = 0;
this.stored = false;
}
@ -533,7 +535,7 @@ public class plasmaCrawlNURL extends indexURL {
public void store() {
// stores the values from the object variables into the database
if (this.stored) return;
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
// store the hash in the hash cache
try {
// even if the entry exists, we simply overwrite it
@ -545,9 +547,9 @@ public class plasmaCrawlNURL extends indexURL {
this.name.getBytes("UTF-8"),
loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, urlParentBranchesLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, urlForkFactorLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(),
this.flags.getBytes(),
normalizeHandle(this.handle).getBytes()
};

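normalizeHandle above renders an int handle as hex and left-pads it with '0' to the fixed column width. The same logic as a standalone sketch, with a width of 4 assumed for urlHandleLength:

// Sketch of normalizeHandle: hex rendering padded to a fixed column width.
public class HandleSketch {
    static String normalizeHandle(int h, int width) {
        String d = Integer.toHexString(h);
        while (d.length() < width) d = "0" + d;
        return d;
    }

    public static void main(String[] args) {
        System.out.println(normalizeHandle(0, 4));   // 0000
        System.out.println(normalizeHandle(255, 4)); // 00ff
    }
}
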
@ -48,7 +48,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException;
@ -68,7 +68,7 @@ public class plasmaCrawlProfile {
this.bufferkb = bufferkb;
this.preloadTime = preloadTime;
profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexURL.urlCrawlProfileHandleLength, 2000, '#');
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#');
profileTable = new kelondroMap(dyn);
domsCache = new HashMap();
}
@ -94,7 +94,7 @@ public class plasmaCrawlProfile {
if (profileTable != null) try { profileTable.close(); } catch (IOException e) {}
if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database");
profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexURL.urlCrawlProfileHandleLength, 2000, '#');
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#');
profileTable = new kelondroMap(dyn);
}
@ -256,7 +256,7 @@ public class plasmaCrawlProfile {
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexURL.urlCrawlProfileHandleLength);
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexRWIEntryOld.urlCrawlProfileHandleLength);
mem = new HashMap();
mem.put("handle", handle);
mem.put("name", name);

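The crawl-profile handle above is minted by hashing the current time with MD5, base64-encoding the digest and keeping the first urlCrawlProfileHandleLength characters. A rough sketch using the JDK's MessageDigest and Base64 in place of serverCodings and kelondroBase64Order, with a handle length of 4 assumed:

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Base64;

// Sketch of handle creation: MD5 over the current time, base64-encoded, truncated.
// java.util.Base64 stands in for kelondroBase64Order here; the alphabets differ.
public class ProfileHandleSketch {
    static String newHandle(int handleLength) throws NoSuchAlgorithmException {
        byte[] md5 = MessageDigest.getInstance("MD5")
                .digest(Long.toString(System.currentTimeMillis()).getBytes(StandardCharsets.UTF_8));
        return Base64.getEncoder().encodeToString(md5).substring(0, handleLength);
    }

    public static void main(String[] args) throws NoSuchAlgorithmException {
        System.out.println(newHandle(4)); // e.g. "q3Zt" -- different on every run
    }
}
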
@ -60,6 +60,8 @@ import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.data.robotsParser;
import de.anomic.http.httpc;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
@ -391,7 +393,7 @@ public final class plasmaCrawlStacker {
checkInterruption();
String nexturlhash = indexURL.urlHash(nexturl);
String dbocc = this.sb.urlPool.exists(nexturlhash);
plasmaCrawlLURLEntry oldEntry = null;
indexURLEntry oldEntry = null;
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {
@ -490,7 +492,7 @@ public final class plasmaCrawlStacker {
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new bitfield(indexURL.urlFlagLength);
this.flags = new bitfield(indexRWIEntryOld.urlFlagLength);
this.handle = 0;
} catch (Exception e) {
e.printStackTrace();
@ -573,7 +575,7 @@ public final class plasmaCrawlStacker {
public byte[][] getBytes() {
// stores the values from the object variables into the database
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength);
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
// store the hash in the hash cache
// even if the entry exists, we simply overwrite it
@ -587,9 +589,9 @@ public final class plasmaCrawlStacker {
this.name.getBytes("UTF-8"),
loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexURL.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexURL.urlParentBranchesLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexURL.urlForkFactorLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(),
this.flags.getBytes(),
normalizeHandle(this.handle).getBytes()
};
@ -599,7 +601,7 @@ public final class plasmaCrawlStacker {
private String normalizeHandle(int h) {
String d = Integer.toHexString(h);
while (d.length() < indexURL.urlHandleLength) d = "0" + d;
while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d;
return d;
}
}
@ -1057,7 +1059,7 @@ public final class plasmaCrawlStacker {
yacyCore.seedDB.mySeed.hash,
this.theMsg.name,
rejectReason,
new bitfield(indexURL.urlFlagLength)
new bitfield(indexRWIEntryOld.urlFlagLength)
);
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);

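The stacker above re-queues an already known URL only when its last load date is older, in minutes, than the profile's recrawlIfOlder limit. A simplified sketch of that test; the names are stand-ins, not the original API:

import java.util.Date;

// Sketch of the recrawl test: compare the age of the last load, in minutes,
// against the profile's recrawlIfOlder threshold.
public class RecrawlCheckSketch {
    static boolean shouldRecrawl(Date lastLoad, long recrawlIfOlderMinutes) {
        if (lastLoad == null) return true; // never seen before: crawl it
        long ageMinutes = (System.currentTimeMillis() - lastLoad.getTime()) / 60000L;
        return ageMinutes > recrawlIfOlderMinutes;
    }

    public static void main(String[] args) {
        Date twoDaysAgo = new Date(System.currentTimeMillis() - 2L * 24 * 60 * 60 * 1000);
        System.out.println(shouldRecrawl(twoDaysAgo, 60 * 24)); // true: older than one day
        System.out.println(shouldRecrawl(new Date(), 60 * 24)); // false: just loaded
    }
}
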
@ -48,7 +48,8 @@ import java.util.HashSet;
import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.serverCodings;
@ -200,8 +201,8 @@ public class plasmaDHTChunk {
Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, resourceLevel, true, maxcount).iterator();
indexContainer container;
Iterator urlIter;
indexEntry iEntry;
plasmaCrawlLURLEntry lurl;
indexRWIEntry iEntry;
indexURLEntry lurl;
int refcount = 0;
int wholesize;
@ -227,7 +228,7 @@ public class plasmaDHTChunk {
urlIter = container.entries();
// iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) {
iEntry = (indexEntry) urlIter.next();
iEntry = (indexRWIEntry) urlIter.next();
lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.comp().url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
@ -243,7 +244,7 @@ public class plasmaDHTChunk {
// remove all remaining; we have enough
while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next();
iEntry = (indexRWIEntry) urlIter.next();
urlIter.remove();
}
@ -285,7 +286,7 @@ public class plasmaDHTChunk {
public synchronized String deleteTransferIndexes() {
Iterator urlIter;
indexEntry iEntry;
indexRWIEntry iEntry;
HashSet urlHashes;
String count = "0";
@ -299,7 +300,7 @@ public class plasmaDHTChunk {
urlHashes = new HashSet(this.indexContainers[i].size());
urlIter = this.indexContainers[i].entries();
while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next();
iEntry = (indexRWIEntry) urlIter.next();
urlHashes.add(iEntry.urlHash());
}
String wordHash = indexContainers[i].getWordHash();

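Filling a DHT chunk above follows a bounded-selection pattern: take url references until either a count limit or a deadline is reached, then drop whatever remains on the iterator so the container only carries what will actually be transferred. A generic sketch of that pattern:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// Sketch of the bounded selection used when filling a DHT chunk: stop on a count
// limit or a deadline, then prune the unsent tail from the source.
public class BoundedSelectSketch {
    static <T> List<T> select(Iterator<T> it, int maxCount, long deadlineMillis) {
        List<T> selected = new ArrayList<>();
        while (it.hasNext() && selected.size() < maxCount && System.currentTimeMillis() < deadlineMillis) {
            selected.add(it.next());
        }
        while (it.hasNext()) { it.next(); it.remove(); } // prune what we will not send
        return selected;
    }

    public static void main(String[] args) {
        List<String> hashes = new ArrayList<>(List.of("a", "b", "c", "d"));
        System.out.println(select(hashes.iterator(), 2, System.currentTimeMillis() + 1000)); // [a, b]
        System.out.println(hashes); // [a, b] -- the unsent tail was removed
    }
}
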
@ -90,6 +90,7 @@ import de.anomic.server.serverThread;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.enumerateFiles;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
public final class plasmaHTCache {
@ -173,7 +174,7 @@ public final class plasmaHTCache {
// open the response header database
File dbfile = new File(this.cachePath, "responseHeader.db");
try {
this.responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, preloadTime, indexURL.urlHashLength, 150, '#'));
this.responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, preloadTime, yacySeedDB.commonHashLength, 150, '#'));
} catch (IOException e) {
this.log.logSevere("the request header database could not be opened: " + e.getMessage());
System.exit(0);
@ -717,7 +718,7 @@ public final class plasmaHTCache {
if (hexHash.indexOf('.') >= 0) return null;
try {
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.decodeHex(hexHash));
if (hash.length() == indexURL.urlHashLength) return hash;
if (hash.length() == yacySeedDB.commonHashLength) return hash;
return null;
} catch (Exception e) {
//log.logWarning("getHash: " + e.getMessage(), e);

@ -51,7 +51,8 @@ import java.util.Set;
import java.util.TreeMap;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.logging.serverLog;
@ -379,8 +380,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
//if (searchResult.size() == 0) return acc; // case that we have nothing to do
indexEntry entry;
plasmaCrawlLURLEntry page;
indexRWIEntry entry;
indexURLEntry page;
Long preranking;
Object[] preorderEntry;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
@ -388,7 +389,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
while (preorder.hasNext()) {
if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
preorderEntry = preorder.next();
entry = (indexEntry) preorderEntry[0];
entry = (indexRWIEntry) preorderEntry[0];
// load only urls if there was not yet a root url of that hash
preranking = (Long) preorderEntry[1];
// find the url entry
@ -425,11 +426,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
preorder.remove(true, true);
// start url-fetch
indexEntry entry;
indexRWIEntry entry;
try {
while (preorder.hasNext()) {
if (System.currentTimeMillis() >= timeout) break;
entry = (indexEntry) (preorder.next()[0]);
entry = (indexRWIEntry) (preorder.next()[0]);
// find and fetch the url entry
urlStore.load(entry.urlHash(), entry);
}

@ -48,6 +48,7 @@ import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverDate;
@ -101,7 +102,7 @@ public final class plasmaSearchImages {
public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) {
long start = System.currentTimeMillis();
this.images = new TreeSet();
plasmaCrawlLURLEntry urlentry;
indexURLEntry urlentry;
while (sres.hasMoreElements()) {
urlentry = sres.nextElement();
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.comp().url(), depth));

@ -50,7 +50,7 @@ import java.util.Map;
import java.util.TreeMap;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.server.serverCodings;
@ -61,7 +61,7 @@ public final class plasmaSearchPreOrder {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true;
private indexEntry entryMin, entryMax;
private indexRWIEntry entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
@ -79,7 +79,7 @@ public final class plasmaSearchPreOrder {
this.ranking = ranking;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
indexEntry iEntry;
indexRWIEntry iEntry;
// first pass: find min/max to obtain limits for normalization
Iterator i = container.entries();
@ -88,9 +88,9 @@ public final class plasmaSearchPreOrder {
this.entryMax = null;
while (i.hasNext()) {
if (System.currentTimeMillis() > limitTime) break;
iEntry = (indexEntry) i.next();
if (this.entryMin == null) this.entryMin = (indexEntry) iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = (indexEntry) iEntry.clone(); else this.entryMax.max(iEntry);
iEntry = (indexRWIEntry) i.next();
if (this.entryMin == null) this.entryMin = (indexRWIEntry) iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = (indexRWIEntry) iEntry.clone(); else this.entryMax.max(iEntry);
count++;
}
@ -98,7 +98,7 @@ public final class plasmaSearchPreOrder {
i = container.entries();
this.pageAcc = new TreeMap();
for (int j = 0; j < count; j++) {
iEntry = (indexEntry) i.next();
iEntry = (indexRWIEntry) i.next();
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
}
}
@ -110,13 +110,13 @@ public final class plasmaSearchPreOrder {
HashSet doubleDoms = new HashSet();
Iterator i = pageAcc.entrySet().iterator();
Map.Entry entry;
indexEntry iEntry;
indexRWIEntry iEntry;
String hashpart;
boolean isWordRootURL;
while (i.hasNext()) {
if (pageAcc.size() <= query.wantedResults) break;
entry = (Map.Entry) i.next();
iEntry = (indexEntry) entry.getValue();
iEntry = (indexRWIEntry) entry.getValue();
hashpart = iEntry.urlHash().substring(6);
isWordRootURL = indexURL.isWordRootURL(iEntry.urlHash(), query.words(""));
if ((!(isWordRootURL)) &&
@ -192,11 +192,11 @@ public final class plasmaSearchPreOrder {
e.printStackTrace();
preranking = new Long(0);
}
return new Object[]{(indexEntry) pageAcc.remove(top), preranking};
return new Object[]{(indexRWIEntry) pageAcc.remove(top), preranking};
}
public indexEntry[] getNormalizer() {
return new indexEntry[] {entryMin, entryMax};
public indexRWIEntry[] getNormalizer() {
return new indexRWIEntry[] {entryMin, entryMax};
}
public static int ybr_p(String urlHash) {

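plasmaSearchPreOrder above sorts by storing each entry in a TreeMap under the key encodeHex(Long.MAX_VALUE - preRanking, 16) + urlHash, which turns a descending ranking into an ascending string order with the url hash as tie-breaker. A small sketch of that key trick:

import java.util.Map;
import java.util.TreeMap;

// Sketch of the pre-order key: fixed-width hex of (Long.MAX_VALUE - ranking) plus the
// url hash, so a TreeMap yields the best-ranked entries first.
public class PreOrderKeySketch {
    static String key(long ranking, String urlHash) {
        return String.format("%016x", Long.MAX_VALUE - ranking) + urlHash;
    }

    public static void main(String[] args) {
        TreeMap<String, String> pageAcc = new TreeMap<>();
        pageAcc.put(key(10, "hashA"), "low-ranked");
        pageAcc.put(key(500, "hashB"), "high-ranked");
        for (Map.Entry<String, String> e : pageAcc.entrySet()) System.out.println(e.getValue());
        // prints "high-ranked" first, then "low-ranked"
    }
}
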
@ -51,6 +51,7 @@ import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.serverCharBuffer;
import de.anomic.yacy.yacySeedDB;
public final class plasmaSearchQuery {
@ -120,16 +121,16 @@ public final class plasmaSearchQuery {
public static Set hashes2Set(String query) {
if (query == null) return new HashSet();
final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength);
for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) {
keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength));
final HashSet keyhashes = new HashSet(query.length() / yacySeedDB.commonHashLength);
for (int i = 0; i < (query.length() / yacySeedDB.commonHashLength); i++) {
keyhashes.add(query.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength));
}
return keyhashes;
}
public static String hashSet2hashString(Set words) {
Iterator i = words.iterator();
StringBuffer sb = new StringBuffer(words.size() * indexEntryAttribute.wordHashLength);
StringBuffer sb = new StringBuffer(words.size() * yacySeedDB.commonHashLength);
while (i.hasNext()) sb.append((String) i.next());
return new String(sb);
}

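hashes2Set and hashSet2hashString above rely on word hashes having a fixed width, so a query can be a plain concatenation of hashes with no separators. A sketch of that packing, assuming yacySeedDB.commonHashLength is 12:

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

// Sketch of the fixed-width hash packing: cut a concatenated query string into
// 12-character hashes and glue a set of hashes back together.
public class HashPackingSketch {
    static final int HASH_LEN = 12;

    static Set<String> hashes2Set(String query) {
        Set<String> hashes = new HashSet<>();
        if (query == null) return hashes;
        for (int i = 0; i + HASH_LEN <= query.length(); i += HASH_LEN) {
            hashes.add(query.substring(i, i + HASH_LEN));
        }
        return hashes;
    }

    static String set2String(Set<String> hashes) {
        StringBuilder sb = new StringBuilder(hashes.size() * HASH_LEN);
        for (Iterator<String> it = hashes.iterator(); it.hasNext();) sb.append(it.next());
        return sb.toString();
    }

    public static void main(String[] args) {
        String packed = "AAAAAAAAAAAABBBBBBBBBBBB";
        System.out.println(hashes2Set(packed));                       // two 12-character hashes (set order may vary)
        System.out.println(set2String(hashes2Set(packed)).length());  // 24
    }
}
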
@ -46,8 +46,9 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
public class plasmaSearchRankingProfile {
@ -164,7 +165,7 @@ public class plasmaSearchRankingProfile {
return new String(ext);
}
public long preRanking(indexEntry normalizedEntry, String searchedWord) {
public long preRanking(indexRWIEntry normalizedEntry, String searchedWord) {
// the normalizedEntry must be a normalized indexEntry
long ranking = 0;
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
@ -191,13 +192,13 @@ public class plasmaSearchRankingProfile {
Set topwords,
String[] urlcomps,
String[] descrcomps,
plasmaCrawlLURLEntry page) {
indexURLEntry page) {
// apply pre-calculated order attributes
long ranking = preranking;
// prefer hit with 'prefer' pattern
plasmaCrawlLURLEntry.Components comp = page.comp();
indexURLEntry.Components comp = page.comp();
if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();

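preRanking above weights each attribute by shifting it left by a per-attribute coefficient taken from the profile's coeff map, so the coefficients act as power-of-two weights. An illustrative sketch with made-up attribute names and values:

import java.util.HashMap;
import java.util.Map;

// Sketch of the coefficient scheme: each attribute is shifted by its coefficient and
// the shifted values are summed. Names and numbers are invented for illustration.
public class RankingCoeffSketch {
    static long preRanking(Map<String, Integer> coeff, long quality, long hitCount) {
        long ranking = 0;
        ranking += quality << coeff.get("entropy");
        ranking += hitCount << coeff.get("hitcount");
        return ranking;
    }

    public static void main(String[] args) {
        Map<String, Integer> coeff = new HashMap<>();
        coeff.put("entropy", 4);
        coeff.put("hitcount", 2);
        System.out.println(preRanking(coeff, 3, 5)); // 3*16 + 5*4 = 68
    }
}
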
@ -54,6 +54,7 @@ import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.net.URL;
import de.anomic.server.serverCodings;
@ -99,16 +100,16 @@ public final class plasmaSearchResult {
return pageAcc.size() > 0;
}
public plasmaCrawlLURLEntry nextElement() {
public indexURLEntry nextElement() {
Object top = pageAcc.firstKey();
//System.out.println("postorder-key: " + ((String) top));
return (plasmaCrawlLURLEntry) pageAcc.remove(top);
return (indexURLEntry) pageAcc.remove(top);
}
protected void addResult(plasmaCrawlLURLEntry page, Long preranking) {
protected void addResult(indexURLEntry page, Long preranking) {
// take out relevant information for reference computation
plasmaCrawlLURLEntry.Components comp = page.comp();
indexURLEntry.Components comp = page.comp();
if ((comp.url() == null) || (comp.descr() == null)) return;
String[] urlcomps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()); // word components of the url
String[] descrcomps = comp.descr().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
@ -131,12 +132,12 @@ public final class plasmaSearchResult {
for (int i = 0; i < references.length; i++) commonSense.add(references[i]);
Object[] resultVector;
plasmaCrawlLURLEntry page;
indexURLEntry page;
long ranking;
for (int i = 0; i < results.size(); i++) {
// take out values from result array
resultVector = (Object[]) results.get(i);
page = (plasmaCrawlLURLEntry) resultVector[0];
page = (indexURLEntry) resultVector[0];
// calculate ranking
if (postsort)
@ -172,7 +173,7 @@ public final class plasmaSearchResult {
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url());
path = urlPath(((indexURLEntry) entry.getValue()).comp().url());
paths.put(path, entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
@ -183,7 +184,7 @@ public final class plasmaSearchResult {
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url());
path = urlPath(((indexURLEntry) entry.getValue()).comp().url());
shorten = shortenPath(path);
// scan all subpaths of the url
while (shorten != null) {

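The loop above walks every result URL's path and repeatedly shortens it so sub-pages can be grouped under an already listed parent path. The shortenPath below is a guess at that behaviour, cutting one trailing segment per step until nothing is left; it is not the original implementation.

// Sketch only: cut the last path segment off a trailing-slash path, returning null
// when the root is reached, so callers can scan all parent paths of a result URL.
public class PathShortenSketch {
    static String shortenPath(String path) {
        int cut = path.lastIndexOf('/', path.length() - 2);
        return (cut <= 0) ? null : path.substring(0, cut + 1);
    }

    public static void main(String[] args) {
        String p = "/docs/manual/install/";
        while (p != null) {
            System.out.println(p); // /docs/manual/install/, /docs/manual/, /docs/
            p = shortenPath(p);
        }
    }
}
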
@ -58,6 +58,7 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
@ -630,12 +631,12 @@ public class plasmaSnippetCache {
public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
// fetch snippets
int i = 0;
plasmaCrawlLURLEntry urlentry;
indexURLEntry urlentry;
String urlstring;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
urlentry = acc.nextElement();
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
indexURLEntry.Components comp = urlentry.comp();
if (comp.url().getHost().endsWith(".yacyh")) continue;
urlstring = comp.url().toNormalform();
if ((urlstring.matches(urlmask)) &&

@ -132,9 +132,10 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
@ -1429,14 +1430,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException {
plasmaParserDocument document = null;
// the mimetype of this entry
String mimeType = entry.getMimeType();
String charset = entry.getCharacterEncoding();
// the parser logger
serverLog parserLogger = parser.getLogger();
//serverLog parserLogger = parser.getLogger();
// parse the document
return parseResource(entry.url(), mimeType, charset, entry.cacheFile());
@ -1497,7 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (document == null) return;
} catch (ParserException e) {
this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexURL.urlFlagLength));
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexRWIEntryOld.urlFlagLength));
if (document != null) {
document.close();
document = null;
@ -1574,7 +1574,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
// create a new loaded URL db entry
plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry(
indexURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
"", // author
@ -1660,7 +1660,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String language = indexEntryAttribute.language(entry.url());
char doctype = indexEntryAttribute.docType(document.getMimeType());
plasmaCrawlLURLEntry.Components comp = newEntry.comp();
indexURLEntry.Components comp = newEntry.comp();
int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
@ -1673,7 +1673,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = indexEntryAttribute.word2hash(word);
indexEntry wordIdxEntry = new indexURLEntry(
indexRWIEntry wordIdxEntry = new indexRWIEntryOld(
urlHash,
urlLength, urlComps,
wordStat.count,
@ -1764,7 +1764,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
} else {
log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexURL.urlFlagLength));
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexRWIEntryOld.urlFlagLength));
}
} catch (Exception ee) {
if (ee instanceof InterruptedException) throw (InterruptedException)ee;
@ -1776,7 +1776,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
}
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexURL.urlFlagLength));
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexRWIEntryOld.urlFlagLength));
}
} else {
@ -1784,7 +1784,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexURL.urlFlagLength));
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexRWIEntryOld.urlFlagLength));
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
}
@ -1991,7 +1991,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr);
indexURLEntry entry = urlPool.loadedURL.newEntry(propStr);
urlPool.loadedURL.store(entry);
urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** superfluous/duplicate?
urlPool.noticeURL.remove(entry.hash());
@ -2070,7 +2070,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_globalresults", acc.globalContributions);
int i = 0;
int p;
plasmaCrawlLURLEntry urlentry;
indexURLEntry urlentry;
String urlstring, urlname, filename, urlhash;
String host, hash, address;
yacySeed seed;
@ -2081,7 +2081,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000;
while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) {
urlentry = acc.nextElement();
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
indexURLEntry.Components comp = urlentry.comp();
urlhash = urlentry.hash();
assert (urlhash != null);
assert (urlhash.length() == 12) : "urlhash = " + urlhash;
@ -2218,9 +2218,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null);
indexURLEntry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
plasmaCrawlLURLEntry.Components comp = entry.comp();
indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) return 0;
InputStream resourceContent = null;

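Throughout the switchboard the URL-hash lookups now yield indexURLEntry objects (loadedURL.load plus comp()). A small sketch of that resolution, assuming loadedURL is the plasmaCrawlLURL instance referenced in these hunks and the null second argument as used in this diff:

import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCrawlLURL;

class HashResolver { // hypothetical helper, not part of the repository
    static String normalizedURL(plasmaCrawlLURL loadedURL, String urlhash) {
        indexURLEntry entry = loadedURL.load(urlhash, null); // null: no pre-fetched index entry
        if (entry == null) return null;                      // hash unknown to the loaded-URL DB
        indexURLEntry.Components comp = entry.comp();
        return (comp.url() == null) ? null : comp.url().toNormalform();
    }
}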
@ -51,6 +51,8 @@ import java.util.ArrayList;
import java.util.Date;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
@ -79,14 +81,14 @@ public class plasmaSwitchboardQueue {
private void initQueueStack() {
kelondroRow rowdef = new kelondroRow(
"String url-" + indexURL.urlStringLength + ", " + // the url
"String refhash-" + indexURL.urlHashLength + ", " + // the url's referrer hash
"Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince
"byte[] flags-1" + ", " + // flags
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"Cardinal depth-" + indexURL.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"String profile-" + indexURL.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"String urldescr-" + indexURL.urlDescrLength); //
"String url-" + yacySeedDB.commonHashLength + ", " + // the url
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince
"byte[] flags-1" + ", " + // flags
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"String urldescr-" + indexRWIEntryOld.urlDescrLength); //
sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef);
}
@ -108,7 +110,7 @@ public class plasmaSwitchboardQueue {
kelondroBase64Order.enhancedCoder.encodeLong((entry.ifModifiedSince == null) ? 0 : entry.ifModifiedSince.getTime(), 11).getBytes(),
new byte[]{entry.flags},
(entry.initiator == null) ? indexURL.dummyHash.getBytes() : entry.initiator.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexURL.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
(entry.profileHandle == null) ? indexURL.dummyHash.getBytes() : entry.profileHandle.getBytes(),
(entry.anchorName == null) ? "-".getBytes("UTF-8") : entry.anchorName.getBytes("UTF-8")
}));
@ -333,7 +335,7 @@ public class plasmaSwitchboardQueue {
public URL referrerURL() {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null);
indexURLEntry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.comp().url();
}
return referrerURL;

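The queue-stack row definition above shows where the length constants now live: the hash columns are sized by yacySeedDB.commonHashLength, while the crawl-depth, profile-handle and description lengths moved to indexRWIEntryOld. A sketch of the resulting rowdef, assuming only the kelondroRow string constructor used in this hunk:

import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroRow;
import de.anomic.yacy.yacySeedDB;

class QueueRowSketch { // hypothetical helper, not part of the repository
    static kelondroRow rowdef() {
        return new kelondroRow(
            "String url-" + yacySeedDB.commonHashLength + ", " +        // the url
            "String refhash-" + yacySeedDB.commonHashLength + ", " +    // the url's referrer hash
            "Cardinal modifiedsince-11 {b64e}, " +                      // from ifModifiedSince
            "byte[] flags-1, " +                                        // flags
            "String initiator-" + yacySeedDB.commonHashLength + ", " +  // the crawling initiator
            "Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " +
            "String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " +
            "String urldescr-" + indexRWIEntryOld.urlDescrLength);
    }
}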
@ -48,6 +48,7 @@ import java.io.File;
import java.io.IOException;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
public class plasmaURLPool {
@ -83,7 +84,7 @@ public class plasmaURLPool {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
} catch (IOException e) {}
plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null);
indexURLEntry le = loadedURL.load(urlhash, null);
if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();

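For context, the plasmaURLPool used here is constructed elsewhere in this diff (see the migration path in yacy.java further down). A sketch that copies that call verbatim; the meaning of the numeric and boolean parameters is not shown in the diff, so they are left uncommented:

import java.io.File;
import de.anomic.plasma.plasmaURLPool;

class PoolSetupSketch { // hypothetical helper, not part of the repository
    static plasmaURLPool open(File root) {
        // parameter list copied from the yacy.java hunk further down in this diff
        return new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"),
                16000, true, 1000, true, 1000, true, 10000);
    }
}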
@ -40,10 +40,11 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexCollectionRI;
import de.anomic.index.indexContainer;
import de.anomic.index.indexContainerOrder;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexRAMRI;
import de.anomic.index.indexRI;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
@ -60,7 +61,7 @@ public final class plasmaWordIndex implements indexRI {
private static final String indexAssortmentClusterPath = "ACLUSTER";
private static final int assortmentCount = 64;
private static final kelondroRow payloadrow = indexURLEntry.urlEntryRow;
private static final kelondroRow payloadrow = indexRWIEntryOld.urlEntryRow;
private final File oldDatabaseRoot;
private final kelondroOrder indexOrder = new kelondroNaturalOrder(true);
@ -201,7 +202,7 @@ public final class plasmaWordIndex implements indexRI {
return new indexContainer(wordHash, payloadrow);
}
public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtInCase) {
public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
// set dhtInCase depending on wordHash
if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(wordHash))) dhtInCase = true;
@ -318,7 +319,7 @@ public final class plasmaWordIndex implements indexRI {
Iterator i = condenser.words();
Map.Entry wentry;
String word;
indexEntry ientry;
indexRWIEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
int urlLength = url.toString().length();
@ -330,7 +331,7 @@ public final class plasmaWordIndex implements indexRI {
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = indexEntryAttribute.word2hash(word);
ientry = new indexURLEntry(urlHash,
ientry = new indexRWIEntryOld(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count,
condenser.RESULT_SIMI_WORDS,
@ -685,11 +686,11 @@ public final class plasmaWordIndex implements indexRI {
// the combined container will fit, read the container
try {
Iterator entries = entity.elements(true);
indexEntry entry;
indexRWIEntry entry;
while (entries.hasNext()) {
entry = (indexEntry) entries.next();
entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexEntry[]{entry}, System.currentTimeMillis());
container.add(new indexRWIEntry[]{entry}, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
@ -723,11 +724,11 @@ public final class plasmaWordIndex implements indexRI {
try {
Iterator entries = entity.elements(true);
indexEntry entry;
indexRWIEntry entry;
while (entries.hasNext()) {
entry = (indexEntry) entries.next();
entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexEntry[] { entry }, System.currentTimeMillis());
container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
@ -775,7 +776,7 @@ public final class plasmaWordIndex implements indexRI {
public void run() {
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
indexContainer container = null;
indexEntry entry = null;
indexRWIEntry entry = null;
URL url = null;
HashSet urlHashs = new HashSet();
try {
@ -787,9 +788,9 @@ public final class plasmaWordIndex implements indexRI {
wordHashNow = container.getWordHash();
while (containerIterator.hasNext() && run) {
waiter();
entry = (indexEntry) containerIterator.next();
entry = (indexRWIEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
plasmaCrawlLURLEntry ue = lurl.load(entry.urlHash(), null);
indexURLEntry ue = lurl.load(entry.urlHash(), null);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {

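plasmaWordIndex.addEntry now takes an indexRWIEntry. A minimal sketch of adding one entry under a word's hash, using only word2hash and the addEntry signature shown above (passing dhtInCase=false lets the method promote the flag itself, as its body does):

import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexRWIEntry;
import de.anomic.plasma.plasmaWordIndex;

class WordAddSketch { // hypothetical helper, not part of the repository
    static void indexWord(plasmaWordIndex wordIndex, String word, indexRWIEntry entry) {
        String wordHash = indexEntryAttribute.word2hash(word);  // word -> hash, as in the hunk
        wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), false);
    }
}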
@ -57,15 +57,15 @@ import java.io.IOException;
import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndexAssortment {
@ -89,7 +89,7 @@ public final class plasmaWordIndexAssortment {
private kelondroRow bufferStructure(int assortmentCapacity) {
kelondroColumn[] structure = new kelondroColumn[3 + assortmentCapacity];
structure[0] = new kelondroColumn("byte[] wordhash-" + indexEntryAttribute.wordHashLength);
structure[0] = new kelondroColumn("byte[] wordhash-" + yacySeedDB.commonHashLength);
structure[1] = new kelondroColumn("Cardinal occ-4 {b256}");
structure[2] = new kelondroColumn("Cardinal time-8 {b256}");
kelondroColumn p = new kelondroColumn("byte[] urlprops-" + payloadrow.objectsize());
@ -98,7 +98,7 @@ public final class plasmaWordIndexAssortment {
}
private int assortmentCapacity(int rowsize) {
return (rowsize - indexEntryAttribute.wordHashLength - 12) / payloadrow.objectsize();
return (rowsize - yacySeedDB.commonHashLength - 12) / payloadrow.objectsize();
}
public plasmaWordIndexAssortment(File storagePath, kelondroRow payloadrow, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException {
@ -133,9 +133,9 @@ public final class plasmaWordIndexAssortment {
row.setCol(1, 1);
row.setCol(2, newContainer.updated());
Iterator entries = newContainer.entries();
indexEntry entry;
indexRWIEntry entry;
for (int i = 0; i < assortmentLength; i++) {
entry = (indexEntry) entries.next();
entry = (indexRWIEntry) entries.next();
row.setCol(3 + i, entry.toKelondroEntry().bytes());
}
kelondroRow.Entry oldrow = null;
@ -221,7 +221,7 @@ public final class plasmaWordIndexAssortment {
indexContainer container = new indexContainer(wordHash, payloadrow);
int al = assortmentCapacity(row.objectsize());
for (int i = 0; i < al; i++) {
container.add(new indexEntry[] { new indexURLEntry(row.getColBytes(3 + i)) }, updateTime);
container.add(new indexRWIEntry[] { new indexRWIEntryOld(row.getColBytes(3 + i)) }, updateTime);
}
return container;
}

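The assortment row layout above now sizes the word-hash column with yacySeedDB.commonHashLength. The capacity formula in the same hunk follows directly from that layout (hash + 4-byte occ + 8-byte time + N payload slots); a sketch:

import de.anomic.kelondro.kelondroRow;
import de.anomic.yacy.yacySeedDB;

class AssortmentCapacitySketch { // hypothetical helper, not part of the repository
    static int assortmentCapacity(int rowsize, kelondroRow payloadrow) {
        // rowsize = commonHashLength (wordhash) + 4 (occ) + 8 (time) + capacity * payload size
        return (rowsize - yacySeedDB.commonHashLength - 12) / payloadrow.objectsize();
    }
}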
@ -54,7 +54,7 @@ import java.util.Set;
import de.anomic.index.indexContainer;
import de.anomic.index.indexContainerOrder;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRI;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroMergeIterator;
@ -168,7 +168,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI {
c = new indexContainer(newContainer.getWordHash(), payloadrow);
for (int k = 0; k < j; k++) {
if (i.hasNext()) {
c.add((indexEntry) i.next(), newContainer.updated());
c.add((indexRWIEntry) i.next(), newContainer.updated());
} else {
storeForced(c);
return;
@ -178,7 +178,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI {
}
}
public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) {
public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, payloadrow);
container.add(newEntry);
return addEntries(container, updateTime, dhtCase);
@ -223,7 +223,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI {
c = new indexContainer(newContainer.getWordHash(), payloadrow);
for (int k = 0; k <= j; k++) {
assert (i.hasNext());
c.add((indexEntry) i.next(), newContainer.updated());
c.add((indexRWIEntry) i.next(), newContainer.updated());
}
try {
storeForced(c);
@ -306,9 +306,9 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI {
if (buffer != null) {
// sort out url hashes that shall be deleted
Iterator bi = buffer.entries();
indexEntry entry;
indexRWIEntry entry;
while (bi.hasNext()) {
entry = (indexEntry) bi.next();
entry = (indexRWIEntry) bi.next();
if (urlHashes.remove(entry.urlHash())) bi.remove();
}
record.add(buffer, -1);

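The assortment cluster now works on indexRWIEntry objects and, elsewhere in this diff, is opened with indexRWIEntryOld.urlEntryRow as its payload row. A sketch mirroring the constructor call from the yacy.java hunk further down; the numeric parameters are copied as-is, their semantics are not spelled out in the diff:

import java.io.File;
import java.io.IOException;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaWordIndexAssortmentCluster;
import de.anomic.server.logging.serverLog;

class ClusterOpenSketch { // hypothetical helper, not part of the repository
    static plasmaWordIndexAssortmentCluster open(File homeDBroot, serverLog log) throws IOException {
        // "ACLUSTER" and 64 match indexAssortmentClusterPath / assortmentCount in plasmaWordIndex
        return new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64,
                indexRWIEntryOld.urlEntryRow, 16*1024*1024, 3000, log);
    }
}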
@ -49,13 +49,13 @@ import java.io.IOException;
import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndexFile {
@ -91,7 +91,7 @@ public final class plasmaWordIndexFile {
long cacheSize = theLocation.length();
if (cacheSize > 1048576) cacheSize = 1048576;
return kelondroTree.open(theLocation, cacheSize, 0,
new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength + ", byte[] ba-" + (indexURLEntry.urlEntryRow.objectsize() - indexURL.urlHashLength)));
new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength + ", byte[] ba-" + (indexRWIEntryOld.urlEntryRow.objectsize() - yacySeedDB.commonHashLength)));
}
public static File wordHash2path(File databaseRoot, String hash) {
@ -128,23 +128,23 @@ public final class plasmaWordIndexFile {
} catch (IOException e) {}
}
public indexEntry getEntry(String urlhash) throws IOException {
public indexRWIEntry getEntry(String urlhash) throws IOException {
kelondroRow.Entry n = theIndex.get(urlhash.getBytes());
if (n == null) return null;
return new indexURLEntry(n.getColString(0, null), n.getColString(1, null));
return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null));
}
public boolean contains(String urlhash) throws IOException {
return (theIndex.get(urlhash.getBytes()) != null);
}
public boolean contains(indexEntry entry) throws IOException {
public boolean contains(indexRWIEntry entry) throws IOException {
return (theIndex.get(entry.urlHash().getBytes()) != null);
}
public boolean addEntry(indexEntry entry) throws IOException {
public boolean addEntry(indexRWIEntry entry) throws IOException {
if (entry == null) return false;
indexEntry oldEntry = getEntry(entry.urlHash());
indexRWIEntry oldEntry = getEntry(entry.urlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
return false;
}
@ -163,7 +163,7 @@ public final class plasmaWordIndexFile {
if (container != null) {
Iterator i = container.entries();
while (i.hasNext()) {
if (addEntry((indexEntry) i.next())) count++;
if (addEntry((indexRWIEntry) i.next())) count++;
}
}
@ -228,7 +228,7 @@ public final class plasmaWordIndexFile {
public Object next() {
if (i == null) return null;
kelondroRow.Entry n = (kelondroRow.Entry) i.next();
return new indexURLEntry(n.getColString(0, null), n.getColString(1, null));
return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null));
}
public void remove() {
throw new UnsupportedOperationException();
@ -248,7 +248,7 @@ public final class plasmaWordIndexFile {
long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time;
try {
while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) {
addEntry((indexEntry) i.next());
addEntry((indexRWIEntry) i.next());
}
} catch (kelondroException e) {
serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage());

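plasmaWordIndexFile now sizes its key column with yacySeedDB.commonHashLength and derives the payload width from indexRWIEntryOld.urlEntryRow. A sketch of the open call, copied from the hunk above and wrapped in a hypothetical helper (the 1 MB cache cap comes from the same hunk):

import java.io.File;
import java.io.IOException;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.yacy.yacySeedDB;

class WordFileOpenSketch { // hypothetical helper, not part of the repository
    static kelondroTree open(File theLocation) throws IOException {
        long cacheSize = theLocation.length();
        if (cacheSize > 1048576) cacheSize = 1048576;  // cap the cache at 1 MB, as above
        return kelondroTree.open(theLocation, cacheSize, 0,
            new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength +
                ", byte[] ba-" + (indexRWIEntryOld.urlEntryRow.objectsize() - yacySeedDB.commonHashLength)));
    }
}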
@ -51,7 +51,7 @@ import java.util.Set;
import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRI;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
@ -235,10 +235,10 @@ public class plasmaWordIndexFileCluster implements indexRI {
if (exists(wordHash)) {
plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
indexContainer container = new indexContainer(wordHash, payloadrow);
indexEntry entry;
indexRWIEntry entry;
Iterator i = entity.elements(true);
while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
entry = (indexEntry) i.next();
entry = (indexRWIEntry) i.next();
if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry);
}
return container;
@ -302,7 +302,7 @@ public class plasmaWordIndexFileCluster implements indexRI {
} else return 0;
}
public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) {
public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, payloadrow);
container.add(newEntry);
return addEntries(container, updateTime, dhtCase);

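The file cluster's container assembly above keeps only entries whose URL hash is in the caller's selection. A small sketch of that filter on its own, assuming the container and entry accessors shown in this diff (a null selection means "take everything"):

import java.util.Iterator;
import java.util.Set;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;

class SelectionFilterSketch { // hypothetical helper, not part of the repository
    static void copySelected(indexContainer source, Set urlselection, indexContainer target) {
        Iterator i = source.entries();
        while (i.hasNext()) {
            indexRWIEntry entry = (indexRWIEntry) i.next();
            if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) target.add(entry);
        }
    }
}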
@ -55,14 +55,14 @@ import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache;
@ -491,33 +491,33 @@ public final class yacyClient {
//System.out.println("***result count " + results);
// create containers
final int words = wordhashes.length() / indexEntryAttribute.wordHashLength;
final int words = wordhashes.length() / yacySeedDB.commonHashLength;
indexContainer[] container = new indexContainer[words];
for (int i = 0; i < words; i++) {
container[i] = new indexContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength), indexURLEntry.urlEntryRow);
container[i] = new indexContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), indexRWIEntryOld.urlEntryRow);
}
// insert results to containers
plasmaCrawlLURLEntry urlEntry;
indexURLEntry urlEntry;
String[] urls = new String[results];
for (int n = 0; n < results; n++) {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n));
if (urlEntry == null) continue;
assert (urlEntry.hash().length() == 12) : "urlEntry.hash() = " + urlEntry.hash();
plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
indexURLEntry.Components comp = urlEntry.comp();
if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with blacklist
urlManager.store(urlEntry);
urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final indexEntry entry;
final indexRWIEntry entry;
if (urlEntry.word() == null) {
// the old way to define words
int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
entry = new indexURLEntry(
entry = new indexRWIEntryOld(
urlEntry.hash(),
urlLength,
urlComps,
@ -545,7 +545,7 @@ public final class yacyClient {
}
// add the url entry to the word indexes
for (int m = 0; m < words; m++) {
container[m].add(new indexEntry[]{entry}, System.currentTimeMillis());
container[m].add(new indexRWIEntry[]{entry}, System.currentTimeMillis());
}
// store url hash for statistics
urls[n] = urlEntry.hash();
@ -869,7 +869,7 @@ public final class yacyClient {
-it crawls, but the result then appears under the wrong initiator
*/
public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURLEntry entry, String wordhashes) {
public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, indexURLEntry entry, String wordhashes) {
if (targetSeed == null) { return null; }
if (yacyCore.seedDB.mySeed == null) { return null; }
if (yacyCore.seedDB.mySeed == targetSeed) { return null; }
@ -943,11 +943,11 @@ public final class yacyClient {
// check if we got all necessary urls in the urlCache (only for debugging)
Iterator eenum;
indexEntry entry;
indexRWIEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (indexEntry) eenum.next();
entry = (indexRWIEntry) eenum.next();
if (urlCache.get(entry.urlHash()) == null) {
yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache");
}
@ -988,9 +988,9 @@ public final class yacyClient {
if (uhs.length == 0) { return resultObj; } // all url's known
// extract the urlCache from the result
plasmaCrawlLURLEntry[] urls = new plasmaCrawlLURLEntry[uhs.length];
indexURLEntry[] urls = new indexURLEntry[uhs.length];
for (int i = 0; i < uhs.length; i++) {
urls[i] = (plasmaCrawlLURLEntry) urlCache.get(uhs[i]);
urls[i] = (indexURLEntry) urlCache.get(uhs[i]);
if (urls[i] == null) {
yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
}
@ -1051,11 +1051,11 @@ public final class yacyClient {
int indexcount = 0;
final StringBuffer entrypost = new StringBuffer(indexes.length*73);
Iterator eenum;
indexEntry entry;
indexRWIEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (indexEntry) eenum.next();
entry = (indexRWIEntry) eenum.next();
entrypost.append(indexes[i].getWordHash())
.append(entry.toPropertyForm(false))
.append(serverCore.crlfString);
@ -1099,7 +1099,7 @@ public final class yacyClient {
}
}
private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURLEntry[] urls, boolean gzipBody, int timeout) {
private static HashMap transferURL(yacySeed targetSeed, indexURLEntry[] urls, boolean gzipBody, int timeout) {
// this post a message to the remote message board
final String address = targetSeed.getAddress();
if (address == null) { return null; }

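yacyClient now splits the concatenated word-hash string with yacySeedDB.commonHashLength and fills one indexContainer per word, each with indexRWIEntryOld.urlEntryRow as its payload row. A sketch of that container setup, using only what the hunk shows:

import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.yacy.yacySeedDB;

class ContainerSplitSketch { // hypothetical helper, not part of the repository
    static indexContainer[] containersFor(String wordhashes) {
        final int words = wordhashes.length() / yacySeedDB.commonHashLength;
        indexContainer[] container = new indexContainer[words];
        for (int i = 0; i < words; i++) {
            container[i] = new indexContainer(
                wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength),
                indexRWIEntryOld.urlEntryRow);
        }
        return container;
    }
}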
@ -71,10 +71,11 @@ import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
@ -83,8 +84,6 @@ import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlLURLOldEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURLPool;
@ -623,7 +622,7 @@ public final class yacy {
kelondroMScoreCluster hs = new kelondroMScoreCluster();
while (ef.hasMoreElements()) {
f = (File) ef.nextElement();
h = f.getName().substring(0, indexURL.urlHashLength);
h = f.getName().substring(0, yacySeedDB.commonHashLength);
hs.addScore(h, (int) f.length());
}
@ -740,12 +739,12 @@ public final class yacy {
// the combined container will fit, read the container
Iterator wordIdxEntries = wordIdxContainer.entries();
indexEntry iEntry;
indexRWIEntry iEntry;
while (wordIdxEntries.hasNext()) {
iEntry = (indexEntry) wordIdxEntries.next();
iEntry = (indexRWIEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURLEntry urlEntry = currentUrlDB.load(urlHash, null);
indexURLEntry urlEntry = currentUrlDB.load(urlHash, null);
urlCounter++;
minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) {
@ -965,11 +964,11 @@ public final class yacy {
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURLEntry entry;
indexURLEntry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlLURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp();
entry = (indexURLEntry) eiter.next();
indexURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
@ -1077,10 +1076,10 @@ public final class yacy {
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURLEntry entry;
indexURLEntry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp();
entry = (indexURLEntry) eiter.next();
indexURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) {
if (html) {
bos.write(("<a href=\"" + comp.url().toNormalform() + "\">" + comp.descr() + "</a><br>").getBytes("UTF-8"));
@ -1135,7 +1134,7 @@ public final class yacy {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, true, 1000, true, 1000, true, 10000);
kelondroTree oldindex = null;
try {
oldindex = new kelondroTree(urlHash, 1000, -1, plasmaCrawlLURLOldEntry.rowdef);
oldindex = new kelondroTree(urlHash, 1000, -1, indexURLEntryOld.rowdef);
} catch (IOException e) {
System.out.println("ERROR: CANNOT OPEN OLD INDEX: " + e.getMessage());
}
@ -1145,9 +1144,9 @@ public final class yacy {
int tc = oldindex.size(), c = 0;
Iterator eiter = oldindex.contentRows(-1);
kelondroRow.Entry oldrow;
plasmaCrawlLURLEntry oldentry;
plasmaCrawlLURLEntry newentry;
plasmaCrawlLURLEntry.Components comp;
indexURLEntry oldentry;
indexURLEntry newentry;
indexURLEntry.Components comp;
byte[] dummymd5 = new byte[0];
while (eiter.hasNext()) {
try {
@ -1158,7 +1157,7 @@ public final class yacy {
oldrow = null;
}
if (oldrow != null) try {
oldentry = new plasmaCrawlLURLOldEntry(oldrow, null);
oldentry = new indexURLEntryOld(oldrow, null);
comp = oldentry.comp();
newentry = pool.loadedURL.newEntry(
comp.url(),
@ -1236,7 +1235,7 @@ public final class yacy {
WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false));
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
} else if (resource.equals("assortments")) {
plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexURLEntry.urlEntryRow, 16*1024*1024, 3000, log);
plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexRWIEntryOld.urlEntryRow, 16*1024*1024, 3000, log);
indexContainerIterator = assortmentCluster.wordContainers(wordChunkStartHash, true, false);
} /*else if (resource.startsWith("assortment")) {
int a = Integer.parseInt(resource.substring(10));

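The yacy.java hunks above enumerate the loaded-URL DB with the renamed indexURLEntry type (domlist/urllist paths). A sketch of that enumeration collecting distinct hosts; the arguments to entries(true, false, null) are copied from the hunk, and their exact meaning is not documented in this diff:

import java.util.HashMap;
import java.util.Iterator;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaURLPool;

class DomListSketch { // hypothetical helper, not part of the repository
    static HashMap collectHosts(plasmaURLPool pool) {
        HashMap doms = new HashMap();
        Iterator eiter = pool.loadedURL.entries(true, false, null);
        while (eiter.hasNext()) {
            try {
                indexURLEntry entry = (indexURLEntry) eiter.next();
                indexURLEntry.Components comp = entry.comp();
                if (comp.url() != null) doms.put(comp.url().getHost(), null);
            } catch (Exception e) {
                // a MalformedURLException may occur here, as the original code notes
            }
        }
        return doms;
    }
}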